linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-08-31 10:08 [PATCH net-next v10 0/2] net: af_packet: optimize " Xin Zhao
@ 2025-08-31 10:08 ` Xin Zhao
  2025-09-01  1:21   ` Willem de Bruijn
  2025-09-02 15:43   ` Jason Xing
  0 siblings, 2 replies; 19+ messages in thread
From: Xin Zhao @ 2025-08-31 10:08 UTC (permalink / raw)
  To: willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel, Xin Zhao

In a system with high real-time requirements, the timeout mechanism of
ordinary timers with jiffies granularity is insufficient to meet the
demands for real-time performance. Meanwhile, the optimization of CPU
usage with af_packet is quite significant. Use hrtimer instead of timer
to help compensate for the shortcomings in real-time performance.
In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time
enough, with fluctuations reaching over 8ms (on a system with HZ=250).
This is unacceptable in some high real-time systems that require timely
processing of network packets. By replacing it with hrtimer, if a timeout
of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
3 ms.

Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
---
Changes in v8:
- Simplify the logic related to setting timeout.

Changes in v7:
- Only update the hrtimer expire time within the hrtimer callback.

Changes in v1:
- Do not add another config for the current changes.

---
 net/packet/af_packet.c | 79 +++++++++---------------------------------
 net/packet/diag.c      |  2 +-
 net/packet/internal.h  |  6 ++--
 3 files changed, 20 insertions(+), 67 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index d4eb4a4fe..3e3bb4216 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
 static int prb_queue_frozen(struct tpacket_kbdq_core *);
 static void prb_open_block(struct tpacket_kbdq_core *,
 		struct tpacket_block_desc *);
-static void prb_retire_rx_blk_timer_expired(struct timer_list *);
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
 		struct tpacket3_hdr *);
@@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
 	return proto;
 }
 
-static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
-{
-	timer_delete_sync(&pkc->retire_blk_timer);
-}
-
 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
 		struct sk_buff_head *rb_queue)
 {
 	struct tpacket_kbdq_core *pkc;
 
 	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
-
-	spin_lock_bh(&rb_queue->lock);
-	pkc->delete_blk_timer = 1;
-	spin_unlock_bh(&rb_queue->lock);
-
-	prb_del_retire_blk_timer(pkc);
-}
-
-static void prb_setup_retire_blk_timer(struct packet_sock *po)
-{
-	struct tpacket_kbdq_core *pkc;
-
-	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
-	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
-		    0);
-	pkc->retire_blk_timer.expires = jiffies;
+	hrtimer_cancel(&pkc->retire_blk_timer);
 }
 
 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
@@ -671,29 +650,22 @@ static void init_prb_bdqc(struct packet_sock *po,
 	p1->version = po->tp_version;
 	po->stats.stats3.tp_freeze_q_cnt = 0;
 	if (req_u->req3.tp_retire_blk_tov)
-		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
+		p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
 	else
-		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
-						req_u->req3.tp_block_size);
-	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
+		p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
+						req_u->req3.tp_block_size));
 	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
 	rwlock_init(&p1->blk_fill_in_prog_lock);
 
 	p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
 	prb_init_ft_ops(p1, req_u);
-	prb_setup_retire_blk_timer(po);
+	hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
+		      CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+	hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
+		      HRTIMER_MODE_REL_SOFT);
 	prb_open_block(p1, pbd);
 }
 
-/*  Do NOT update the last_blk_num first.
- *  Assumes sk_buff_head lock is held.
- */
-static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
-{
-	mod_timer(&pkc->retire_blk_timer,
-			jiffies + pkc->tov_in_jiffies);
-}
-
 /*
  * Timer logic:
  * 1) We refresh the timer only when we open a block.
@@ -717,7 +689,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
  * prb_calc_retire_blk_tmo() calculates the tmo.
  *
  */
-static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
 {
 	struct packet_sock *po =
 		timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
@@ -730,9 +702,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
 	frozen = prb_queue_frozen(pkc);
 	pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
 
-	if (unlikely(pkc->delete_blk_timer))
-		goto out;
-
 	/* We only need to plug the race when the block is partially filled.
 	 * tpacket_rcv:
 	 *		lock(); increment BLOCK_NUM_PKTS; unlock()
@@ -749,26 +718,16 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
 	}
 
 	if (!frozen) {
-		if (!BLOCK_NUM_PKTS(pbd)) {
-			/* An empty block. Just refresh the timer. */
-			goto refresh_timer;
+		if (BLOCK_NUM_PKTS(pbd)) {
+			/* Not an empty block. Need retire the block. */
+			prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
+			prb_dispatch_next_block(pkc, po);
 		}
-		prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
-		if (!prb_dispatch_next_block(pkc, po))
-			goto refresh_timer;
-		else
-			goto out;
 	} else {
 		/* Case 1. Queue was frozen because user-space was
 		 * lagging behind.
 		 */
-		if (prb_curr_blk_in_use(pbd)) {
-			/*
-			 * Ok, user-space is still behind.
-			 * So just refresh the timer.
-			 */
-			goto refresh_timer;
-		} else {
+		if (!prb_curr_blk_in_use(pbd)) {
 			/* Case 2. queue was frozen,user-space caught up,
 			 * now the link went idle && the timer fired.
 			 * We don't have a block to close.So we open this
@@ -777,15 +736,12 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
 			 * Thawing/timer-refresh is a side effect.
 			 */
 			prb_open_block(pkc, pbd);
-			goto out;
 		}
 	}
 
-refresh_timer:
-	_prb_refresh_rx_retire_blk_timer(pkc);
-
-out:
+	hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
 	spin_unlock(&po->sk.sk_receive_queue.lock);
+	return HRTIMER_RESTART;
 }
 
 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
@@ -917,7 +873,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
 	pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
 
 	prb_thaw_queue(pkc1);
-	_prb_refresh_rx_retire_blk_timer(pkc1);
 
 	smp_wmb();
 }
diff --git a/net/packet/diag.c b/net/packet/diag.c
index 6ce1dcc28..c8f43e0c1 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
 	pdr.pdr_frame_nr = ring->frame_max + 1;
 
 	if (ver > TPACKET_V2) {
-		pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
+		pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
 		pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
 		pdr.pdr_features = ring->prb_bdqc.feature_req_word;
 	} else {
diff --git a/net/packet/internal.h b/net/packet/internal.h
index d367b9f93..f8cfd9213 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -20,7 +20,6 @@ struct tpacket_kbdq_core {
 	unsigned int	feature_req_word;
 	unsigned int	hdrlen;
 	unsigned char	reset_pending_on_curr_blk;
-	unsigned char   delete_blk_timer;
 	unsigned short	kactive_blk_num;
 	unsigned short	blk_sizeof_priv;
 
@@ -39,12 +38,11 @@ struct tpacket_kbdq_core {
 	/* Default is set to 8ms */
 #define DEFAULT_PRB_RETIRE_TOV	(8)
 
-	unsigned short  retire_blk_tov;
+	ktime_t		interval_ktime;
 	unsigned short  version;
-	unsigned long	tov_in_jiffies;
 
 	/* timer to retire an outstanding block */
-	struct timer_list retire_blk_timer;
+	struct hrtimer  retire_blk_timer;
 };
 
 struct pgv {
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-08-31 10:08 ` [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the " Xin Zhao
@ 2025-09-01  1:21   ` Willem de Bruijn
  2025-09-02 15:43   ` Jason Xing
  1 sibling, 0 replies; 19+ messages in thread
From: Willem de Bruijn @ 2025-09-01  1:21 UTC (permalink / raw)
  To: Xin Zhao, willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel, Xin Zhao

Xin Zhao wrote:
> In a system with high real-time requirements, the timeout mechanism of
> ordinary timers with jiffies granularity is insufficient to meet the
> demands for real-time performance. Meanwhile, the optimization of CPU
> usage with af_packet is quite significant. Use hrtimer instead of timer
> to help compensate for the shortcomings in real-time performance.
> In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time
> enough, with fluctuations reaching over 8ms (on a system with HZ=250).
> This is unacceptable in some high real-time systems that require timely
> processing of network packets. By replacing it with hrtimer, if a timeout
> of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
> 3 ms.
> 
> Signed-off-by: Xin Zhao <jackzxcui1989@163.com>

Tiny style point that is probably not worth respinning for.

Otherwise

Reviewed-by: Willem de Bruijn <willemb@google.com>



> ---
> Changes in v8:
> - Simplify the logic related to setting timeout.
> 
> Changes in v7:
> - Only update the hrtimer expire time within the hrtimer callback.
> 
> Changes in v1:
> - Do not add another config for the current changes.
> 
> ---
>  net/packet/af_packet.c | 79 +++++++++---------------------------------
>  net/packet/diag.c      |  2 +-
>  net/packet/internal.h  |  6 ++--
>  3 files changed, 20 insertions(+), 67 deletions(-)
> 
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index d4eb4a4fe..3e3bb4216 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
>  static int prb_queue_frozen(struct tpacket_kbdq_core *);
>  static void prb_open_block(struct tpacket_kbdq_core *,
>  		struct tpacket_block_desc *);
> -static void prb_retire_rx_blk_timer_expired(struct timer_list *);
> -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
> +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
>  static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
>  static void prb_clear_rxhash(struct tpacket_kbdq_core *,
>  		struct tpacket3_hdr *);
> @@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
>  	return proto;
>  }
>  
> -static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> -{
> -	timer_delete_sync(&pkc->retire_blk_timer);
> -}
> -
>  static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
>  		struct sk_buff_head *rb_queue)
>  {
>  	struct tpacket_kbdq_core *pkc;
>  
>  	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> -
> -	spin_lock_bh(&rb_queue->lock);
> -	pkc->delete_blk_timer = 1;
> -	spin_unlock_bh(&rb_queue->lock);
> -
> -	prb_del_retire_blk_timer(pkc);
> -}
> -
> -static void prb_setup_retire_blk_timer(struct packet_sock *po)
> -{
> -	struct tpacket_kbdq_core *pkc;
> -
> -	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> -	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> -		    0);
> -	pkc->retire_blk_timer.expires = jiffies;
> +	hrtimer_cancel(&pkc->retire_blk_timer);
>  }
>  
>  static int prb_calc_retire_blk_tmo(struct packet_sock *po,
> @@ -671,29 +650,22 @@ static void init_prb_bdqc(struct packet_sock *po,
>  	p1->version = po->tp_version;
>  	po->stats.stats3.tp_freeze_q_cnt = 0;
>  	if (req_u->req3.tp_retire_blk_tov)
> -		p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
> +		p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
>  	else
> -		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
> -						req_u->req3.tp_block_size);
> -	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> +		p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
> +						req_u->req3.tp_block_size));

req_u is not aligned with the line above.

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
@ 2025-09-01  2:27 Xin Zhao
  2025-09-01 13:35 ` Willem de Bruijn
  0 siblings, 1 reply; 19+ messages in thread
From: Xin Zhao @ 2025-09-01  2:27 UTC (permalink / raw)
  To: willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

On Sun, 2025-08-31 at 21:21 -0400, Willem wrote:

> > -		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
> > -						req_u->req3.tp_block_size);
> > -	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> > +		p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
> > +						req_u->req3.tp_block_size));
> 
> req_u is not aligned with the line above.

I have some questions regarding the alignment here. According to the alignment requirements,
req_u should be aligned below the po variable. However, if it is aligned below po, the line
will become very long, which may affect readability. In this special case, can I align it to
prb_calc_retire_blk_tmo instead, or should I continue to align it to the po variable?


What should I do next?
Should I change the alignment, and resend PATCH with the reviewed information of version 10?

Thanks
Xin Zhao


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-09-01  2:27 [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation Xin Zhao
@ 2025-09-01 13:35 ` Willem de Bruijn
  0 siblings, 0 replies; 19+ messages in thread
From: Willem de Bruijn @ 2025-09-01 13:35 UTC (permalink / raw)
  To: Xin Zhao, willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

Xin Zhao wrote:
> On Sun, 2025-08-31 at 21:21 -0400, Willem wrote:
> 
> > > -		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
> > > -						req_u->req3.tp_block_size);
> > > -	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> > > +		p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
> > > +						req_u->req3.tp_block_size));
> > 
> > req_u is not aligned with the line above.
> 
> I have some questions regarding the alignment here. According to the alignment requirements,
> req_u should be aligned below the po variable. However, if it is aligned below po, the line
> will become very long, which may affect readability. In this special case, can I align it to
> prb_calc_retire_blk_tmo instead, or should I continue to align it to the po variable?

The (minor) issue here is with the second req_u. Which is one space
off from the argument above. See checkpath.

In general, the line length and break rules are documented in the
kernel coding style page, which checkpatch follows.
 
> 
> What should I do next?
> Should I change the alignment, and resend PATCH with the reviewed information of version 10?

I did not think this one space was worth resending, so I added my
Reviewed-by. Others may disagree, but so far no other opinions.


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
@ 2025-09-01 14:16 Xin Zhao
  0 siblings, 0 replies; 19+ messages in thread
From: Xin Zhao @ 2025-09-01 14:16 UTC (permalink / raw)
  To: willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

On Mon, 2025-09-01 at 09:35 -0400, Willem wrote:

> > On Sun, 2025-08-31 at 21:21 -0400, Willem wrote:
> > 
> > > > -		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
> > > > -						req_u->req3.tp_block_size);
> > > > -	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> > > > +		p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
> > > > +						req_u->req3.tp_block_size));
> > > 
> > > req_u is not aligned with the line above.
> > 
> > I have some questions regarding the alignment here. According to the alignment requirements,
> > req_u should be aligned below the po variable. However, if it is aligned below po, the line
> > will become very long, which may affect readability. In this special case, can I align it to
> > prb_calc_retire_blk_tmo instead, or should I continue to align it to the po variable?
> 
> The (minor) issue here is with the second req_u. Which is one space
> off from the argument above. See checkpath.
> 
> In general, the line length and break rules are documented in the
> kernel coding style page, which checkpatch follows.
> 
> > 
> > What should I do next?
> > Should I change the alignment, and resend PATCH with the reviewed information of version 10?
> 
> I did not think this one space was worth resending, so I added my
> Reviewed-by. Others may disagree, but so far no other opinions.

Okay, I will not resend the patch if there are no other opinions.


Thanks
Xin Zhao


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-08-31 10:08 ` [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the " Xin Zhao
  2025-09-01  1:21   ` Willem de Bruijn
@ 2025-09-02 15:43   ` Jason Xing
  2025-09-02 16:42     ` Jason Xing
  1 sibling, 1 reply; 19+ messages in thread
From: Jason Xing @ 2025-09-02 15:43 UTC (permalink / raw)
  To: Xin Zhao
  Cc: willemdebruijn.kernel, edumazet, ferenc, davem, kuba, pabeni,
	horms, netdev, linux-kernel

On Sun, Aug 31, 2025 at 6:09 PM Xin Zhao <jackzxcui1989@163.com> wrote:
>
> In a system with high real-time requirements, the timeout mechanism of
> ordinary timers with jiffies granularity is insufficient to meet the
> demands for real-time performance. Meanwhile, the optimization of CPU
> usage with af_packet is quite significant. Use hrtimer instead of timer
> to help compensate for the shortcomings in real-time performance.
> In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time
> enough, with fluctuations reaching over 8ms (on a system with HZ=250).
> This is unacceptable in some high real-time systems that require timely
> processing of network packets. By replacing it with hrtimer, if a timeout
> of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
> 3 ms.
>
> Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
> ---
> Changes in v8:
> - Simplify the logic related to setting timeout.
>
> Changes in v7:
> - Only update the hrtimer expire time within the hrtimer callback.
>
> Changes in v1:
> - Do not add another config for the current changes.
>
> ---
>  net/packet/af_packet.c | 79 +++++++++---------------------------------
>  net/packet/diag.c      |  2 +-
>  net/packet/internal.h  |  6 ++--
>  3 files changed, 20 insertions(+), 67 deletions(-)
>
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index d4eb4a4fe..3e3bb4216 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
>  static int prb_queue_frozen(struct tpacket_kbdq_core *);
>  static void prb_open_block(struct tpacket_kbdq_core *,
>                 struct tpacket_block_desc *);
> -static void prb_retire_rx_blk_timer_expired(struct timer_list *);
> -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
> +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
>  static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
>  static void prb_clear_rxhash(struct tpacket_kbdq_core *,
>                 struct tpacket3_hdr *);
> @@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
>         return proto;
>  }
>
> -static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> -{
> -       timer_delete_sync(&pkc->retire_blk_timer);
> -}
> -
>  static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
>                 struct sk_buff_head *rb_queue)
>  {
>         struct tpacket_kbdq_core *pkc;
>
>         pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> -
> -       spin_lock_bh(&rb_queue->lock);
> -       pkc->delete_blk_timer = 1;
> -       spin_unlock_bh(&rb_queue->lock);
> -
> -       prb_del_retire_blk_timer(pkc);
> -}
> -
> -static void prb_setup_retire_blk_timer(struct packet_sock *po)
> -{
> -       struct tpacket_kbdq_core *pkc;
> -
> -       pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> -       timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> -                   0);
> -       pkc->retire_blk_timer.expires = jiffies;
> +       hrtimer_cancel(&pkc->retire_blk_timer);
>  }
>
>  static int prb_calc_retire_blk_tmo(struct packet_sock *po,
> @@ -671,29 +650,22 @@ static void init_prb_bdqc(struct packet_sock *po,
>         p1->version = po->tp_version;
>         po->stats.stats3.tp_freeze_q_cnt = 0;
>         if (req_u->req3.tp_retire_blk_tov)
> -               p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
> +               p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
>         else
> -               p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
> -                                               req_u->req3.tp_block_size);
> -       p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> +               p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
> +                                               req_u->req3.tp_block_size));
>         p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
>         rwlock_init(&p1->blk_fill_in_prog_lock);
>
>         p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
>         prb_init_ft_ops(p1, req_u);
> -       prb_setup_retire_blk_timer(po);
> +       hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> +                     CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
> +       hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
> +                     HRTIMER_MODE_REL_SOFT);

You expect to see it start at the setsockopt phase? Even if it's far
from the real use of recv at the moment.

>         prb_open_block(p1, pbd);
>  }
>
> -/*  Do NOT update the last_blk_num first.
> - *  Assumes sk_buff_head lock is held.
> - */
> -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> -{
> -       mod_timer(&pkc->retire_blk_timer,
> -                       jiffies + pkc->tov_in_jiffies);
> -}
> -
>  /*
>   * Timer logic:
>   * 1) We refresh the timer only when we open a block.
> @@ -717,7 +689,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
>   * prb_calc_retire_blk_tmo() calculates the tmo.
>   *
>   */
> -static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
>  {
>         struct packet_sock *po =
>                 timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
> @@ -730,9 +702,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
>         frozen = prb_queue_frozen(pkc);
>         pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
>
> -       if (unlikely(pkc->delete_blk_timer))
> -               goto out;
> -
>         /* We only need to plug the race when the block is partially filled.
>          * tpacket_rcv:
>          *              lock(); increment BLOCK_NUM_PKTS; unlock()
> @@ -749,26 +718,16 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
>         }
>
>         if (!frozen) {
> -               if (!BLOCK_NUM_PKTS(pbd)) {
> -                       /* An empty block. Just refresh the timer. */
> -                       goto refresh_timer;
> +               if (BLOCK_NUM_PKTS(pbd)) {
> +                       /* Not an empty block. Need retire the block. */
> +                       prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
> +                       prb_dispatch_next_block(pkc, po);
>                 }
> -               prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
> -               if (!prb_dispatch_next_block(pkc, po))
> -                       goto refresh_timer;
> -               else
> -                       goto out;
>         } else {
>                 /* Case 1. Queue was frozen because user-space was
>                  * lagging behind.
>                  */
> -               if (prb_curr_blk_in_use(pbd)) {
> -                       /*
> -                        * Ok, user-space is still behind.
> -                        * So just refresh the timer.
> -                        */
> -                       goto refresh_timer;
> -               } else {
> +               if (!prb_curr_blk_in_use(pbd)) {
>                         /* Case 2. queue was frozen,user-space caught up,
>                          * now the link went idle && the timer fired.
>                          * We don't have a block to close.So we open this
> @@ -777,15 +736,12 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
>                          * Thawing/timer-refresh is a side effect.
>                          */
>                         prb_open_block(pkc, pbd);
> -                       goto out;
>                 }
>         }
>
> -refresh_timer:
> -       _prb_refresh_rx_retire_blk_timer(pkc);
> -
> -out:
> +       hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
>         spin_unlock(&po->sk.sk_receive_queue.lock);
> +       return HRTIMER_RESTART;
>  }
>
>  static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
> @@ -917,7 +873,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
>         pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
>
>         prb_thaw_queue(pkc1);
> -       _prb_refresh_rx_retire_blk_timer(pkc1);

Could you say more on why you remove this here and only reset/update
the expiry time in the timer handler? Probably I missed something
appearing in the previous long discussion.

>
>         smp_wmb();
>  }
> diff --git a/net/packet/diag.c b/net/packet/diag.c
> index 6ce1dcc28..c8f43e0c1 100644
> --- a/net/packet/diag.c
> +++ b/net/packet/diag.c
> @@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
>         pdr.pdr_frame_nr = ring->frame_max + 1;
>
>         if (ver > TPACKET_V2) {
> -               pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
> +               pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
>                 pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
>                 pdr.pdr_features = ring->prb_bdqc.feature_req_word;
>         } else {
> diff --git a/net/packet/internal.h b/net/packet/internal.h
> index d367b9f93..f8cfd9213 100644
> --- a/net/packet/internal.h
> +++ b/net/packet/internal.h
> @@ -20,7 +20,6 @@ struct tpacket_kbdq_core {
>         unsigned int    feature_req_word;
>         unsigned int    hdrlen;
>         unsigned char   reset_pending_on_curr_blk;
> -       unsigned char   delete_blk_timer;
>         unsigned short  kactive_blk_num;
>         unsigned short  blk_sizeof_priv;
>
> @@ -39,12 +38,11 @@ struct tpacket_kbdq_core {
>         /* Default is set to 8ms */
>  #define DEFAULT_PRB_RETIRE_TOV (8)
>
> -       unsigned short  retire_blk_tov;
> +       ktime_t         interval_ktime;
>         unsigned short  version;
> -       unsigned long   tov_in_jiffies;
>
>         /* timer to retire an outstanding block */
> -       struct timer_list retire_blk_timer;
> +       struct hrtimer  retire_blk_timer;
>  };

The whole structure needs a new organization?

Before:
        /* size: 152, cachelines: 3, members: 22 */
        /* sum members: 144, holes: 2, sum holes: 8 */
        /* paddings: 1, sum paddings: 4 */
        /* last cacheline: 24 bytes */
After:
        /* size: 176, cachelines: 3, members: 19 */
        /* sum members: 163, holes: 4, sum holes: 13 */
        /* paddings: 1, sum paddings: 4 */
        /* forced alignments: 1, forced holes: 1, sum forced holes: 6 */
        /* last cacheline: 48 bytes */

Thanks,
Jason

>
>  struct pgv {
> --
> 2.34.1
>
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-09-02 15:43   ` Jason Xing
@ 2025-09-02 16:42     ` Jason Xing
  0 siblings, 0 replies; 19+ messages in thread
From: Jason Xing @ 2025-09-02 16:42 UTC (permalink / raw)
  To: Xin Zhao
  Cc: willemdebruijn.kernel, edumazet, ferenc, davem, kuba, pabeni,
	horms, netdev, linux-kernel

On Tue, Sep 2, 2025 at 11:43 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Sun, Aug 31, 2025 at 6:09 PM Xin Zhao <jackzxcui1989@163.com> wrote:
> >
> > In a system with high real-time requirements, the timeout mechanism of
> > ordinary timers with jiffies granularity is insufficient to meet the
> > demands for real-time performance. Meanwhile, the optimization of CPU
> > usage with af_packet is quite significant. Use hrtimer instead of timer
> > to help compensate for the shortcomings in real-time performance.
> > In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time
> > enough, with fluctuations reaching over 8ms (on a system with HZ=250).
> > This is unacceptable in some high real-time systems that require timely
> > processing of network packets. By replacing it with hrtimer, if a timeout
> > of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
> > 3 ms.
> >
> > Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
> > ---
> > Changes in v8:
> > - Simplify the logic related to setting timeout.
> >
> > Changes in v7:
> > - Only update the hrtimer expire time within the hrtimer callback.
> >
> > Changes in v1:
> > - Do not add another config for the current changes.
> >
> > ---
> >  net/packet/af_packet.c | 79 +++++++++---------------------------------
> >  net/packet/diag.c      |  2 +-
> >  net/packet/internal.h  |  6 ++--
> >  3 files changed, 20 insertions(+), 67 deletions(-)
> >
> > diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> > index d4eb4a4fe..3e3bb4216 100644
> > --- a/net/packet/af_packet.c
> > +++ b/net/packet/af_packet.c
> > @@ -203,8 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
> >  static int prb_queue_frozen(struct tpacket_kbdq_core *);
> >  static void prb_open_block(struct tpacket_kbdq_core *,
> >                 struct tpacket_block_desc *);
> > -static void prb_retire_rx_blk_timer_expired(struct timer_list *);
> > -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
> > +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
> >  static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
> >  static void prb_clear_rxhash(struct tpacket_kbdq_core *,
> >                 struct tpacket3_hdr *);
> > @@ -579,33 +578,13 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
> >         return proto;
> >  }
> >
> > -static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> > -{
> > -       timer_delete_sync(&pkc->retire_blk_timer);
> > -}
> > -
> >  static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
> >                 struct sk_buff_head *rb_queue)
> >  {
> >         struct tpacket_kbdq_core *pkc;
> >
> >         pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> > -
> > -       spin_lock_bh(&rb_queue->lock);
> > -       pkc->delete_blk_timer = 1;

One more review from my side is that as to the removal of
delete_blk_timer, I'm afraid it deserves a clarification in the commit
message.

> > -       spin_unlock_bh(&rb_queue->lock);
> > -
> > -       prb_del_retire_blk_timer(pkc);
> > -}
> > -
> > -static void prb_setup_retire_blk_timer(struct packet_sock *po)
> > -{
> > -       struct tpacket_kbdq_core *pkc;
> > -
> > -       pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> > -       timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> > -                   0);
> > -       pkc->retire_blk_timer.expires = jiffies;
> > +       hrtimer_cancel(&pkc->retire_blk_timer);
> >  }
> >
> >  static int prb_calc_retire_blk_tmo(struct packet_sock *po,
> > @@ -671,29 +650,22 @@ static void init_prb_bdqc(struct packet_sock *po,
> >         p1->version = po->tp_version;
> >         po->stats.stats3.tp_freeze_q_cnt = 0;
> >         if (req_u->req3.tp_retire_blk_tov)
> > -               p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
> > +               p1->interval_ktime = ms_to_ktime(req_u->req3.tp_retire_blk_tov);
> >         else
> > -               p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
> > -                                               req_u->req3.tp_block_size);
> > -       p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> > +               p1->interval_ktime = ms_to_ktime(prb_calc_retire_blk_tmo(po,
> > +                                               req_u->req3.tp_block_size));
> >         p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
> >         rwlock_init(&p1->blk_fill_in_prog_lock);
> >
> >         p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
> >         prb_init_ft_ops(p1, req_u);
> > -       prb_setup_retire_blk_timer(po);
> > +       hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> > +                     CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
> > +       hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
> > +                     HRTIMER_MODE_REL_SOFT);
>
> You expect to see it start at the setsockopt phase? Even if it's far
> from the real use of recv at the moment.
>
> >         prb_open_block(p1, pbd);
> >  }
> >
> > -/*  Do NOT update the last_blk_num first.
> > - *  Assumes sk_buff_head lock is held.
> > - */
> > -static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> > -{
> > -       mod_timer(&pkc->retire_blk_timer,
> > -                       jiffies + pkc->tov_in_jiffies);
> > -}
> > -
> >  /*
> >   * Timer logic:
> >   * 1) We refresh the timer only when we open a block.
> > @@ -717,7 +689,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
> >   * prb_calc_retire_blk_tmo() calculates the tmo.
> >   *
> >   */
> > -static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> > +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
> >  {
> >         struct packet_sock *po =
> >                 timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
> > @@ -730,9 +702,6 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> >         frozen = prb_queue_frozen(pkc);
> >         pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
> >
> > -       if (unlikely(pkc->delete_blk_timer))
> > -               goto out;
> > -
> >         /* We only need to plug the race when the block is partially filled.
> >          * tpacket_rcv:
> >          *              lock(); increment BLOCK_NUM_PKTS; unlock()
> > @@ -749,26 +718,16 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> >         }
> >
> >         if (!frozen) {
> > -               if (!BLOCK_NUM_PKTS(pbd)) {
> > -                       /* An empty block. Just refresh the timer. */
> > -                       goto refresh_timer;
> > +               if (BLOCK_NUM_PKTS(pbd)) {
> > +                       /* Not an empty block. Need retire the block. */
> > +                       prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
> > +                       prb_dispatch_next_block(pkc, po);
> >                 }
> > -               prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
> > -               if (!prb_dispatch_next_block(pkc, po))
> > -                       goto refresh_timer;
> > -               else
> > -                       goto out;
> >         } else {
> >                 /* Case 1. Queue was frozen because user-space was
> >                  * lagging behind.
> >                  */
> > -               if (prb_curr_blk_in_use(pbd)) {
> > -                       /*
> > -                        * Ok, user-space is still behind.
> > -                        * So just refresh the timer.
> > -                        */
> > -                       goto refresh_timer;
> > -               } else {
> > +               if (!prb_curr_blk_in_use(pbd)) {
> >                         /* Case 2. queue was frozen,user-space caught up,
> >                          * now the link went idle && the timer fired.
> >                          * We don't have a block to close.So we open this
> > @@ -777,15 +736,12 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> >                          * Thawing/timer-refresh is a side effect.
> >                          */
> >                         prb_open_block(pkc, pbd);
> > -                       goto out;
> >                 }
> >         }
> >
> > -refresh_timer:
> > -       _prb_refresh_rx_retire_blk_timer(pkc);
> > -
> > -out:
> > +       hrtimer_forward_now(&pkc->retire_blk_timer, pkc->interval_ktime);
> >         spin_unlock(&po->sk.sk_receive_queue.lock);
> > +       return HRTIMER_RESTART;
> >  }
> >
> >  static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
> > @@ -917,7 +873,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
> >         pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
> >
> >         prb_thaw_queue(pkc1);
> > -       _prb_refresh_rx_retire_blk_timer(pkc1);
>
> Could you say more on why you remove this here and only reset/update
> the expiry time in the timer handler? Probably I missed something
> appearing in the previous long discussion.

I gradually understand your thought behind this modification. You're
trying to move the timer operation out of prb_open_block() and then
spread the timer operation into each caller.

You probably miss the following call trace:
packet_current_rx_frame() -> __packet_lookup_frame_in_block() ->
prb_open_block() -> _prb_refresh_rx_retire_blk_timer()
?

May I ask why bother introducing so many changes like this instead of
leaving it as-is?

Thanks,
Jason

>
> >
> >         smp_wmb();
> >  }
> > diff --git a/net/packet/diag.c b/net/packet/diag.c
> > index 6ce1dcc28..c8f43e0c1 100644
> > --- a/net/packet/diag.c
> > +++ b/net/packet/diag.c
> > @@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
> >         pdr.pdr_frame_nr = ring->frame_max + 1;
> >
> >         if (ver > TPACKET_V2) {
> > -               pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
> > +               pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
> >                 pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
> >                 pdr.pdr_features = ring->prb_bdqc.feature_req_word;
> >         } else {
> > diff --git a/net/packet/internal.h b/net/packet/internal.h
> > index d367b9f93..f8cfd9213 100644
> > --- a/net/packet/internal.h
> > +++ b/net/packet/internal.h
> > @@ -20,7 +20,6 @@ struct tpacket_kbdq_core {
> >         unsigned int    feature_req_word;
> >         unsigned int    hdrlen;
> >         unsigned char   reset_pending_on_curr_blk;
> > -       unsigned char   delete_blk_timer;
> >         unsigned short  kactive_blk_num;
> >         unsigned short  blk_sizeof_priv;
> >
> > @@ -39,12 +38,11 @@ struct tpacket_kbdq_core {
> >         /* Default is set to 8ms */
> >  #define DEFAULT_PRB_RETIRE_TOV (8)
> >
> > -       unsigned short  retire_blk_tov;
> > +       ktime_t         interval_ktime;
> >         unsigned short  version;
> > -       unsigned long   tov_in_jiffies;
> >
> >         /* timer to retire an outstanding block */
> > -       struct timer_list retire_blk_timer;
> > +       struct hrtimer  retire_blk_timer;
> >  };
>
> The whole structure needs a new organization?
>
> Before:
>         /* size: 152, cachelines: 3, members: 22 */
>         /* sum members: 144, holes: 2, sum holes: 8 */
>         /* paddings: 1, sum paddings: 4 */
>         /* last cacheline: 24 bytes */
> After:
>         /* size: 176, cachelines: 3, members: 19 */
>         /* sum members: 163, holes: 4, sum holes: 13 */
>         /* paddings: 1, sum paddings: 4 */
>         /* forced alignments: 1, forced holes: 1, sum forced holes: 6 */
>         /* last cacheline: 48 bytes */
>
> Thanks,
> Jason
>
> >
> >  struct pgv {
> > --
> > 2.34.1
> >
> >

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
@ 2025-09-03 16:17 Xin Zhao
  2025-09-04  2:50 ` Jason Xing
  0 siblings, 1 reply; 19+ messages in thread
From: Xin Zhao @ 2025-09-03 16:17 UTC (permalink / raw)
  To: kerneljasonxing, willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

On Tue, Sep 2, 2025 at 23:43 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:

> >         p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
> >         prb_init_ft_ops(p1, req_u);
> > -       prb_setup_retire_blk_timer(po);
> > +       hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> > +                     CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
> > +       hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
> > +                     HRTIMER_MODE_REL_SOFT);
> 
> You expect to see it start at the setsockopt phase? Even if it's far
> from the real use of recv at the moment.
> 
> >         prb_open_block(p1, pbd);
> >  }

Before applying this patch, init_prb_bdqc also start the timer by mod_timer:

init_prb_bdqc
  prb_open_block
    _prb_refresh_rx_retire_blk_timer
      mod_timer

So the current timer's start time is almost the same as it was before applying
the patch.


> > @@ -917,7 +873,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
> >         pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
> >
> >         prb_thaw_queue(pkc1);
> > -       _prb_refresh_rx_retire_blk_timer(pkc1);
> 
> Could you say more on why you remove this here and only reset/update
> the expiry time in the timer handler? Probably I missed something
> appearing in the previous long discussion.
> 
> >
> >         smp_wmb();
> >  }

In the description of [PATCH net-next v10 0/2] net: af_packet: optimize retire operation:

Changes in v7:
  When the callback return, without sk_buff_head lock protection, __run_hrtimer will
  enqueue the timer if return HRTIMER_RESTART. Setting the hrtimer expires while
  enqueuing a timer may cause chaos in the hrtimer red-black tree.

Neither hrtimer_set_expires nor hrtimer_forward_now is allowed when the hrtimer has
already been enqueued. Therefore, the only place where the hrtimer timeout can be set is
within the callback, at which point the hrtimer is in a non-enqueued state and can have
its timeout set.


Changes in v8:
  Simplify the logic related to setting timeout, as suggestd by Willem de Bruijn.
  Currently timer callback just restarts itself unconditionally, so delete the
 'out:' label, do not forward hrtimer in prb_open_block, call hrtimer_forward_now
  directly and always return HRTIMER_RESTART. The only special case is when
  prb_open_block is called from tpacket_rcv. That would set the timeout further
  into the future than the already queued timer. An earlier timeout is not
  problematic. No need to add complexity to avoid that.

This paragraph explains that if the block's retire timeout is not adjusted within
the timer callback, it will only result in an earlier-than-expected retire timeout,
which is not problematic. Therefore, it is unnecessary to increase the logical complexity
to ensure block retire timeout occurs as expected each time.


> The whole structure needs a new organization?
> 
> Before:
>         /* size: 152, cachelines: 3, members: 22 */
>         /* sum members: 144, holes: 2, sum holes: 8 */
>         /* paddings: 1, sum paddings: 4 */
>         /* last cacheline: 24 bytes */
> After:
>         /* size: 176, cachelines: 3, members: 19 */
>         /* sum members: 163, holes: 4, sum holes: 13 */
>         /* paddings: 1, sum paddings: 4 */
>         /* forced alignments: 1, forced holes: 1, sum forced holes: 6 */
>         /* last cacheline: 48 bytes */

What about the following organization:?

/* kbdq - kernel block descriptor queue */
struct tpacket_kbdq_core {
	struct pgv	*pkbdq;
	unsigned int	feature_req_word;
	unsigned int	hdrlen;
	unsigned short	kactive_blk_num;
	unsigned short	blk_sizeof_priv;
	unsigned char	reset_pending_on_curr_blk;

	char		*pkblk_start;
	char		*pkblk_end;
	int		kblk_size;
	unsigned int	max_frame_len;
	unsigned int	knum_blocks;
	char		*prev;
	char		*nxt_offset;

	unsigned short  version;
	
	uint64_t	knxt_seq_num;
	struct sk_buff	*skb;

	rwlock_t	blk_fill_in_prog_lock;

	/* timer to retire an outstanding block */
	struct hrtimer  retire_blk_timer;

	/* Default is set to 8ms */
#define DEFAULT_PRB_RETIRE_TOV	(8)

	ktime_t		interval_ktime;
};



Thanks
Xin Zhao


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
@ 2025-09-03 17:07 Xin Zhao
  2025-09-04  3:26 ` Jason Xing
  0 siblings, 1 reply; 19+ messages in thread
From: Xin Zhao @ 2025-09-03 17:07 UTC (permalink / raw)
  To: kerneljasonxing, willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

On Wed, Sep 3, 2025 at 00:42 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:

> One more review from my side is that as to the removal of
> delete_blk_timer, I'm afraid it deserves a clarification in the commit
> message.
> 
> > > -       spin_unlock_bh(&rb_queue->lock);
> > > -
> > > -       prb_del_retire_blk_timer(pkc);
> > > -}
> > > -

In the description of [PATCH net-next v10 0/2] net: af_packet: optimize retire operation:

Changes in v8:
- Delete delete_blk_timer field, as suggested by Willem de Bruijn,
  hrtimer_cancel will check and wait until the timer callback return and ensure
  enter enter callback again;

I will also emphasize the removal of delete_blk_timer in the commit message for this 2/2
commit. The updated commit message for the 2/2 patch is as follows:

Changes in v8:
- Simplify the logic related to setting timeout.
- Delete delete_blk_timer field, hrtimer_cancel will check and wait until
  the timer callback return.


> I gradually understand your thought behind this modification. You're
> trying to move the timer operation out of prb_open_block() and then
> spread the timer operation into each caller.
> 
> You probably miss the following call trace:
> packet_current_rx_frame() -> __packet_lookup_frame_in_block() ->
> prb_open_block() -> _prb_refresh_rx_retire_blk_timer()
> ?
> 
> May I ask why bother introducing so many changes like this instead of
> leaving it as-is?




Consider the following timing sequence:
timer   cpu0 (softirq context, hrtimer timeout)                cpu1 (process context)
0       hrtimer_run_softirq
1         __hrtimer_run_queues
2           __run_hrtimer
3             prb_retire_rx_blk_timer_expired
4               spin_lock(&po->sk.sk_receive_queue.lock);
5               _prb_refresh_rx_retire_blk_timer
6                 hrtimer_forward_now
7               spin_unlock(&po->sk.sk_receive_queue.lock)
8             raw_spin_lock_irq(&cpu_base->lock);              tpacket_rcv
9             enqueue_hrtimer                                    spin_lock(&sk->sk_receive_queue.lock);
10                                                               packet_current_rx_frame
11                                                                 __packet_lookup_frame_in_block
12            finish enqueue_hrtimer                                 prb_open_block
13                                                                     _prb_refresh_rx_retire_blk_timer
14                                                                       hrtimer_is_queued(&pkc->retire_blk_timer) == true
15                                                                       hrtimer_forward_now
16                                                                         WARN_ON
On cpu0 in the timing sequence above, enqueue_hrtimer is not protected by sk_receive_queue.lock,
while the hrtimer_forward_now is not protected by raw_spin_lock_irq(&cpu_base->lock).

In my previous email, I provided an explanation. As a supplement, I would
like to reiterate a paragraph from my earlier response to Willem.
The point is that when the hrtimer is in the enqueued state, you cannot
call interfaces like hrtimer_forward_now. The kernel has a WARN_ON check
in hrtimer_forward_now for this reason. Similarly, you also cannot call
interfaces like hrtimer_set_expires. The kernel does not include a WARN_ON
check in hrtimer_set_expires to avoid increasing the code size, as
hrtimer_set_expires is an inline function.


Thanks
Xin Zhao


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-09-03 16:17 Xin Zhao
@ 2025-09-04  2:50 ` Jason Xing
  0 siblings, 0 replies; 19+ messages in thread
From: Jason Xing @ 2025-09-04  2:50 UTC (permalink / raw)
  To: Xin Zhao
  Cc: willemdebruijn.kernel, edumazet, ferenc, davem, kuba, pabeni,
	horms, netdev, linux-kernel

On Thu, Sep 4, 2025 at 12:17 AM Xin Zhao <jackzxcui1989@163.com> wrote:
>
> On Tue, Sep 2, 2025 at 23:43 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> > >         p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
> > >         prb_init_ft_ops(p1, req_u);
> > > -       prb_setup_retire_blk_timer(po);
> > > +       hrtimer_setup(&p1->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> > > +                     CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
> > > +       hrtimer_start(&p1->retire_blk_timer, p1->interval_ktime,
> > > +                     HRTIMER_MODE_REL_SOFT);
> >
> > You expect to see it start at the setsockopt phase? Even if it's far
> > from the real use of recv at the moment.
> >
> > >         prb_open_block(p1, pbd);
> > >  }
>
> Before applying this patch, init_prb_bdqc also start the timer by mod_timer:
>
> init_prb_bdqc
>   prb_open_block
>     _prb_refresh_rx_retire_blk_timer
>       mod_timer
>
> So the current timer's start time is almost the same as it was before applying
> the patch.
>
>
> > > @@ -917,7 +873,6 @@ static void prb_open_block(struct tpacket_kbdq_core *pkc1,
> > >         pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
> > >
> > >         prb_thaw_queue(pkc1);
> > > -       _prb_refresh_rx_retire_blk_timer(pkc1);
> >
> > Could you say more on why you remove this here and only reset/update
> > the expiry time in the timer handler? Probably I missed something
> > appearing in the previous long discussion.
> >
> > >
> > >         smp_wmb();
> > >  }
>
> In the description of [PATCH net-next v10 0/2] net: af_packet: optimize retire operation:
>
> Changes in v7:
>   When the callback return, without sk_buff_head lock protection, __run_hrtimer will
>   enqueue the timer if return HRTIMER_RESTART. Setting the hrtimer expires while
>   enqueuing a timer may cause chaos in the hrtimer red-black tree.
>
> Neither hrtimer_set_expires nor hrtimer_forward_now is allowed when the hrtimer has
> already been enqueued. Therefore, the only place where the hrtimer timeout can be set is
> within the callback, at which point the hrtimer is in a non-enqueued state and can have
> its timeout set.

Can we use hrtimer_is_queued() instead? Please see tcp_pacing_check()
as an example. But considering your following explanation, I think
it's okay now.

>
>
> Changes in v8:
>   Simplify the logic related to setting timeout, as suggestd by Willem de Bruijn.
>   Currently timer callback just restarts itself unconditionally, so delete the
>  'out:' label, do not forward hrtimer in prb_open_block, call hrtimer_forward_now
>   directly and always return HRTIMER_RESTART. The only special case is when
>   prb_open_block is called from tpacket_rcv. That would set the timeout further
>   into the future than the already queued timer. An earlier timeout is not
>   problematic. No need to add complexity to avoid that.
>
> This paragraph explains that if the block's retire timeout is not adjusted within
> the timer callback, it will only result in an earlier-than-expected retire timeout,
> which is not problematic. Therefore, it is unnecessary to increase the logical complexity
> to ensure block retire timeout occurs as expected each time.

Sounds fair.

>
>
> > The whole structure needs a new organization?
> >
> > Before:
> >         /* size: 152, cachelines: 3, members: 22 */
> >         /* sum members: 144, holes: 2, sum holes: 8 */
> >         /* paddings: 1, sum paddings: 4 */
> >         /* last cacheline: 24 bytes */
> > After:
> >         /* size: 176, cachelines: 3, members: 19 */
> >         /* sum members: 163, holes: 4, sum holes: 13 */
> >         /* paddings: 1, sum paddings: 4 */
> >         /* forced alignments: 1, forced holes: 1, sum forced holes: 6 */
> >         /* last cacheline: 48 bytes */
>
> What about the following organization:?
>
> /* kbdq - kernel block descriptor queue */
> struct tpacket_kbdq_core {
>         struct pgv      *pkbdq;
>         unsigned int    feature_req_word;
>         unsigned int    hdrlen;
>         unsigned short  kactive_blk_num;
>         unsigned short  blk_sizeof_priv;
>         unsigned char   reset_pending_on_curr_blk;
>
>         char            *pkblk_start;
>         char            *pkblk_end;
>         int             kblk_size;
>         unsigned int    max_frame_len;
>         unsigned int    knum_blocks;
>         char            *prev;
>         char            *nxt_offset;
>
>         unsigned short  version;
>
>         uint64_t        knxt_seq_num;
>         struct sk_buff  *skb;
>
>         rwlock_t        blk_fill_in_prog_lock;
>
>         /* timer to retire an outstanding block */
>         struct hrtimer  retire_blk_timer;
>
>         /* Default is set to 8ms */
> #define DEFAULT_PRB_RETIRE_TOV  (8)
>
>         ktime_t         interval_ktime;
> };

Could you share the result after running 'pahole --hex -C
tpacket_kbdq_core vmlinux'?

Thanks,
Jason

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-09-03 17:07 Xin Zhao
@ 2025-09-04  3:26 ` Jason Xing
  0 siblings, 0 replies; 19+ messages in thread
From: Jason Xing @ 2025-09-04  3:26 UTC (permalink / raw)
  To: Xin Zhao
  Cc: willemdebruijn.kernel, edumazet, ferenc, davem, kuba, pabeni,
	horms, netdev, linux-kernel

On Thu, Sep 4, 2025 at 1:07 AM Xin Zhao <jackzxcui1989@163.com> wrote:
>
> On Wed, Sep 3, 2025 at 00:42 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> > One more review from my side is that as to the removal of
> > delete_blk_timer, I'm afraid it deserves a clarification in the commit
> > message.
> >
> > > > -       spin_unlock_bh(&rb_queue->lock);
> > > > -
> > > > -       prb_del_retire_blk_timer(pkc);
> > > > -}
> > > > -
>
> In the description of [PATCH net-next v10 0/2] net: af_packet: optimize retire operation:
>
> Changes in v8:
> - Delete delete_blk_timer field, as suggested by Willem de Bruijn,
>   hrtimer_cancel will check and wait until the timer callback return and ensure
>   enter enter callback again;

I see the reason now :)

Please know that the history changes through versions will finally be
removed, only the official message that will be kept in the git. So
this kind of change, I think, should be clarified officially since
you're removing a structure member. Adding more descriptions will be
helpful to readers in the future. Thank you.

>
> I will also emphasize the removal of delete_blk_timer in the commit message for this 2/2
> commit. The updated commit message for the 2/2 patch is as follows:
>
> Changes in v8:
> - Simplify the logic related to setting timeout.
> - Delete delete_blk_timer field, hrtimer_cancel will check and wait until
>   the timer callback return.
>
>
> > I gradually understand your thought behind this modification. You're
> > trying to move the timer operation out of prb_open_block() and then
> > spread the timer operation into each caller.
> >
> > You probably miss the following call trace:
> > packet_current_rx_frame() -> __packet_lookup_frame_in_block() ->
> > prb_open_block() -> _prb_refresh_rx_retire_blk_timer()
> > ?
> >
> > May I ask why bother introducing so many changes like this instead of
> > leaving it as-is?
>
>
>
>
> Consider the following timing sequence:
> timer   cpu0 (softirq context, hrtimer timeout)                cpu1 (process context)
> 0       hrtimer_run_softirq
> 1         __hrtimer_run_queues
> 2           __run_hrtimer
> 3             prb_retire_rx_blk_timer_expired
> 4               spin_lock(&po->sk.sk_receive_queue.lock);
> 5               _prb_refresh_rx_retire_blk_timer
> 6                 hrtimer_forward_now
> 7               spin_unlock(&po->sk.sk_receive_queue.lock)
> 8             raw_spin_lock_irq(&cpu_base->lock);              tpacket_rcv
> 9             enqueue_hrtimer                                    spin_lock(&sk->sk_receive_queue.lock);
> 10                                                               packet_current_rx_frame
> 11                                                                 __packet_lookup_frame_in_block
> 12            finish enqueue_hrtimer                                 prb_open_block
> 13                                                                     _prb_refresh_rx_retire_blk_timer
> 14                                                                       hrtimer_is_queued(&pkc->retire_blk_timer) == true
> 15                                                                       hrtimer_forward_now
> 16                                                                         WARN_ON
> On cpu0 in the timing sequence above, enqueue_hrtimer is not protected by sk_receive_queue.lock,
> while the hrtimer_forward_now is not protected by raw_spin_lock_irq(&cpu_base->lock).
>
> In my previous email, I provided an explanation. As a supplement, I would
> like to reiterate a paragraph from my earlier response to Willem.
> The point is that when the hrtimer is in the enqueued state, you cannot

How about tring hrtimer_is_queued() beforehand?

IIUC, with this patch applied, we will lose the opportunity to refresh
the timer when the lookup function (in the above path I mentioned)
gets called compared to before. If the packet socket tries to look up
a new block and it doesn't update its expiry time, the timer will soon
wake up. Does it sound unreasonable?

Thanks,
Jason

> call interfaces like hrtimer_forward_now. The kernel has a WARN_ON check
> in hrtimer_forward_now for this reason. Similarly, you also cannot call
> interfaces like hrtimer_set_expires. The kernel does not include a WARN_ON
> check in hrtimer_set_expires to avoid increasing the code size, as
> hrtimer_set_expires is an inline function.
>
>
> Thanks
> Xin Zhao
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
@ 2025-09-04 14:59 Xin Zhao
  2025-09-05  0:09 ` Jason Xing
  0 siblings, 1 reply; 19+ messages in thread
From: Xin Zhao @ 2025-09-04 14:59 UTC (permalink / raw)
  To: kerneljasonxing, willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

On Thu, Sep 4, 2025 at 10:50 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:

> > In the description of [PATCH net-next v10 0/2] net: af_packet: optimize retire operation:
> >
> > Changes in v7:
> >   When the callback return, without sk_buff_head lock protection, __run_hrtimer will
> >   enqueue the timer if return HRTIMER_RESTART. Setting the hrtimer expires while
> >   enqueuing a timer may cause chaos in the hrtimer red-black tree.
> >
> > Neither hrtimer_set_expires nor hrtimer_forward_now is allowed when the hrtimer has
> > already been enqueued. Therefore, the only place where the hrtimer timeout can be set is
> > within the callback, at which point the hrtimer is in a non-enqueued state and can have
> > its timeout set.
> 
> Can we use hrtimer_is_queued() instead? Please see tcp_pacing_check()
> as an example. But considering your following explanation, I think
> it's okay now.

Okay, let's keep the current logic as it is.



> > /* kbdq - kernel block descriptor queue */
> > struct tpacket_kbdq_core {
> >         struct pgv      *pkbdq;
> >         unsigned int    feature_req_word;
> >         unsigned int    hdrlen;
> >         unsigned short  kactive_blk_num;
> >         unsigned short  blk_sizeof_priv;
> >         unsigned char   reset_pending_on_curr_blk;
> >
> >         char            *pkblk_start;
> >         char            *pkblk_end;
> >         int             kblk_size;
> >         unsigned int    max_frame_len;
> >         unsigned int    knum_blocks;
> >         char            *prev;
> >         char            *nxt_offset;
> >
> >         unsigned short  version;
> >
> >         uint64_t        knxt_seq_num;
> >         struct sk_buff  *skb;
> >
> >         rwlock_t        blk_fill_in_prog_lock;
> >
> >         /* timer to retire an outstanding block */
> >         struct hrtimer  retire_blk_timer;
> >
> >         /* Default is set to 8ms */
> > #define DEFAULT_PRB_RETIRE_TOV  (8)
> >
> >         ktime_t         interval_ktime;
> > };
> 
> Could you share the result after running 'pahole --hex -C
> tpacket_kbdq_core vmlinux'?

I change the struct tpacket_kbdq_core as following:

/* kbdq - kernel block descriptor queue */
struct tpacket_kbdq_core {
	struct pgv	*pkbdq;
	unsigned int	feature_req_word;
	unsigned int	hdrlen;
	unsigned char	reset_pending_on_curr_blk;
	unsigned short	kactive_blk_num;
	unsigned short	blk_sizeof_priv;

	unsigned short  version;

	char		*pkblk_start;
	char		*pkblk_end;
	int		kblk_size;
	unsigned int	max_frame_len;
	unsigned int	knum_blocks;
	char		*prev;
	char		*nxt_offset;

	uint64_t	knxt_seq_num;
	struct sk_buff	*skb;

	rwlock_t	blk_fill_in_prog_lock;

	/* timer to retire an outstanding block */
	struct hrtimer  retire_blk_timer;

	/* Default is set to 8ms */
#define DEFAULT_PRB_RETIRE_TOV	(8)

	ktime_t		interval_ktime;
};


pahole --hex -C tpacket_kbdq_core vmlinux

running results:

struct tpacket_kbdq_core {
        struct pgv *               pkbdq;                /*     0   0x8 */
        unsigned int               feature_req_word;     /*   0x8   0x4 */
        unsigned int               hdrlen;               /*   0xc   0x4 */
        unsigned char              reset_pending_on_curr_blk; /*  0x10   0x1 */

        /* XXX 1 byte hole, try to pack */

        short unsigned int         kactive_blk_num;      /*  0x12   0x2 */
        short unsigned int         blk_sizeof_priv;      /*  0x14   0x2 */
        short unsigned int         version;              /*  0x16   0x2 */
        char *                     pkblk_start;          /*  0x18   0x8 */
        char *                     pkblk_end;            /*  0x20   0x8 */
        int                        kblk_size;            /*  0x28   0x4 */
        unsigned int               max_frame_len;        /*  0x2c   0x4 */
        unsigned int               knum_blocks;          /*  0x30   0x4 */

        /* XXX 4 bytes hole, try to pack */

        char *                     prev;                 /*  0x38   0x8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        char *                     nxt_offset;           /*  0x40   0x8 */
        uint64_t                   knxt_seq_num;         /*  0x48   0x8 */
        struct sk_buff *           skb;                  /*  0x50   0x8 */
        rwlock_t                   blk_fill_in_prog_lock; /*  0x58  0x30 */
        /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
        struct hrtimer             retire_blk_timer __attribute__((__aligned__(8))); /*  0x88  0x40 */

        /* XXX last struct has 4 bytes of padding */

        /* --- cacheline 3 boundary (192 bytes) was 8 bytes ago --- */
        ktime_t                    interval_ktime;       /*  0xc8   0x8 */

        /* size: 208, cachelines: 4, members: 19 */
        /* sum members: 203, holes: 2, sum holes: 5 */
        /* paddings: 1, sum paddings: 4 */
        /* forced alignments: 1 */
        /* last cacheline: 16 bytes */
} __attribute__((__aligned__(8)));


Thanks
Xin Zhao


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-09-04 14:59 Xin Zhao
@ 2025-09-05  0:09 ` Jason Xing
  0 siblings, 0 replies; 19+ messages in thread
From: Jason Xing @ 2025-09-05  0:09 UTC (permalink / raw)
  To: Xin Zhao
  Cc: willemdebruijn.kernel, edumazet, ferenc, davem, kuba, pabeni,
	horms, netdev, linux-kernel

On Thu, Sep 4, 2025 at 11:00 PM Xin Zhao <jackzxcui1989@163.com> wrote:
>
> On Thu, Sep 4, 2025 at 10:50 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> > > In the description of [PATCH net-next v10 0/2] net: af_packet: optimize retire operation:
> > >
> > > Changes in v7:
> > >   When the callback return, without sk_buff_head lock protection, __run_hrtimer will
> > >   enqueue the timer if return HRTIMER_RESTART. Setting the hrtimer expires while
> > >   enqueuing a timer may cause chaos in the hrtimer red-black tree.
> > >
> > > Neither hrtimer_set_expires nor hrtimer_forward_now is allowed when the hrtimer has
> > > already been enqueued. Therefore, the only place where the hrtimer timeout can be set is
> > > within the callback, at which point the hrtimer is in a non-enqueued state and can have
> > > its timeout set.
> >
> > Can we use hrtimer_is_queued() instead? Please see tcp_pacing_check()
> > as an example. But considering your following explanation, I think
> > it's okay now.
>
> Okay, let's keep the current logic as it is.

In case I didn't clearly state it, you don't need to change the
overall logic but only add back one missing point as I replied in the
last email? That is lookup path needing to refresh/update the timer.

>
>
>
> > > /* kbdq - kernel block descriptor queue */
> > > struct tpacket_kbdq_core {
> > >         struct pgv      *pkbdq;
> > >         unsigned int    feature_req_word;
> > >         unsigned int    hdrlen;
> > >         unsigned short  kactive_blk_num;
> > >         unsigned short  blk_sizeof_priv;
> > >         unsigned char   reset_pending_on_curr_blk;
> > >
> > >         char            *pkblk_start;
> > >         char            *pkblk_end;
> > >         int             kblk_size;
> > >         unsigned int    max_frame_len;
> > >         unsigned int    knum_blocks;
> > >         char            *prev;
> > >         char            *nxt_offset;
> > >
> > >         unsigned short  version;
> > >
> > >         uint64_t        knxt_seq_num;
> > >         struct sk_buff  *skb;
> > >
> > >         rwlock_t        blk_fill_in_prog_lock;
> > >
> > >         /* timer to retire an outstanding block */
> > >         struct hrtimer  retire_blk_timer;
> > >
> > >         /* Default is set to 8ms */
> > > #define DEFAULT_PRB_RETIRE_TOV  (8)
> > >
> > >         ktime_t         interval_ktime;
> > > };
> >
> > Could you share the result after running 'pahole --hex -C
> > tpacket_kbdq_core vmlinux'?
>
> I change the struct tpacket_kbdq_core as following:
>
> /* kbdq - kernel block descriptor queue */
> struct tpacket_kbdq_core {
>         struct pgv      *pkbdq;
>         unsigned int    feature_req_word;
>         unsigned int    hdrlen;
>         unsigned char   reset_pending_on_curr_blk;
>         unsigned short  kactive_blk_num;
>         unsigned short  blk_sizeof_priv;
>
>         unsigned short  version;
>
>         char            *pkblk_start;
>         char            *pkblk_end;
>         int             kblk_size;
>         unsigned int    max_frame_len;
>         unsigned int    knum_blocks;
>         char            *prev;
>         char            *nxt_offset;
>
>         uint64_t        knxt_seq_num;
>         struct sk_buff  *skb;
>
>         rwlock_t        blk_fill_in_prog_lock;
>
>         /* timer to retire an outstanding block */
>         struct hrtimer  retire_blk_timer;
>
>         /* Default is set to 8ms */
> #define DEFAULT_PRB_RETIRE_TOV  (8)
>
>         ktime_t         interval_ktime;
> };
>
>
> pahole --hex -C tpacket_kbdq_core vmlinux
>
> running results:
>
> struct tpacket_kbdq_core {
>         struct pgv *               pkbdq;                /*     0   0x8 */
>         unsigned int               feature_req_word;     /*   0x8   0x4 */
>         unsigned int               hdrlen;               /*   0xc   0x4 */
>         unsigned char              reset_pending_on_curr_blk; /*  0x10   0x1 */
>
>         /* XXX 1 byte hole, try to pack */
>
>         short unsigned int         kactive_blk_num;      /*  0x12   0x2 */
>         short unsigned int         blk_sizeof_priv;      /*  0x14   0x2 */
>         short unsigned int         version;              /*  0x16   0x2 */
>         char *                     pkblk_start;          /*  0x18   0x8 */
>         char *                     pkblk_end;            /*  0x20   0x8 */
>         int                        kblk_size;            /*  0x28   0x4 */
>         unsigned int               max_frame_len;        /*  0x2c   0x4 */
>         unsigned int               knum_blocks;          /*  0x30   0x4 */
>
>         /* XXX 4 bytes hole, try to pack */
>
>         char *                     prev;                 /*  0x38   0x8 */
>         /* --- cacheline 1 boundary (64 bytes) --- */
>         char *                     nxt_offset;           /*  0x40   0x8 */
>         uint64_t                   knxt_seq_num;         /*  0x48   0x8 */
>         struct sk_buff *           skb;                  /*  0x50   0x8 */
>         rwlock_t                   blk_fill_in_prog_lock; /*  0x58  0x30 */
>         /* --- cacheline 2 boundary (128 bytes) was 8 bytes ago --- */
>         struct hrtimer             retire_blk_timer __attribute__((__aligned__(8))); /*  0x88  0x40 */
>
>         /* XXX last struct has 4 bytes of padding */
>
>         /* --- cacheline 3 boundary (192 bytes) was 8 bytes ago --- */
>         ktime_t                    interval_ktime;       /*  0xc8   0x8 */
>
>         /* size: 208, cachelines: 4, members: 19 */
>         /* sum members: 203, holes: 2, sum holes: 5 */
>         /* paddings: 1, sum paddings: 4 */
>         /* forced alignments: 1 */
>         /* last cacheline: 16 bytes */
> } __attribute__((__aligned__(8)));

How about this one? The 'size' would be shrinked to168 and the 'sum
holes' remains 5.
# pahole --hex -C tpacket_kbdq_core vmlinux
struct tpacket_kbdq_core {
        struct pgv *               pkbdq;                /*     0   0x8 */
        unsigned int               feature_req_word;     /*   0x8   0x4 */
        unsigned int               hdrlen;               /*   0xc   0x4 */
        short unsigned int         kactive_blk_num;      /*  0x10   0x2 */
        short unsigned int         blk_sizeof_priv;      /*  0x12   0x2 */
        short unsigned int         version;              /*  0x14   0x2 */
        unsigned char              reset_pending_on_curr_blk; /*  0x16   0x1 */

        /* XXX 1 byte hole, try to pack */

        char *                     pkblk_start;          /*  0x18   0x8 */
        char *                     pkblk_end;            /*  0x20   0x8 */
        int                        kblk_size;            /*  0x28   0x4 */
        unsigned int               max_frame_len;        /*  0x2c   0x4 */
        unsigned int               knum_blocks;          /*  0x30   0x4 */

        /* XXX 4 bytes hole, try to pack */

        uint64_t                   knxt_seq_num;         /*  0x38   0x8 */
        /* --- cacheline 1 boundary (64 bytes) --- */
        char *                     prev;                 /*  0x40   0x8 */
        char *                     nxt_offset;           /*  0x48   0x8 */
        struct sk_buff *           skb;                  /*  0x50   0x8 */
        rwlock_t                   blk_fill_in_prog_lock; /*  0x58   0x8 */
        ktime_t                    interval_ktime;       /*  0x60   0x8 */
        struct hrtimer             retire_blk_timer
__attribute__((__aligned__(8))); /*  0x68  0x40 */

        /* XXX last struct has 4 bytes of padding */

        /* size: 168, cachelines: 3, members: 19 */
        /* sum members: 163, holes: 2, sum holes: 5 */
        /* paddings: 1, sum paddings: 4 */
        /* forced alignments: 1 */
        /* last cacheline: 40 bytes */
} __attribute__((__aligned__(8)));

I didn't want a more aggressive adjustment around the remaining holes.
At least, the current statistics show a better result than before :)

Thanks,
Jason

>
> Thanks
> Xin Zhao
>

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
@ 2025-09-05  4:00 Xin Zhao
  2025-09-05  6:03 ` Jason Xing
  0 siblings, 1 reply; 19+ messages in thread
From: Xin Zhao @ 2025-09-05  4:00 UTC (permalink / raw)
  To: kerneljasonxing, willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

On Thu, Sep 4, 2025 at 11:26 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:

> > In the description of [PATCH net-next v10 0/2] net: af_packet: optimize retire operation:
> >
> > Changes in v8:
> > - Delete delete_blk_timer field, as suggested by Willem de Bruijn,
> >   hrtimer_cancel will check and wait until the timer callback return and ensure
> >   enter enter callback again;
> 
> I see the reason now :)
> 
> Please know that the history changes through versions will finally be
> removed, only the official message that will be kept in the git. So
> this kind of change, I think, should be clarified officially since
> you're removing a structure member. Adding more descriptions will be
> helpful to readers in the future. Thank you.

I will add some more information to the commit message of this 2/2 PATCH.



> > Consider the following timing sequence:
> > timer   cpu0 (softirq context, hrtimer timeout)                cpu1 (process context)
> > 0       hrtimer_run_softirq
> > 1         __hrtimer_run_queues
> > 2           __run_hrtimer
> > 3             prb_retire_rx_blk_timer_expired
> > 4               spin_lock(&po->sk.sk_receive_queue.lock);
> > 5               _prb_refresh_rx_retire_blk_timer
> > 6                 hrtimer_forward_now
> > 7               spin_unlock(&po->sk.sk_receive_queue.lock)
> > 8             raw_spin_lock_irq(&cpu_base->lock);              tpacket_rcv
> > 9             enqueue_hrtimer                                    spin_lock(&sk->sk_receive_queue.lock);
> > 10                                                               packet_current_rx_frame
> > 11                                                                 __packet_lookup_frame_in_block
> > 12            finish enqueue_hrtimer                                 prb_open_block
> > 13                                                                     _prb_refresh_rx_retire_blk_timer
> > 14                                                                       hrtimer_is_queued(&pkc->retire_blk_timer) == true
> > 15                                                                       hrtimer_forward_now
> > 16                                                                         WARN_ON
> > On cpu0 in the timing sequence above, enqueue_hrtimer is not protected by sk_receive_queue.lock,
> > while the hrtimer_forward_now is not protected by raw_spin_lock_irq(&cpu_base->lock).
> >
> > In my previous email, I provided an explanation. As a supplement, I would
> > like to reiterate a paragraph from my earlier response to Willem.
> > The point is that when the hrtimer is in the enqueued state, you cannot
> 
> How about tring hrtimer_is_queued() beforehand?
> 
> IIUC, with this patch applied, we will lose the opportunity to refresh
> the timer when the lookup function (in the above path I mentioned)
> gets called compared to before. If the packet socket tries to look up
> a new block and it doesn't update its expiry time, the timer will soon
> wake up. Does it sound unreasonable?


I actually pointed out the issue with the timeout setting in a previous email:
https://lore.kernel.org/netdev/20250826030328.878001-1-jackzxcui1989@163.com/.

Regarding the method you mentioned, using hrtimer_is_queued to assist in judgment, I had
discussed this extensively with Willem in previous emails, and the conclusion was that
it is not feasible. The reason is that in our scenario, the hrtimer always returns
HRTIMER_RESTART, unlike the places you pointed out, such as tcp_pacing_check, where the
corresponding hrtimer callbacks all return HRTIMER_NORESTART. Since our scenario returns
HRTIMER_RESTART, this can lead to many troublesome issues. The fundamental reason is that
if HRTIMER_RESTART is returned, the hrtimer module will enqueue the hrtimer after the
callback returns, which leads to exiting the protection of our sk_receive_queue lock.

Returning to the functionality here, if we really want to update the hrtimer's timeout
outside of the timer callback, there are two key points to note:

1. Accurately knowing whether the current context is a timer callback or tpacket_rcv.
2. How to update the hrtimer's timeout in a non-timer callback scenario.

To start with the first point, it has already been explained in previous emails that
executing hrtimer_forward outside of a timer callback is not allowed. Therefore, we
must accurately determine whether we are in a timer callback; only in that context can
we use the hrtimer_forward function to update.
In the original code, since the same _prb_refresh_rx_retire_blk_timer function was called,
distinguishing between contexts required code restructuring. Now that this patch removes
the _prb_refresh_rx_retire_blk_timer function, achieving this accurate distinction is not
too difficult.
The key issue is the second point. If we are not inside the hrtimer's callback, we cannot
use hrtimer_forward to update the timeout. So what other interface can we use? You might
suggest using hrtimer_start, but fundamentally, hrtimer_start cannot be called if it has
already been started previously. Therefore, wouldn’t you need to add hrtimer_cancel to
confirm that the hrtimer has been canceled? Once hrtimer_cancel is added, there will also
be scenarios where it is restarted, which means we need to consider the concurrent
scenario when the socket exits and also calls hrtimer_cancel. This might require adding
logic for that concurrency scenario, and you might even need to reintroduce the
delete_blk_timer variable to indicate whether the packet_release operation has been
triggered so that the hrtimer does not restart in the tpacket_rcv scenario.

In fact, in a previous v7 version, I proposed a change that I personally thought was
quite good, which can be seen here:
https://lore.kernel.org/netdev/20250822132051.266787-1-jackzxcui1989@163.com/. However,
this change introduced an additional variable and more logic. Willem also pointed out
that the added complexity to avoid a non-problematic issue was unnecessary.

As mentioned in Changes in v8:
  The only special case is when prb_open_block is called from tpacket_rcv.
  That would set the timeout further into the future than the already queued
  timer. An earlier timeout is not problematic. No need to add complexity to
  avoid that.

It is not problematic, as Willem point it out in
https://lore.kernel.org/netdev/willemdebruijn.kernel.2d7599ee951fd@gmail.com/


In the end:

So, if you agree with the current changes in v10 and do not wish to add the timeout
setting under tpacket_rcv, that’s fine.
If you do not agree, then the only alternative I can think of is to use a combination
of hrtimer_cancel and hrtimer_start under prb_open_block, and we would also need to
reintroduce the delete_blk_timer variable to help determine whether the hrtimer was
canceled due to the packet_release behavior. If we really want to make this change,
it does add quite a bit of logic, and we would also need Willem's agreement.


Thanks
Xin Zhao


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-09-05  4:00 Xin Zhao
@ 2025-09-05  6:03 ` Jason Xing
  2025-09-05  6:45   ` Jason Xing
  0 siblings, 1 reply; 19+ messages in thread
From: Jason Xing @ 2025-09-05  6:03 UTC (permalink / raw)
  To: Xin Zhao
  Cc: willemdebruijn.kernel, edumazet, ferenc, davem, kuba, pabeni,
	horms, netdev, linux-kernel

On Fri, Sep 5, 2025 at 12:01 PM Xin Zhao <jackzxcui1989@163.com> wrote:
>
> On Thu, Sep 4, 2025 at 11:26 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> > > In the description of [PATCH net-next v10 0/2] net: af_packet: optimize retire operation:
> > >
> > > Changes in v8:
> > > - Delete delete_blk_timer field, as suggested by Willem de Bruijn,
> > >   hrtimer_cancel will check and wait until the timer callback return and ensure
> > >   enter enter callback again;
> >
> > I see the reason now :)
> >
> > Please know that the history changes through versions will finally be
> > removed, only the official message that will be kept in the git. So
> > this kind of change, I think, should be clarified officially since
> > you're removing a structure member. Adding more descriptions will be
> > helpful to readers in the future. Thank you.
>
> I will add some more information to the commit message of this 2/2 PATCH.
>
>
>
> > > Consider the following timing sequence:
> > > timer   cpu0 (softirq context, hrtimer timeout)                cpu1 (process context)
> > > 0       hrtimer_run_softirq
> > > 1         __hrtimer_run_queues
> > > 2           __run_hrtimer
> > > 3             prb_retire_rx_blk_timer_expired
> > > 4               spin_lock(&po->sk.sk_receive_queue.lock);
> > > 5               _prb_refresh_rx_retire_blk_timer
> > > 6                 hrtimer_forward_now
> > > 7               spin_unlock(&po->sk.sk_receive_queue.lock)
> > > 8             raw_spin_lock_irq(&cpu_base->lock);              tpacket_rcv
> > > 9             enqueue_hrtimer                                    spin_lock(&sk->sk_receive_queue.lock);
> > > 10                                                               packet_current_rx_frame
> > > 11                                                                 __packet_lookup_frame_in_block
> > > 12            finish enqueue_hrtimer                                 prb_open_block
> > > 13                                                                     _prb_refresh_rx_retire_blk_timer
> > > 14                                                                       hrtimer_is_queued(&pkc->retire_blk_timer) == true
> > > 15                                                                       hrtimer_forward_now
> > > 16                                                                         WARN_ON
> > > On cpu0 in the timing sequence above, enqueue_hrtimer is not protected by sk_receive_queue.lock,
> > > while the hrtimer_forward_now is not protected by raw_spin_lock_irq(&cpu_base->lock).
> > >
> > > In my previous email, I provided an explanation. As a supplement, I would
> > > like to reiterate a paragraph from my earlier response to Willem.
> > > The point is that when the hrtimer is in the enqueued state, you cannot
> >
> > How about tring hrtimer_is_queued() beforehand?
> >
> > IIUC, with this patch applied, we will lose the opportunity to refresh
> > the timer when the lookup function (in the above path I mentioned)
> > gets called compared to before. If the packet socket tries to look up
> > a new block and it doesn't update its expiry time, the timer will soon
> > wake up. Does it sound unreasonable?
>
>
> I actually pointed out the issue with the timeout setting in a previous email:
> https://lore.kernel.org/netdev/20250826030328.878001-1-jackzxcui1989@163.com/.
>
> Regarding the method you mentioned, using hrtimer_is_queued to assist in judgment, I had
> discussed this extensively with Willem in previous emails, and the conclusion was that
> it is not feasible. The reason is that in our scenario, the hrtimer always returns
> HRTIMER_RESTART, unlike the places you pointed out, such as tcp_pacing_check, where the
> corresponding hrtimer callbacks all return HRTIMER_NORESTART. Since our scenario returns
> HRTIMER_RESTART, this can lead to many troublesome issues. The fundamental reason is that
> if HRTIMER_RESTART is returned, the hrtimer module will enqueue the hrtimer after the
> callback returns, which leads to exiting the protection of our sk_receive_queue lock.
>
> Returning to the functionality here, if we really want to update the hrtimer's timeout
> outside of the timer callback, there are two key points to note:
>
> 1. Accurately knowing whether the current context is a timer callback or tpacket_rcv.
> 2. How to update the hrtimer's timeout in a non-timer callback scenario.
>
> To start with the first point, it has already been explained in previous emails that
> executing hrtimer_forward outside of a timer callback is not allowed. Therefore, we
> must accurately determine whether we are in a timer callback; only in that context can
> we use the hrtimer_forward function to update.
> In the original code, since the same _prb_refresh_rx_retire_blk_timer function was called,
> distinguishing between contexts required code restructuring. Now that this patch removes
> the _prb_refresh_rx_retire_blk_timer function, achieving this accurate distinction is not
> too difficult.
> The key issue is the second point. If we are not inside the hrtimer's callback, we cannot
> use hrtimer_forward to update the timeout.
> So what other interface can we use? You might
> suggest using hrtimer_start, but fundamentally, hrtimer_start cannot be called if it has
> already been started previously. Therefore, wouldn’t you need to add hrtimer_cancel to
> confirm that the hrtimer has been canceled? Once hrtimer_cancel is added, there will also
> be scenarios where it is restarted, which means we need to consider the concurrent
> scenario when the socket exits and also calls hrtimer_cancel. This might require adding
> logic for that concurrency scenario, and you might even need to reintroduce the
> delete_blk_timer variable to indicate whether the packet_release operation has been
> triggered so that the hrtimer does not restart in the tpacket_rcv scenario.
>
> In fact, in a previous v7 version, I proposed a change that I personally thought was
> quite good, which can be seen here:
> https://lore.kernel.org/netdev/20250822132051.266787-1-jackzxcui1989@163.com/. However,
> this change introduced an additional variable and more logic. Willem also pointed out
> that the added complexity to avoid a non-problematic issue was unnecessary.

Admittedly it's a bit complex.

>
> As mentioned in Changes in v8:
>   The only special case is when prb_open_block is called from tpacket_rcv.
>   That would set the timeout further into the future than the already queued
>   timer. An earlier timeout is not problematic. No need to add complexity to
>   avoid that.

It'd be better to highlight this in the commit message as well to
avoid further repeat questions from others. It's an obvious change in
this patch :)

In the meanwhile, you should adjust the comments above
prb_retire_rx_blk_timer_expired() and prb_open_block() because you
removed the refresh logic there.

>
> It is not problematic, as Willem point it out in
> https://lore.kernel.org/netdev/willemdebruijn.kernel.2d7599ee951fd@gmail.com/
>
>
> In the end:
>
> So, if you agree with the current changes in v10 and do not wish to add the timeout
> setting under tpacket_rcv, that’s fine.
> If you do not agree, then the only alternative I can think of is to use a combination
> of hrtimer_cancel and hrtimer_start under prb_open_block, and we would also need to
> reintroduce the delete_blk_timer variable to help determine whether the hrtimer was
> canceled due to the packet_release behavior. If we really want to make this change,
> it does add quite a bit of logic, and we would also need Willem's agreement.

Thanks, I think I know enough about this long background. Let's stick
with the approach you provided in the next respin :)

Thanks,
Jason

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-09-05  6:03 ` Jason Xing
@ 2025-09-05  6:45   ` Jason Xing
  2025-09-05 16:16     ` Willem de Bruijn
  0 siblings, 1 reply; 19+ messages in thread
From: Jason Xing @ 2025-09-05  6:45 UTC (permalink / raw)
  To: Xin Zhao
  Cc: willemdebruijn.kernel, edumazet, ferenc, davem, kuba, pabeni,
	horms, netdev, linux-kernel

On Fri, Sep 5, 2025 at 2:03 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
>
> On Fri, Sep 5, 2025 at 12:01 PM Xin Zhao <jackzxcui1989@163.com> wrote:
> >
> > On Thu, Sep 4, 2025 at 11:26 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:
> >
> > > > In the description of [PATCH net-next v10 0/2] net: af_packet: optimize retire operation:
> > > >
> > > > Changes in v8:
> > > > - Delete delete_blk_timer field, as suggested by Willem de Bruijn,
> > > >   hrtimer_cancel will check and wait until the timer callback return and ensure
> > > >   enter enter callback again;
> > >
> > > I see the reason now :)
> > >
> > > Please know that the history changes through versions will finally be
> > > removed, only the official message that will be kept in the git. So
> > > this kind of change, I think, should be clarified officially since
> > > you're removing a structure member. Adding more descriptions will be
> > > helpful to readers in the future. Thank you.
> >
> > I will add some more information to the commit message of this 2/2 PATCH.
> >
> >
> >
> > > > Consider the following timing sequence:
> > > > timer   cpu0 (softirq context, hrtimer timeout)                cpu1 (process context)
> > > > 0       hrtimer_run_softirq
> > > > 1         __hrtimer_run_queues
> > > > 2           __run_hrtimer
> > > > 3             prb_retire_rx_blk_timer_expired
> > > > 4               spin_lock(&po->sk.sk_receive_queue.lock);
> > > > 5               _prb_refresh_rx_retire_blk_timer
> > > > 6                 hrtimer_forward_now
> > > > 7               spin_unlock(&po->sk.sk_receive_queue.lock)
> > > > 8             raw_spin_lock_irq(&cpu_base->lock);              tpacket_rcv
> > > > 9             enqueue_hrtimer                                    spin_lock(&sk->sk_receive_queue.lock);
> > > > 10                                                               packet_current_rx_frame
> > > > 11                                                                 __packet_lookup_frame_in_block
> > > > 12            finish enqueue_hrtimer                                 prb_open_block
> > > > 13                                                                     _prb_refresh_rx_retire_blk_timer
> > > > 14                                                                       hrtimer_is_queued(&pkc->retire_blk_timer) == true
> > > > 15                                                                       hrtimer_forward_now
> > > > 16                                                                         WARN_ON
> > > > On cpu0 in the timing sequence above, enqueue_hrtimer is not protected by sk_receive_queue.lock,
> > > > while the hrtimer_forward_now is not protected by raw_spin_lock_irq(&cpu_base->lock).
> > > >
> > > > In my previous email, I provided an explanation. As a supplement, I would
> > > > like to reiterate a paragraph from my earlier response to Willem.
> > > > The point is that when the hrtimer is in the enqueued state, you cannot
> > >
> > > How about tring hrtimer_is_queued() beforehand?
> > >
> > > IIUC, with this patch applied, we will lose the opportunity to refresh
> > > the timer when the lookup function (in the above path I mentioned)
> > > gets called compared to before. If the packet socket tries to look up
> > > a new block and it doesn't update its expiry time, the timer will soon
> > > wake up. Does it sound unreasonable?
> >
> >
> > I actually pointed out the issue with the timeout setting in a previous email:
> > https://lore.kernel.org/netdev/20250826030328.878001-1-jackzxcui1989@163.com/.
> >
> > Regarding the method you mentioned, using hrtimer_is_queued to assist in judgment, I had
> > discussed this extensively with Willem in previous emails, and the conclusion was that
> > it is not feasible. The reason is that in our scenario, the hrtimer always returns
> > HRTIMER_RESTART, unlike the places you pointed out, such as tcp_pacing_check, where the
> > corresponding hrtimer callbacks all return HRTIMER_NORESTART. Since our scenario returns
> > HRTIMER_RESTART, this can lead to many troublesome issues. The fundamental reason is that
> > if HRTIMER_RESTART is returned, the hrtimer module will enqueue the hrtimer after the
> > callback returns, which leads to exiting the protection of our sk_receive_queue lock.
> >
> > Returning to the functionality here, if we really want to update the hrtimer's timeout
> > outside of the timer callback, there are two key points to note:
> >
> > 1. Accurately knowing whether the current context is a timer callback or tpacket_rcv.
> > 2. How to update the hrtimer's timeout in a non-timer callback scenario.
> >
> > To start with the first point, it has already been explained in previous emails that
> > executing hrtimer_forward outside of a timer callback is not allowed. Therefore, we
> > must accurately determine whether we are in a timer callback; only in that context can
> > we use the hrtimer_forward function to update.
> > In the original code, since the same _prb_refresh_rx_retire_blk_timer function was called,
> > distinguishing between contexts required code restructuring. Now that this patch removes
> > the _prb_refresh_rx_retire_blk_timer function, achieving this accurate distinction is not
> > too difficult.
> > The key issue is the second point. If we are not inside the hrtimer's callback, we cannot
> > use hrtimer_forward to update the timeout.
> > So what other interface can we use? You might
> > suggest using hrtimer_start, but fundamentally, hrtimer_start cannot be called if it has
> > already been started previously. Therefore, wouldn’t you need to add hrtimer_cancel to
> > confirm that the hrtimer has been canceled? Once hrtimer_cancel is added, there will also
> > be scenarios where it is restarted, which means we need to consider the concurrent
> > scenario when the socket exits and also calls hrtimer_cancel. This might require adding
> > logic for that concurrency scenario, and you might even need to reintroduce the
> > delete_blk_timer variable to indicate whether the packet_release operation has been
> > triggered so that the hrtimer does not restart in the tpacket_rcv scenario.
> >
> > In fact, in a previous v7 version, I proposed a change that I personally thought was
> > quite good, which can be seen here:
> > https://lore.kernel.org/netdev/20250822132051.266787-1-jackzxcui1989@163.com/. However,
> > this change introduced an additional variable and more logic. Willem also pointed out
> > that the added complexity to avoid a non-problematic issue was unnecessary.
>
> Admittedly it's a bit complex.
>
> >
> > As mentioned in Changes in v8:
> >   The only special case is when prb_open_block is called from tpacket_rcv.
> >   That would set the timeout further into the future than the already queued
> >   timer. An earlier timeout is not problematic. No need to add complexity to
> >   avoid that.
>
> It'd be better to highlight this in the commit message as well to
> avoid further repeat questions from others. It's an obvious change in
> this patch :)

BTW, I have to emphasize that after this patch, the hrtimer will run
periodically and unconditionally. As far as I know, it's not possible
to run hundreds and thousands packet sockets in production, so it
might not be a huge problem. Or else, numerous timers are likely to
cause spikes/jitters, especially when timeout is very small (which can
be 1ms timeout for HZ=1000 system). It would be great if you state the
possible side effects in the next version.

Willem, any thoughts on this?

Thanks,
Jason

^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
@ 2025-09-05 14:47 Xin Zhao
  2025-09-05 16:16 ` Willem de Bruijn
  0 siblings, 1 reply; 19+ messages in thread
From: Xin Zhao @ 2025-09-05 14:47 UTC (permalink / raw)
  To: kerneljasonxing, willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

On Fri, Sep 5, 2025 at 14:45 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:

> BTW, I have to emphasize that after this patch, the hrtimer will run
> periodically and unconditionally. As far as I know, it's not possible
> to run hundreds and thousands packet sockets in production, so it
> might not be a huge problem. Or else, numerous timers are likely to
> cause spikes/jitters, especially when timeout is very small (which can
> be 1ms timeout for HZ=1000 system). It would be great if you state the
> possible side effects in the next version.

The original logic actually involves an unconditional restart in the timer's
callback. You might be suggesting that if packets come in particularly fast,
the original logic would reset the timeout when opening a new block in
tpacket_rcv, so the timeout does not expire immediately. However, if packets
arrive very quickly, it will also lead to frequent timeout resets, which can
waste CPU resources.
I will emphasize in the comments that the current hrtimer expiration logic
is unconditional and periodic.


Thanks
Xin Zhao


^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-09-05  6:45   ` Jason Xing
@ 2025-09-05 16:16     ` Willem de Bruijn
  0 siblings, 0 replies; 19+ messages in thread
From: Willem de Bruijn @ 2025-09-05 16:16 UTC (permalink / raw)
  To: Jason Xing, Xin Zhao
  Cc: willemdebruijn.kernel, edumazet, ferenc, davem, kuba, pabeni,
	horms, netdev, linux-kernel

Jason Xing wrote:
> On Fri, Sep 5, 2025 at 2:03 PM Jason Xing <kerneljasonxing@gmail.com> wrote:
> >
> > On Fri, Sep 5, 2025 at 12:01 PM Xin Zhao <jackzxcui1989@163.com> wrote:
> > >
> > > On Thu, Sep 4, 2025 at 11:26 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:
> > >
> > > > > In the description of [PATCH net-next v10 0/2] net: af_packet: optimize retire operation:
> > > > >
> > > > > Changes in v8:
> > > > > - Delete delete_blk_timer field, as suggested by Willem de Bruijn,
> > > > >   hrtimer_cancel will check and wait until the timer callback return and ensure
> > > > >   enter enter callback again;
> > > >
> > > > I see the reason now :)
> > > >
> > > > Please know that the history changes through versions will finally be
> > > > removed, only the official message that will be kept in the git. So
> > > > this kind of change, I think, should be clarified officially since
> > > > you're removing a structure member. Adding more descriptions will be
> > > > helpful to readers in the future. Thank you.
> > >
> > > I will add some more information to the commit message of this 2/2 PATCH.
> > >
> > >
> > >
> > > > > Consider the following timing sequence:
> > > > > timer   cpu0 (softirq context, hrtimer timeout)                cpu1 (process context)
> > > > > 0       hrtimer_run_softirq
> > > > > 1         __hrtimer_run_queues
> > > > > 2           __run_hrtimer
> > > > > 3             prb_retire_rx_blk_timer_expired
> > > > > 4               spin_lock(&po->sk.sk_receive_queue.lock);
> > > > > 5               _prb_refresh_rx_retire_blk_timer
> > > > > 6                 hrtimer_forward_now
> > > > > 7               spin_unlock(&po->sk.sk_receive_queue.lock)
> > > > > 8             raw_spin_lock_irq(&cpu_base->lock);              tpacket_rcv
> > > > > 9             enqueue_hrtimer                                    spin_lock(&sk->sk_receive_queue.lock);
> > > > > 10                                                               packet_current_rx_frame
> > > > > 11                                                                 __packet_lookup_frame_in_block
> > > > > 12            finish enqueue_hrtimer                                 prb_open_block
> > > > > 13                                                                     _prb_refresh_rx_retire_blk_timer
> > > > > 14                                                                       hrtimer_is_queued(&pkc->retire_blk_timer) == true
> > > > > 15                                                                       hrtimer_forward_now
> > > > > 16                                                                         WARN_ON
> > > > > On cpu0 in the timing sequence above, enqueue_hrtimer is not protected by sk_receive_queue.lock,
> > > > > while the hrtimer_forward_now is not protected by raw_spin_lock_irq(&cpu_base->lock).
> > > > >
> > > > > In my previous email, I provided an explanation. As a supplement, I would
> > > > > like to reiterate a paragraph from my earlier response to Willem.
> > > > > The point is that when the hrtimer is in the enqueued state, you cannot
> > > >
> > > > How about tring hrtimer_is_queued() beforehand?
> > > >
> > > > IIUC, with this patch applied, we will lose the opportunity to refresh
> > > > the timer when the lookup function (in the above path I mentioned)
> > > > gets called compared to before. If the packet socket tries to look up
> > > > a new block and it doesn't update its expiry time, the timer will soon
> > > > wake up. Does it sound unreasonable?
> > >
> > >
> > > I actually pointed out the issue with the timeout setting in a previous email:
> > > https://lore.kernel.org/netdev/20250826030328.878001-1-jackzxcui1989@163.com/.
> > >
> > > Regarding the method you mentioned, using hrtimer_is_queued to assist in judgment, I had
> > > discussed this extensively with Willem in previous emails, and the conclusion was that
> > > it is not feasible. The reason is that in our scenario, the hrtimer always returns
> > > HRTIMER_RESTART, unlike the places you pointed out, such as tcp_pacing_check, where the
> > > corresponding hrtimer callbacks all return HRTIMER_NORESTART. Since our scenario returns
> > > HRTIMER_RESTART, this can lead to many troublesome issues. The fundamental reason is that
> > > if HRTIMER_RESTART is returned, the hrtimer module will enqueue the hrtimer after the
> > > callback returns, which leads to exiting the protection of our sk_receive_queue lock.
> > >
> > > Returning to the functionality here, if we really want to update the hrtimer's timeout
> > > outside of the timer callback, there are two key points to note:
> > >
> > > 1. Accurately knowing whether the current context is a timer callback or tpacket_rcv.
> > > 2. How to update the hrtimer's timeout in a non-timer callback scenario.
> > >
> > > To start with the first point, it has already been explained in previous emails that
> > > executing hrtimer_forward outside of a timer callback is not allowed. Therefore, we
> > > must accurately determine whether we are in a timer callback; only in that context can
> > > we use the hrtimer_forward function to update.
> > > In the original code, since the same _prb_refresh_rx_retire_blk_timer function was called,
> > > distinguishing between contexts required code restructuring. Now that this patch removes
> > > the _prb_refresh_rx_retire_blk_timer function, achieving this accurate distinction is not
> > > too difficult.
> > > The key issue is the second point. If we are not inside the hrtimer's callback, we cannot
> > > use hrtimer_forward to update the timeout.
> > > So what other interface can we use? You might
> > > suggest using hrtimer_start, but fundamentally, hrtimer_start cannot be called if it has
> > > already been started previously. Therefore, wouldn’t you need to add hrtimer_cancel to
> > > confirm that the hrtimer has been canceled? Once hrtimer_cancel is added, there will also
> > > be scenarios where it is restarted, which means we need to consider the concurrent
> > > scenario when the socket exits and also calls hrtimer_cancel. This might require adding
> > > logic for that concurrency scenario, and you might even need to reintroduce the
> > > delete_blk_timer variable to indicate whether the packet_release operation has been
> > > triggered so that the hrtimer does not restart in the tpacket_rcv scenario.
> > >
> > > In fact, in a previous v7 version, I proposed a change that I personally thought was
> > > quite good, which can be seen here:
> > > https://lore.kernel.org/netdev/20250822132051.266787-1-jackzxcui1989@163.com/. However,
> > > this change introduced an additional variable and more logic. Willem also pointed out
> > > that the added complexity to avoid a non-problematic issue was unnecessary.
> >
> > Admittedly it's a bit complex.
> >
> > >
> > > As mentioned in Changes in v8:
> > >   The only special case is when prb_open_block is called from tpacket_rcv.
> > >   That would set the timeout further into the future than the already queued
> > >   timer. An earlier timeout is not problematic. No need to add complexity to
> > >   avoid that.
> >
> > It'd be better to highlight this in the commit message as well to
> > avoid further repeat questions from others. It's an obvious change in
> > this patch :)
> 
> BTW, I have to emphasize that after this patch, the hrtimer will run
> periodically and unconditionally. As far as I know, it's not possible
> to run hundreds and thousands packet sockets in production, so it
> might not be a huge problem.

In tcp each sk has its own hrtimers (tcp_init_xmit_timers). 

> Or else, numerous timers are likely to
> cause spikes/jitters, especially when timeout is very small (which can
> be 1ms timeout for HZ=1000 system). It would be great if you state the
> possible side effects in the next version.
> 
> Willem, any thoughts on this?

Essentially that is what the existing timer also did. There was no
path where the timer was being disabled.

It could be continuously delayed from tpacket_rcv if that happened
came before the timer fires each time. That is no longer the case
with this change. It is probably good to call that out explicitly.



^ permalink raw reply	[flat|nested] 19+ messages in thread

* Re: [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation
  2025-09-05 14:47 Xin Zhao
@ 2025-09-05 16:16 ` Willem de Bruijn
  0 siblings, 0 replies; 19+ messages in thread
From: Willem de Bruijn @ 2025-09-05 16:16 UTC (permalink / raw)
  To: Xin Zhao, kerneljasonxing, willemdebruijn.kernel, edumazet,
	ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

Xin Zhao wrote:
> On Fri, Sep 5, 2025 at 14:45 +0800 Jason Xing <kerneljasonxing@gmail.com> wrote:
> 
> > BTW, I have to emphasize that after this patch, the hrtimer will run
> > periodically and unconditionally. As far as I know, it's not possible
> > to run hundreds and thousands packet sockets in production, so it
> > might not be a huge problem. Or else, numerous timers are likely to
> > cause spikes/jitters, especially when timeout is very small (which can
> > be 1ms timeout for HZ=1000 system). It would be great if you state the
> > possible side effects in the next version.
> 
> The original logic actually involves an unconditional restart in the timer's
> callback. You might be suggesting that if packets come in particularly fast,
> the original logic would reset the timeout when opening a new block in
> tpacket_rcv, so the timeout does not expire immediately. However, if packets
> arrive very quickly, it will also lead to frequent timeout resets, which can
> waste CPU resources.
> I will emphasize in the comments that the current hrtimer expiration logic
> is unconditional and periodic.

+1 that should suffice


^ permalink raw reply	[flat|nested] 19+ messages in thread

end of thread, other threads:[~2025-09-05 16:16 UTC | newest]

Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-09-01  2:27 [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the retire operation Xin Zhao
2025-09-01 13:35 ` Willem de Bruijn
  -- strict thread matches above, loose matches on Subject: below --
2025-09-05 14:47 Xin Zhao
2025-09-05 16:16 ` Willem de Bruijn
2025-09-05  4:00 Xin Zhao
2025-09-05  6:03 ` Jason Xing
2025-09-05  6:45   ` Jason Xing
2025-09-05 16:16     ` Willem de Bruijn
2025-09-04 14:59 Xin Zhao
2025-09-05  0:09 ` Jason Xing
2025-09-03 17:07 Xin Zhao
2025-09-04  3:26 ` Jason Xing
2025-09-03 16:17 Xin Zhao
2025-09-04  2:50 ` Jason Xing
2025-09-01 14:16 Xin Zhao
2025-08-31 10:08 [PATCH net-next v10 0/2] net: af_packet: optimize " Xin Zhao
2025-08-31 10:08 ` [PATCH net-next v10 2/2] net: af_packet: Use hrtimer to do the " Xin Zhao
2025-09-01  1:21   ` Willem de Bruijn
2025-09-02 15:43   ` Jason Xing
2025-09-02 16:42     ` Jason Xing

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).