[PATCH v2] net: af_packet: Use hrtimer to do the retire operation

linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH v2] net: af_packet: Use hrtimer to do the retire operation
@ 2025-08-15  4:41 Xin Zhao
  2025-08-15 10:12 ` Willem de Bruijn
  0 siblings, 1 reply; 5+ messages in thread
From: Xin Zhao @ 2025-08-15  4:41 UTC (permalink / raw)
  To: willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel, Xin Zhao

In a system with high real-time requirements, the timeout mechanism of
ordinary timers with jiffies granularity is insufficient to meet the
demands for real-time performance. Meanwhile, the optimization of CPU
usage with af_packet is quite significant. Use hrtimer instead of timer
to help compensate for the shortcomings in real-time performance.
In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time
enough, with fluctuations reaching over 8ms (on a system with HZ=250).
This is unacceptable in some high real-time systems that require timely
processing of network packets. By replacing it with hrtimer, if a timeout
of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
3 ms.

Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
---
 net/packet/af_packet.c | 19 ++++++++++---------
 net/packet/internal.h  |  3 +--
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index bc438d0d9..c4746a9cb 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -203,7 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
 static int prb_queue_frozen(struct tpacket_kbdq_core *);
 static void prb_open_block(struct tpacket_kbdq_core *,
 		struct tpacket_block_desc *);
-static void prb_retire_rx_blk_timer_expired(struct timer_list *);
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
 static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
 static void prb_clear_rxhash(struct tpacket_kbdq_core *,
@@ -581,7 +581,7 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
 
 static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
 {
-	timer_delete_sync(&pkc->retire_blk_timer);
+	hrtimer_cancel(&pkc->retire_blk_timer);
 }
 
 static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
@@ -603,9 +603,10 @@ static void prb_setup_retire_blk_timer(struct packet_sock *po)
 	struct tpacket_kbdq_core *pkc;
 
 	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
-	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
-		    0);
-	pkc->retire_blk_timer.expires = jiffies;
+	hrtimer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
+		      CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
+	hrtimer_start(&pkc->retire_blk_timer, ms_to_ktime(pkc->retire_blk_tov),
+		      HRTIMER_MODE_REL_SOFT);
 }
 
 static int prb_calc_retire_blk_tmo(struct packet_sock *po,
@@ -676,7 +677,6 @@ static void init_prb_bdqc(struct packet_sock *po,
 	else
 		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
 						req_u->req3.tp_block_size);
-	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
 	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
 	rwlock_init(&p1->blk_fill_in_prog_lock);
 
@@ -691,8 +691,8 @@ static void init_prb_bdqc(struct packet_sock *po,
  */
 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
 {
-	mod_timer(&pkc->retire_blk_timer,
-			jiffies + pkc->tov_in_jiffies);
+	hrtimer_set_expires(&pkc->retire_blk_timer,
+			    ktime_add(ktime_get(), ms_to_ktime(pkc->retire_blk_tov)));
 	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
 }
 
@@ -719,7 +719,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
  * prb_calc_retire_blk_tmo() calculates the tmo.
  *
  */
-static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
+static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
 {
 	struct packet_sock *po =
 		timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
@@ -790,6 +790,7 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
 
 out:
 	spin_unlock(&po->sk.sk_receive_queue.lock);
+	return HRTIMER_RESTART;
 }
 
 static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
diff --git a/net/packet/internal.h b/net/packet/internal.h
index 1e743d031..9812feb3d 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -47,10 +47,9 @@ struct tpacket_kbdq_core {
 
 	unsigned short  retire_blk_tov;
 	unsigned short  version;
-	unsigned long	tov_in_jiffies;
 
 	/* timer to retire an outstanding block */
-	struct timer_list retire_blk_timer;
+	struct hrtimer  retire_blk_timer;
 };
 
 struct pgv {
-- 
2.34.1


^ permalink raw reply related	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] net: af_packet: Use hrtimer to do the retire operation
  2025-08-15  4:41 Xin Zhao
@ 2025-08-15 10:12 ` Willem de Bruijn
  0 siblings, 0 replies; 5+ messages in thread
From: Willem de Bruijn @ 2025-08-15 10:12 UTC (permalink / raw)
  To: Xin Zhao, willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel, Xin Zhao

Xin Zhao wrote:
> In a system with high real-time requirements, the timeout mechanism of
> ordinary timers with jiffies granularity is insufficient to meet the
> demands for real-time performance. Meanwhile, the optimization of CPU
> usage with af_packet is quite significant. Use hrtimer instead of timer
> to help compensate for the shortcomings in real-time performance.
> In HZ=100 or HZ=250 system, the update of TP_STATUS_USER is not real-time
> enough, with fluctuations reaching over 8ms (on a system with HZ=250).
> This is unacceptable in some high real-time systems that require timely
> processing of network packets. By replacing it with hrtimer, if a timeout
> of 2ms is set, the update of TP_STATUS_USER can be stabilized to within
> 3 ms.
> 
> Signed-off-by: Xin Zhao <jackzxcui1989@163.com>

Please clearly label PATCH net-next and include a changelog and link
to previous versions.

See also other recently sent patches and
https://www.kernel.org/doc/html/latest/process/maintainer-netdev.html
https://docs.kernel.org/process/submitting-patches.html

> ---
>  net/packet/af_packet.c | 19 ++++++++++---------
>  net/packet/internal.h  |  3 +--
>  2 files changed, 11 insertions(+), 11 deletions(-)
> 
> diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
> index bc438d0d9..c4746a9cb 100644
> --- a/net/packet/af_packet.c
> +++ b/net/packet/af_packet.c
> @@ -203,7 +203,7 @@ static void prb_retire_current_block(struct tpacket_kbdq_core *,
>  static int prb_queue_frozen(struct tpacket_kbdq_core *);
>  static void prb_open_block(struct tpacket_kbdq_core *,
>  		struct tpacket_block_desc *);
> -static void prb_retire_rx_blk_timer_expired(struct timer_list *);
> +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *);
>  static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
>  static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
>  static void prb_clear_rxhash(struct tpacket_kbdq_core *,
> @@ -581,7 +581,7 @@ static __be16 vlan_get_protocol_dgram(const struct sk_buff *skb)
>  
>  static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
>  {
> -	timer_delete_sync(&pkc->retire_blk_timer);
> +	hrtimer_cancel(&pkc->retire_blk_timer);
>  }
>  
>  static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
> @@ -603,9 +603,10 @@ static void prb_setup_retire_blk_timer(struct packet_sock *po)
>  	struct tpacket_kbdq_core *pkc;
>  
>  	pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
> -	timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> -		    0);
> -	pkc->retire_blk_timer.expires = jiffies;
> +	hrtimer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
> +		      CLOCK_MONOTONIC, HRTIMER_MODE_REL_SOFT);
> +	hrtimer_start(&pkc->retire_blk_timer, ms_to_ktime(pkc->retire_blk_tov),
> +		      HRTIMER_MODE_REL_SOFT);
>  }
>  
>  static int prb_calc_retire_blk_tmo(struct packet_sock *po,
> @@ -676,7 +677,6 @@ static void init_prb_bdqc(struct packet_sock *po,
>  	else
>  		p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
>  						req_u->req3.tp_block_size);
> -	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);

Since the hrtimer API takes ktime, and there is no other user for
retire_blk_tov, remove that too and instead have interval_ktime.

>  	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
>  	rwlock_init(&p1->blk_fill_in_prog_lock);
>  
> @@ -691,8 +691,8 @@ static void init_prb_bdqc(struct packet_sock *po,
>   */
>  static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
>  {
> -	mod_timer(&pkc->retire_blk_timer,
> -			jiffies + pkc->tov_in_jiffies);
> +	hrtimer_set_expires(&pkc->retire_blk_timer,
> +			    ktime_add(ktime_get(), ms_to_ktime(pkc->retire_blk_tov)));

More common for HRTIMER_RESTART timers is hrtimer_forward_now.

>  	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
>  }
>  
> @@ -719,7 +719,7 @@ static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
>   * prb_calc_retire_blk_tmo() calculates the tmo.
>   *
>   */
> -static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
>  {
>  	struct packet_sock *po =
>  		timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
> @@ -790,6 +790,7 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
>  
>  out:
>  	spin_unlock(&po->sk.sk_receive_queue.lock);
> +	return HRTIMER_RESTART;

This always restart the timer. But that is not the current behavior.
Per prb_retire_rx_blk_timer_expired:

   * 1) We refresh the timer only when we open a block.
 
Look at the five different paths that can reach label out.

In particular, if the block is retired in this timer, and no new block
is available to be opened, no timer should be armed.

>  }
>  
>  static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
> diff --git a/net/packet/internal.h b/net/packet/internal.h
> index 1e743d031..9812feb3d 100644
> --- a/net/packet/internal.h
> +++ b/net/packet/internal.h
> @@ -47,10 +47,9 @@ struct tpacket_kbdq_core {
>  
>  	unsigned short  retire_blk_tov;
>  	unsigned short  version;
> -	unsigned long	tov_in_jiffies;
>  
>  	/* timer to retire an outstanding block */
> -	struct timer_list retire_blk_timer;
> +	struct hrtimer  retire_blk_timer;
>  };
>  
>  struct pgv {
> -- 
> 2.34.1
> 



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] net: af_packet: Use hrtimer to do the retire operation
@ 2025-08-15 17:08 Xin Zhao
  2025-08-16  9:33 ` Willem de Bruijn
  0 siblings, 1 reply; 5+ messages in thread
From: Xin Zhao @ 2025-08-15 17:08 UTC (permalink / raw)
  To: willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

On Fri, 2025-08-15 at 18:12 +0800, Willem wrote:

> Signed-off-by: Xin Zhao <jackzxcui1989@163.com>

> Please clearly label PATCH net-next and include a changelog and link
> to previous versions.
> 
> See also other recently sent patches and
> https://www.kernel.org/doc/html/latest/process/maintainer-netdev.html
> https://docs.kernel.org/process/submitting-patches.html
> 
> > ---

Dear Willem,

I will add the details in PATCH v3.

> > -	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> 
> Since the hrtimer API takes ktime, and there is no other user for
> retire_blk_tov, remove that too and instead have interval_ktime.
> 
> >  	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;

We cannot simply remove the retire_blk_tov field, because in net/packet/diag.c 
retire_blk_tov is being used in function pdiag_put_ring. Since there are still
people using it, I personally prefer not to remove this variable for now. If
you think it is still necessary, I can remove it later and adjust the logic in
diag.c accordingly, using ktime_to_ms to convert the ktime_t format value back
to the u32 type needed in the pdiag_put_ring function.

> > +	hrtimer_set_expires(&pkc->retire_blk_timer,
> > +			    ktime_add(ktime_get(), ms_to_ktime(pkc->retire_blk_tov)));
> 
> More common for HRTIMER_RESTART timers is hrtimer_forward_now.
> 
> >  	pkc->last_kactive_blk_num = pkc->kactive_blk_num;

As I mentioned in my previous response, we cannot use hrtimer_forward_now here
because the function _prb_refresh_rx_retire_blk_timer can be called not only
when the retire timer expires, but also when the kernel logic for receiving
network packets detects that a network packet has filled up a block and calls
prb_open_block to use the next block. This can lead to a WARN_ON being triggered
in hrtimer_forward_now when it checks if the timer has already been enqueued
(WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)).
I encountered this issue when I initially used hrtimer_forward_now. This is the
reason why the existing logic for the regular timer uses mod_timer instead of
add_timer, as mod_timer is designed to handle such scenarios. A relevant comment
in the mod_timer implementation states:
 * Note that if there are multiple unserialized concurrent users of the
 * same timer, then mod_timer() is the only safe way to modify the timeout,
 * since add_timer() cannot modify an already running timer.

> > +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
> >  {
> >  	struct packet_sock *po =
> >  		timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
> > @@ -790,6 +790,7 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> > 
> >  out:
> >  	spin_unlock(&po->sk.sk_receive_queue.lock);
> > +	return HRTIMER_RESTART;
> 
> This always restart the timer. But that is not the current behavior.
> Per prb_retire_rx_blk_timer_expired:
> 
>    * 1) We refresh the timer only when we open a block.
> 
> Look at the five different paths that can reach label out.
> 
> In particular, if the block is retired in this timer, and no new block
> is available to be opened, no timer should be armed.
> 
> >  }

I have sorted out the logic in this area; please take a look and see if it's correct.

We are discussing the conditions under which we should return HRTIMER_NORESTART. We only
need to focus on the three 'goto out' statements in this function (because if it don't
call 'goto out', it will definitely not skip the 'refresh_timer:' label, and if it don't
skip the refresh_timer label, it will definitely execute the _prb_refresh_rx_retire_blk_timer
function, which expects to return HRTIMER_RESTART):
Case 1:
  if (unlikely(pkc->delete_blk_timer))
    goto out;
  This case indicates that the hrtimer has already been stopped. In this situation, it 
  should return HRTIMER_NORESTART, and I will make this change in PATCH v3.
Case 2:
  if (!prb_dispatch_next_block(pkc, po))
    goto refresh_timer;
  else
    goto out;
  In this case, the execution will only reach the out label if prb_dispatch_next_block
  returns a non-zero value. If prb_dispatch_next_block returns a non-zero value, it must
  have executed prb_open_block, which in turn will call _prb_refresh_rx_retire_blk_timer
  to set the new timeout for the retire timer. Therefore, in this scenario, the hrtimer
  should return HRTIMER_RESTART.
Case 3:
  } else {
     ...
     prb_open_block(pkc, pbd);
     goto out;
  }
  This goto out clearly follows a call to prb_open_block, and as mentioned in the case 2,
  it will set a new timeout and expects the hrtimer to restart.
Based on the analysis above, I only need to modify the situation described in case 1 in
PATCH v3 to return HRTIMER_NORESTART. If there are any inaccuracies, please provide
further guidance.

Thanks
Xin Zhao

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] net: af_packet: Use hrtimer to do the retire operation
  2025-08-15 17:08 [PATCH v2] net: af_packet: Use hrtimer to do the retire operation Xin Zhao
@ 2025-08-16  9:33 ` Willem de Bruijn
  0 siblings, 0 replies; 5+ messages in thread
From: Willem de Bruijn @ 2025-08-16  9:33 UTC (permalink / raw)
  To: Xin Zhao, willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

Xin Zhao wrote:
> On Fri, 2025-08-15 at 18:12 +0800, Willem wrote:
> 
> > Signed-off-by: Xin Zhao <jackzxcui1989@163.com>
> 
> > Please clearly label PATCH net-next and include a changelog and link
> > to previous versions.
> > 
> > See also other recently sent patches and
> > https://www.kernel.org/doc/html/latest/process/maintainer-netdev.html
> > https://docs.kernel.org/process/submitting-patches.html
> > 
> > > ---
> 
> Dear Willem,
> 
> I will add the details in PATCH v3.
> 
> 
> > > -	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> > 
> > Since the hrtimer API takes ktime, and there is no other user for
> > retire_blk_tov, remove that too and instead have interval_ktime.
> > 
> > >  	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
> 
> We cannot simply remove the retire_blk_tov field, because in net/packet/diag.c 
> retire_blk_tov is being used in function pdiag_put_ring. Since there are still
> people using it, I personally prefer not to remove this variable for now. If
> you think it is still necessary, I can remove it later and adjust the logic in
> diag.c accordingly, using ktime_to_ms to convert the ktime_t format value back
> to the u32 type needed in the pdiag_put_ring function.

Yes, let's remove the unnecessary extra field.
 
> 
> > > +	hrtimer_set_expires(&pkc->retire_blk_timer,
> > > +			    ktime_add(ktime_get(), ms_to_ktime(pkc->retire_blk_tov)));
> > 
> > More common for HRTIMER_RESTART timers is hrtimer_forward_now.
> > 
> > >  	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
> 
> As I mentioned in my previous response, we cannot use hrtimer_forward_now here
> because the function _prb_refresh_rx_retire_blk_timer can be called not only
> when the retire timer expires, but also when the kernel logic for receiving
> network packets detects that a network packet has filled up a block and calls
> prb_open_block to use the next block. This can lead to a WARN_ON being triggered
> in hrtimer_forward_now when it checks if the timer has already been enqueued
> (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)).
> I encountered this issue when I initially used hrtimer_forward_now. This is the
> reason why the existing logic for the regular timer uses mod_timer instead of
> add_timer, as mod_timer is designed to handle such scenarios. A relevant comment
> in the mod_timer implementation states:
>  * Note that if there are multiple unserialized concurrent users of the
>  * same timer, then mod_timer() is the only safe way to modify the timeout,
>  * since add_timer() cannot modify an already running timer.

Please add a comment above the call to hrtimer_set_expires and/or in
the commit message. As this is non-obvious and someone may easily
later miss that and update.

So mod_timer can also work as add_timer.

But for hrtimer, is it safe for an hrtimer_setup call to run while a
timer is already queued? And same for hrtimer_start.

> 
> > > +static enum hrtimer_restart prb_retire_rx_blk_timer_expired(struct hrtimer *t)
> > >  {
> > >  	struct packet_sock *po =
> > >  		timer_container_of(po, t, rx_ring.prb_bdqc.retire_blk_timer);
> > > @@ -790,6 +790,7 @@ static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
> > > 
> > >  out:
> > >  	spin_unlock(&po->sk.sk_receive_queue.lock);
> > > +	return HRTIMER_RESTART;
> > 
> > This always restart the timer. But that is not the current behavior.
> > Per prb_retire_rx_blk_timer_expired:
> > 
> >    * 1) We refresh the timer only when we open a block.
> > 
> > Look at the five different paths that can reach label out.
> > 
> > In particular, if the block is retired in this timer, and no new block
> > is available to be opened, no timer should be armed.
> > 
> > >  }
> 
> I have sorted out the logic in this area; please take a look and see if it's correct.
> 
> We are discussing the conditions under which we should return HRTIMER_NORESTART. We only
> need to focus on the three 'goto out' statements in this function (because if it don't
> call 'goto out', it will definitely not skip the 'refresh_timer:' label, and if it don't
> skip the refresh_timer label, it will definitely execute the _prb_refresh_rx_retire_blk_timer
> function, which expects to return HRTIMER_RESTART):
> Case 1:
>   if (unlikely(pkc->delete_blk_timer))
>     goto out;
>   This case indicates that the hrtimer has already been stopped. In this situation, it 
>   should return HRTIMER_NORESTART, and I will make this change in PATCH v3.
> Case 2:
>   if (!prb_dispatch_next_block(pkc, po))
>     goto refresh_timer;
>   else
>     goto out;
>   In this case, the execution will only reach the out label if prb_dispatch_next_block
>   returns a non-zero value. If prb_dispatch_next_block returns a non-zero value, it must
>   have executed prb_open_block, which in turn will call _prb_refresh_rx_retire_blk_timer
>   to set the new timeout for the retire timer. Therefore, in this scenario, the hrtimer
>   should return HRTIMER_RESTART.

Above I am talking about this case, where the hrtimer is reinitialized
and started in _prb_refresh_rx_retire_blk_timer and after that also
restarts itself with HRTIMER_RESTART.

> Case 3:
>   } else {
>      ...
>      prb_open_block(pkc, pbd);
>      goto out;
>   }
>   This goto out clearly follows a call to prb_open_block, and as mentioned in the case 2,
>   it will set a new timeout and expects the hrtimer to restart.
> Based on the analysis above, I only need to modify the situation described in case 1 in
> PATCH v3 to return HRTIMER_NORESTART. If there are any inaccuracies, please provide
> further guidance.
> 
> 
> Thanks
> Xin Zhao
> 



^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [PATCH v2] net: af_packet: Use hrtimer to do the retire operation
@ 2025-08-16 16:42 Xin Zhao
  0 siblings, 0 replies; 5+ messages in thread
From: Xin Zhao @ 2025-08-16 16:42 UTC (permalink / raw)
  To: willemdebruijn.kernel, edumazet, ferenc
  Cc: davem, kuba, pabeni, horms, netdev, linux-kernel

On Sat, 2025-08-16 at 17:33 +0800, Willem wrote:

> > > > -	p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
> > > 
> > > Since the hrtimer API takes ktime, and there is no other user for
> > > retire_blk_tov, remove that too and instead have interval_ktime.
> > > 
> > > >  	p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
> > 
> > We cannot simply remove the retire_blk_tov field, because in net/packet/diag.c 
> > retire_blk_tov is being used in function pdiag_put_ring. Since there are still
> > people using it, I personally prefer not to remove this variable for now. If
> > you think it is still necessary, I can remove it later and adjust the logic in
> > diag.c accordingly, using ktime_to_ms to convert the ktime_t format value back
> > to the u32 type needed in the pdiag_put_ring function.
> 
> Yes, let's remove the unnecessary extra field.
>  
> > 

OK, I will delete the 'retire_blk_tov' and add field named 'interval_ktime' instead.
And Accordingly, we also need to modify the logic in diag.c. The related changes are
as follows:
index 6ce1dcc28..c8f43e0c1 100644
--- a/net/packet/diag.c
+++ b/net/packet/diag.c
@@ -83,7 +83,7 @@ static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
        pdr.pdr_frame_nr = ring->frame_max + 1;

        if (ver > TPACKET_V2) {
-               pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
+               pdr.pdr_retire_tmo = ktime_to_ms(ring->prb_bdqc.interval_ktime);
                pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
                pdr.pdr_features = ring->prb_bdqc.feature_req_word;
        } else {


> > 
> > > > +	hrtimer_set_expires(&pkc->retire_blk_timer,
> > > > +			    ktime_add(ktime_get(), ms_to_ktime(pkc->retire_blk_tov)));
> > > 
> > > More common for HRTIMER_RESTART timers is hrtimer_forward_now.
> > > 
> > > >  	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
> > 
> > As I mentioned in my previous response, we cannot use hrtimer_forward_now here
> > because the function _prb_refresh_rx_retire_blk_timer can be called not only
> > when the retire timer expires, but also when the kernel logic for receiving
> > network packets detects that a network packet has filled up a block and calls
> > prb_open_block to use the next block. This can lead to a WARN_ON being triggered
> > in hrtimer_forward_now when it checks if the timer has already been enqueued
> > (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)).
> > I encountered this issue when I initially used hrtimer_forward_now. This is the
> > reason why the existing logic for the regular timer uses mod_timer instead of
> > add_timer, as mod_timer is designed to handle such scenarios. A relevant comment
> > in the mod_timer implementation states:
> >  * Note that if there are multiple unserialized concurrent users of the
> >  * same timer, then mod_timer() is the only safe way to modify the timeout,
> >  * since add_timer() cannot modify an already running timer.
> 
> Please add a comment above the call to hrtimer_set_expires and/or in
> the commit message. As this is non-obvious and someone may easily
> later miss that and update.


I will add a comment in PATCH v3 as below:
 static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
 {
-       mod_timer(&pkc->retire_blk_timer,
-                       jiffies + pkc->tov_in_jiffies);
+       /* We cannot use hrtimer_forward_now here because the function
+        * _prb_refresh_rx_retire_blk_timer can be called not only when
+        * the retire timer expires, but also when the kernel logic for
+        * receiving network packets detects that a network packet has
+        * filled up a block and calls prb_open_block to use the next
+        * block. This can lead to a WARN_ON being triggered in
+        * hrtimer_forward_now when it checks if the timer has already
+        * been enqueued.
+        */
+       hrtimer_set_expires(&pkc->retire_blk_timer,
+                           ktime_add(ktime_get(), pkc->interval_ktime));
        pkc->last_kactive_blk_num = pkc->kactive_blk_num;
 }



> 
> So mod_timer can also work as add_timer.
> 
> But for hrtimer, is it safe for an hrtimer_setup call to run while a
> timer is already queued? And same for hrtimer_start.


In the current patch, the hrtimer_setup and hrtimer_start will be called only when
the af_packet-mmap user call setsockopt, and then the following call chain:
  packet_setsockopt
    packet_set_ring
      init_prb_bdqc
        prb_setup_retire_blk_timer
          hrtimer_setup
          hrtimer_start
So once the socket setup, hrtimer_setup and hrtimer_start will never be called again.
However, I also tested calling hrtimer_setup and hrtimer_start within the hrtimer
expiration callback function, and it seems that it does not affect the normal
operation of the timer (the first line of the hrtimer_start comment states that
* hrtimer_start - (re)start an hrtimer, indicating that it can handle this scenario).
As you previously suggested, the hrtimer expiration callback functions typically use
hrtimer_set_expires or hrtimer_forward_now. In addition, repeatedly calling
hrtimer_setup and hrtimer_start within the hrtimer expiration callback function is
also a waste of CPU resources. Therefore, my current patch does not use hrtimer_start
within the hrtimer expiration callback function. I already change it in PATCH v1 as I
mentioned it in Changes in v1:
...
- Use hrtimer_set_expires instead of hrtimer_start_range_ns when retire timer needs update
  as suggested by Willem de Bruijn. Start the hrtimer in prb_setup_retire_blk_timer;
...
The implementation of _prb_refresh_rx_retire_blk_timer in the current patch is as follows:
static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
{
	/* We cannot use hrtimer_forward_now here because the function
	 * _prb_refresh_rx_retire_blk_timer can be called not only when
	 * the retire timer expires, but also when the kernel logic for
	 * receiving network packets detects that a network packet has
	 * filled up a block and calls prb_open_block to use the next
	 * block. This can lead to a WARN_ON being triggered in
	 * hrtimer_forward_now when it checks if the timer has already
	 * been enqueued.
	 */
	hrtimer_set_expires(&pkc->retire_blk_timer,
			    ktime_add(ktime_get(), pkc->interval_ktime));
	pkc->last_kactive_blk_num = pkc->kactive_blk_num;
}


Thanks
Xin Zhao


^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-08-16 16:43 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-08-15 17:08 [PATCH v2] net: af_packet: Use hrtimer to do the retire operation Xin Zhao
2025-08-16  9:33 ` Willem de Bruijn
  -- strict thread matches above, loose matches on Subject: below --
2025-08-16 16:42 Xin Zhao
2025-08-15  4:41 Xin Zhao
2025-08-15 10:12 ` Willem de Bruijn

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).