Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next v05 6/6] hinic3: Remove unneeded coalesce parameters
From: Fan Gong @ 2026-04-11  3:37 UTC (permalink / raw)
  To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
	Ioana Ciornei, Mohsin Bashir
  Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
	Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775711066.git.zhuyikai1@h-partners.com>

  Remove unneeded coalesce parameters in irq handling.

Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
 drivers/net/ethernet/huawei/hinic3/hinic3_irq.c | 6 +-----
 drivers/net/ethernet/huawei/hinic3/hinic3_rx.h  | 3 ---
 2 files changed, 1 insertion(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
index d3b3927b5408..42464c007174 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
@@ -156,13 +156,9 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
 	spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
 
 	err = hinic3_set_interrupt_cfg(nic_dev->hwdev, info);
-	if (err) {
+	if (err)
 		netdev_err(netdev,
 			   "Failed to modify moderation for Queue: %u\n", q_id);
-	} else {
-		nic_dev->rxqs[q_id].last_coalesc_timer_cfg = coalesc_timer_cfg;
-		nic_dev->rxqs[q_id].last_pending_limit = pending_limit;
-	}
 
 	return err;
 }
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
index c11d080408a7..2ab691ed11a9 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
@@ -111,9 +111,6 @@ struct hinic3_rxq {
 	dma_addr_t             cqe_start_paddr;
 
 	struct dim             dim;
-
-	u8                     last_coalesc_timer_cfg;
-	u8                     last_pending_limit;
 } ____cacheline_aligned;
 
 struct hinic3_dyna_rxq_res {
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net v4 1/2] flow_dissector: do not dissect PPPoE PFC frames
From: Qingfang Deng @ 2026-04-11  3:56 UTC (permalink / raw)
  To: Simon Horman
  Cc: linux-ppp, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Guillaume Nault, Wojciech Drewek, Tony Nguyen,
	linux-kernel, netdev, Paul Mackerras, Jaco Kroon, James Carlson,
	Marcin Szycik
In-Reply-To: <20260410171056.GD469338@kernel.org>

Hi,

On 4/11/2026 1:10 AM, Simon Horman wrote:
> On Fri, Apr 10, 2026 at 11:36:20AM +0800, Qingfang Deng wrote:
>> @@ -1361,7 +1376,7 @@ bool __skb_flow_dissect(const struct net *net,
>>   			struct pppoe_hdr hdr;
>>   			__be16 proto;
>>   		} *hdr, _hdr;
>> -		u16 ppp_proto;
>> +		__be16 ppp_proto;
> 
> I'm unclear of the relationship between changing the type of ppp_proto
> and the problem described in the patch description. And it
> is creating a log of churn in this patch. I suggest dropping it.

The intention is to restore the original behavior before the blamed 
commit. If you find it too verbose for a fix, I can drop it and then 
repost that part later to net-next.

>> @@ -1374,27 +1389,19 @@ bool __skb_flow_dissect(const struct net *net,
>>   			break;
>>   		}
>>   
>> -		/* least significant bit of the most significant octet
>> -		 * indicates if protocol field was compressed
>> -		 */
>> -		ppp_proto = ntohs(hdr->proto);
>> -		if (ppp_proto & 0x0100) {
>> -			ppp_proto = ppp_proto >> 8;
>> -			nhoff += PPPOE_SES_HLEN - 1;
>> -		} else {
>> -			nhoff += PPPOE_SES_HLEN;
>> -		}
> 
> Could we go for something like this?
> 
> 		ppp_proto = ntohs(hdr->proto);
> 		nhoff += PPPOE_SES_HLEN;
> 
> 		/* Explanation of what is going on */
> 		if (ppp_proto & 0x0100)
> 			ppp_proto = some invalid value like 0
>

I think it is redundant. ppp_proto_is_valid() already requires 
uncompressed frames.

>> +		ppp_proto = hdr->proto;
>> +		nhoff += PPPOE_SES_HLEN;
>>   
>> -		if (ppp_proto == PPP_IP) {
>> +		if (ppp_proto == htons(PPP_IP)) {
>>   			proto = htons(ETH_P_IP);
>>   			fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
>> -		} else if (ppp_proto == PPP_IPV6) {
>> +		} else if (ppp_proto == htons(PPP_IPV6)) {
>>   			proto = htons(ETH_P_IPV6);
>>   			fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
>> -		} else if (ppp_proto == PPP_MPLS_UC) {
>> +		} else if (ppp_proto == htons(PPP_MPLS_UC)) {
>>   			proto = htons(ETH_P_MPLS_UC);
>>   			fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
>> -		} else if (ppp_proto == PPP_MPLS_MC) {
>> +		} else if (ppp_proto == htons(PPP_MPLS_MC)) {
>>   			proto = htons(ETH_P_MPLS_MC);
>>   			fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
>>   		} else if (ppp_proto_is_valid(ppp_proto)) {

^ permalink raw reply

* [PATCH net-next] r8169: Use napi_schedule_irqoff()
From: Matt Vollrath @ 2026-04-11  4:16 UTC (permalink / raw)
  To: netdev; +Cc: Matt Vollrath

napi_schedule() masks hard interrupts while doing its work, which is
redundant when called from an interrupt handler where hard interrupts
are already masked. Use napi_schedule_irqoff() instead to bypass this
redundant masking. This is an optimization.

Tested on a Lenovo RTL8168h/8111h.

Signed-off-by: Matt Vollrath <tactii@gmail.com>
---
 drivers/net/ethernet/realtek/r8169_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 791277e750ba..4c0ad0de3410 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -4873,7 +4873,7 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
 		phy_mac_interrupt(tp->phydev);
 
 	rtl_irq_disable(tp);
-	napi_schedule(&tp->napi);
+	napi_schedule_irqoff(&tp->napi);
 out:
 	rtl_ack_events(tp, status);
 
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH bpf-next v2 0/3] Use kmalloc_nolock() universally in BPF local storage
From: patchwork-bot+netdevbpf @ 2026-04-11  4:30 UTC (permalink / raw)
  To: Amery Hung
  Cc: bpf, netdev, alexei.starovoitov, andrii, daniel, martin.lau,
	memxor, kernel-team
In-Reply-To: <20260411015419.114016-1-ameryhung@gmail.com>

Hello:

This series was applied to bpf/bpf-next.git (master)
by Alexei Starovoitov <ast@kernel.org>:

On Fri, 10 Apr 2026 18:54:15 -0700 you wrote:
> Socket local storage did not convert to use kmalloc_nolock() since there
> were observable performance degredation due to kfree_nolock() hitting the
> slow path and the lack of kfree_rcu()-like batching freeing. Now that
> these concern were addressed in slub, convert all remaining local storage
> flavors to use kmalloc_nolock().
> 
> v1 -> v2:
>   - Fix build (CI, Alexei)
> 
> [...]

Here is the summary with links:
  - [bpf-next,v2,1/3] selftests/bpf: Remove kmalloc tracing from local storage create bench
    https://git.kernel.org/bpf/bpf-next/c/78ee02a966ad
  - [bpf-next,v2,2/3] bpf: Use kmalloc_nolock() universally in local storage
    https://git.kernel.org/bpf/bpf-next/c/5063e7758899
  - [bpf-next,v2,3/3] bpf: Remove gfp_flags plumbing from bpf_local_storage_update()
    https://git.kernel.org/bpf/bpf-next/c/136deea435dc

You are awesome, thank you!
-- 
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html



^ permalink raw reply

* Re: [PATCH bpf-next v2 2/3] bpf: Use kmalloc_nolock() universally in local storage
From: Alexei Starovoitov @ 2026-04-11  4:39 UTC (permalink / raw)
  To: bot+bpf-ci
  Cc: Amery Hung, bpf, Network Development, Andrii Nakryiko,
	Daniel Borkmann, Martin KaFai Lau, Kumar Kartikeya Dwivedi,
	Kernel Team, Alexei Starovoitov, Eduard, Yonghong Song,
	Chris Mason, Ihor Solodrai
In-Reply-To: <efdb35c2d8151489fb031ad63ce8bd6000b8bbaa2bf5a523c927907399c93ab0@mail.kernel.org>

On Fri, Apr 10, 2026 at 7:36 PM <bot+bpf-ci@kernel.org> wrote:
>
>
> This allows value sizes up to ~65KB. Before this patch, socket and
> inode storage used bpf_map_kzalloc() (backed by regular kmalloc)
> which could handle those large sizes. After this patch, any
> elem_size above KMALLOC_MAX_CACHE_SIZE will silently fail: the map
> creation succeeds via bpf_local_storage_map_alloc_check() but every
> element allocation returns NULL.
>
> Should BPF_LOCAL_STORAGE_MAX_VALUE_SIZE be updated to use
> KMALLOC_MAX_CACHE_SIZE instead of KMALLOC_MAX_SIZE now that all
> storage types go through kmalloc_nolock()?
>
> Slava Imameev raised the same concern for task storage in
> https://lore.kernel.org/bpf/20260410014341.47043-1-slava.imameev@crowdstrike.com/

Right. Let's update it, but I don't think it's a regression.
On a loaded system kmalloc_large() rarely succeeds for order 2+.
That's why kmalloc_nolock() doesn't attempt to bridge that gap.
One or two contiguous physical pages is the best one can expect.
In early bpf days we picked KMALLOC_MAX_SIZE assuming that
it's a realistic max for kmalloc().
It turned out to be wishful thinking.
kmalloc_large concept should really be removed.
It deceives users into thinking that it's usable.

^ permalink raw reply

* Re: [PATCH net-next 4/5] net/sched: netem: add per-impairment extended statistics
From: Stephen Hemminger @ 2026-04-11  5:08 UTC (permalink / raw)
  To: Paolo Abeni
  Cc: netdev, Jamal Hadi Salim, Jiri Pirko, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Simon Horman, open list
In-Reply-To: <36e86b68-f7d9-4455-a0b5-613f717e670a@redhat.com>

On Thu, 9 Apr 2026 11:30:00 +0200
Paolo Abeni <pabeni@redhat.com> wrote:

> On 4/4/26 12:52 AM, Stephen Hemminger wrote:
> > diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> > index 66e8072f44df..fada10cb9b7b 100644
> > --- a/include/uapi/linux/pkt_sched.h
> > +++ b/include/uapi/linux/pkt_sched.h
> > @@ -569,6 +569,15 @@ struct tc_netem_gemodel {
> >  #define NETEM_DIST_SCALE	8192
> >  #define NETEM_DIST_MAX		16384
> >  
> > +struct tc_netem_xstats {
> > +	__u32	delayed;	/* packets delayed */
> > +	__u32	dropped;	/* packets dropped by loss model      */
> > +	__u32	corrupted;	/* packets with bit errors injected   */
> > +	__u32	duplicated;	/* duplicate packets generated        */
> > +	__u32	reordered;	/* packets sent out of order          */
> > +	__u32	ecn_marked;	/* packets ECN CE-marked (not dropped)*/
> > +};  
> 
> Sashiko notes that the counters size will be set in stone by the uAPI,
> and u32 can wraparound very quickly (especially for unconditional delay).
> 
> I see other qdiscs generally use __u32, but some have __u64 too, so I
> assume there are no architectural blocker to larger counter.
> 
> Could you please move use __u64 above?
> 
> Thanks,
> 
> Paolo
> 

Sure larger counters are fine, mostly just following the herd.
I assume don't need to need about 32 bit tearing when reading these?

^ permalink raw reply

* [PATTCH net v5 0/8] net/sched: netem bug fixes
From: Stephen Hemminger @ 2026-04-11  5:15 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger

These bugs were identified while using AI-assisted code review of
sch_netem.c to analyze the packet duplication re-entrancy problem
(CVE-2025-37890, CVE-2025-38001), which are addressed in a separate
series.

The review uncovered several additional issues:

- probability gaps in the 4-state Markov loss model where
  boundary values produce no state transition
- queue limit check not accounting for reordered packets
- PRNG reseeded on every tc change, breaking reproducibility
- the core dequeue re-entrancy issue with child qdiscs
  causing HFSC eltree corruption and DRR class stalls
- missing NULL termination on the tfifo linear list tail
- slot delay configuration not validated for inverted ranges
- slot delay arithmetic overflow for ranges above ~2.1 seconds

v5 - fix slot dynamics in the dequeue change

v4 - split refactoring and fix for dequeue into two patches

Stephen Hemminger (8):
  net/sched: netem: fix probability gaps in 4-state loss model
  net/sched: netem: fix queue limit check to include reordered packets
  net/sched: netem: only reseed PRNG when seed is explicitly provided
  net/sched: netem: refactor dequeue into helper functions
  net/sched: netem: batch-transfer ready packets to avoid child
    re-entrancy
  net/sched: netem: null-terminate tfifo linear queue tail
  net/sched: netem: check for invalid slot range
  net/sched: netem: fix slot delay calculation overflow

 net/sched/sch_netem.c | 238 ++++++++++++++++++++++++++++--------------
 1 file changed, 159 insertions(+), 79 deletions(-)

-- 
2.53.0


^ permalink raw reply

* [PATTCH net v5 1/8] net/sched: netem: fix probability gaps in 4-state loss model
From: Stephen Hemminger @ 2026-04-11  5:15 UTC (permalink / raw)
  To: netdev
  Cc: Stephen Hemminger, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>

The 4-state Markov chain in loss_4state() has gaps at the boundaries
between transition probability ranges. The comparisons use:

  if (rnd < a4)
  else if (a4 < rnd && rnd < a1 + a4)

When rnd equals a boundary value exactly, neither branch matches and
no state transition occurs. The redundant lower-bound check (a4 < rnd)
is already implied by being in the else branch.

Remove the unnecessary lower-bound comparisons so the ranges are
contiguous and every random value produces a transition, matching
the GI (General and Intuitive) loss model specification.

This bug goes back to original implementation of this model.

Fixes: 661b79725fea ("netem: revised correlated loss generator")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 net/sched/sch_netem.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 20df1c08b1e9..8ee72cac1faf 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -227,10 +227,10 @@ static bool loss_4state(struct netem_sched_data *q)
 		if (rnd < clg->a4) {
 			clg->state = LOST_IN_GAP_PERIOD;
 			return true;
-		} else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
+		} else if (rnd < clg->a1 + clg->a4) {
 			clg->state = LOST_IN_BURST_PERIOD;
 			return true;
-		} else if (clg->a1 + clg->a4 < rnd) {
+		} else {
 			clg->state = TX_IN_GAP_PERIOD;
 		}
 
@@ -247,9 +247,9 @@ static bool loss_4state(struct netem_sched_data *q)
 	case LOST_IN_BURST_PERIOD:
 		if (rnd < clg->a3)
 			clg->state = TX_IN_BURST_PERIOD;
-		else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
+		else if (rnd < clg->a2 + clg->a3) {
 			clg->state = TX_IN_GAP_PERIOD;
-		} else if (clg->a2 + clg->a3 < rnd) {
+		} else {
 			clg->state = LOST_IN_BURST_PERIOD;
 			return true;
 		}
-- 
2.53.0


^ permalink raw reply related

* [PATTCH net v5 2/8] net/sched: netem: fix queue limit check to include reordered packets
From: Stephen Hemminger @ 2026-04-11  5:15 UTC (permalink / raw)
  To: netdev
  Cc: Stephen Hemminger, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>

The queue limit check in netem_enqueue() uses q->t_len which only
counts packets in the internal tfifo. Packets placed in sch->q by
the reorder path (__qdisc_enqueue_head) are not counted, allowing
the total queue occupancy to exceed sch->limit under reordering.

Include sch->q.qlen in the limit check.

Fixes: 50612537e9ab ("netem: fix classful handling")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 net/sched/sch_netem.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 8ee72cac1faf..d400a730eadd 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -524,7 +524,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 				1 << get_random_u32_below(8);
 	}
 
-	if (unlikely(q->t_len >= sch->limit)) {
+	if (unlikely(sch->q.qlen >= sch->limit)) {
 		/* re-link segs, so that qdisc_drop_all() frees them all */
 		skb->next = segs;
 		qdisc_drop_all(skb, sch, to_free);
-- 
2.53.0


^ permalink raw reply related

* [PATTCH net v5 3/8] net/sched: netem: only reseed PRNG when seed is explicitly provided
From: Stephen Hemminger @ 2026-04-11  5:15 UTC (permalink / raw)
  To: netdev
  Cc: Stephen Hemminger, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	François Michel, open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>

netem_change() unconditionally reseeds the PRNG on every tc change
command. If TCA_NETEM_PRNG_SEED is not specified, a new random seed
is generated, destroying reproducibility for users who set a
deterministic seed on a previous change.

Move the initial random seed generation to netem_init() and only
reseed in netem_change() when TCA_NETEM_PRNG_SEED is explicitly
provided by the user.

Fixes: 4072d97ddc44 ("netem: add prng attribute to netem_sched_data")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 net/sched/sch_netem.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index d400a730eadd..556f9747f0e7 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1112,11 +1112,10 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 	/* capping jitter to the range acceptable by tabledist() */
 	q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
 
-	if (tb[TCA_NETEM_PRNG_SEED])
+	if (tb[TCA_NETEM_PRNG_SEED]) {
 		q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]);
-	else
-		q->prng.seed = get_random_u64();
-	prandom_seed_state(&q->prng.prng_state, q->prng.seed);
+		prandom_seed_state(&q->prng.prng_state, q->prng.seed);
+	}
 
 unlock:
 	sch_tree_unlock(sch);
@@ -1139,6 +1138,9 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt,
 		return -EINVAL;
 
 	q->loss_model = CLG_RANDOM;
+	q->prng.seed = get_random_u64();
+	prandom_seed_state(&q->prng.prng_state, q->prng.seed);
+
 	ret = netem_change(sch, opt, extack);
 	if (ret)
 		pr_info("netem: change failed\n");
-- 
2.53.0


^ permalink raw reply related

* [PATTCH net v5 4/8] net/sched: netem: refactor dequeue into helper functions
From: Stephen Hemminger @ 2026-04-11  5:15 UTC (permalink / raw)
  To: netdev
  Cc: Stephen Hemminger, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>

Extract the tfifo removal, slot accounting, and child/direct dequeue
paths from the monolithic netem_dequeue() into separate helpers:

  netem_pull_tfifo()    - remove head packet from tfifo
  netem_slot_account()  - update slot pacing counters
  netem_dequeue_child() - enqueue to child, then dequeue from child
  netem_dequeue_direct()- dequeue from tfifo when no child

This replaces the goto-based control flow with straightforward function
calls, making the code easier to follow and modify.

No functional change intended.

Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 net/sched/sch_netem.c | 190 +++++++++++++++++++++++++++---------------
 1 file changed, 123 insertions(+), 67 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 556f9747f0e7..e264f7aefb97 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -689,99 +689,155 @@ static struct sk_buff *netem_peek(struct netem_sched_data *q)
 	return q->t_head;
 }
 
-static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
+/*
+ * Pop the head packet from the tfifo and prepare it for delivery.
+ * skb->dev shares the rbnode area and must be restored after removal.
+ */
+static struct sk_buff *netem_pull_tfifo(struct netem_sched_data *q,
+					struct Qdisc *sch)
 {
-	if (skb == q->t_head) {
+	struct sk_buff *skb;
+
+	if (q->t_head) {
+		skb = q->t_head;
 		q->t_head = skb->next;
 		if (!q->t_head)
 			q->t_tail = NULL;
 	} else {
-		rb_erase(&skb->rbnode, &q->t_root);
+		struct rb_node *p = rb_first(&q->t_root);
+
+		if (!p)
+			return NULL;
+		skb = rb_to_skb(p);
+		rb_erase(p, &q->t_root);
 	}
+
+	q->t_len--;
+	skb->next = NULL;
+	skb->prev = NULL;
+	skb->dev = qdisc_dev(sch);
+
+	return skb;
 }
 
-static struct sk_buff *netem_dequeue(struct Qdisc *sch)
+/* Update slot pacing counters after releasing a packet */
+static void netem_slot_account(struct netem_sched_data *q,
+			       const struct sk_buff *skb, u64 now)
+{
+	if (!q->slot.slot_next)
+		return;
+
+	q->slot.packets_left--;
+	q->slot.bytes_left -= qdisc_pkt_len(skb);
+	if (q->slot.packets_left <= 0 || q->slot.bytes_left <= 0)
+		get_slot_next(q, now);
+}
+
+/*
+ * Transfer time-ready packets from the tfifo into the child qdisc,
+ * then dequeue from the child.
+ */
+static struct sk_buff *netem_dequeue_child(struct Qdisc *sch)
 {
 	struct netem_sched_data *q = qdisc_priv(sch);
+	u64 now = ktime_get_ns();
 	struct sk_buff *skb;
 
-tfifo_dequeue:
-	skb = __qdisc_dequeue_head(&sch->q);
-	if (skb) {
-deliver:
-		qdisc_qstats_backlog_dec(sch, skb);
-		qdisc_bstats_update(sch, skb);
-		return skb;
-	}
 	skb = netem_peek(q);
 	if (skb) {
-		u64 time_to_send;
-		u64 now = ktime_get_ns();
+		u64 time_to_send = netem_skb_cb(skb)->time_to_send;
 
-		/* if more time remaining? */
-		time_to_send = netem_skb_cb(skb)->time_to_send;
 		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
 			get_slot_next(q, now);
 
 		if (time_to_send <= now && q->slot.slot_next <= now) {
-			netem_erase_head(q, skb);
-			q->t_len--;
-			skb->next = NULL;
-			skb->prev = NULL;
-			/* skb->dev shares skb->rbnode area,
-			 * we need to restore its value.
-			 */
-			skb->dev = qdisc_dev(sch);
-
-			if (q->slot.slot_next) {
-				q->slot.packets_left--;
-				q->slot.bytes_left -= qdisc_pkt_len(skb);
-				if (q->slot.packets_left <= 0 ||
-				    q->slot.bytes_left <= 0)
-					get_slot_next(q, now);
-			}
-
-			if (q->qdisc) {
-				unsigned int pkt_len = qdisc_pkt_len(skb);
-				struct sk_buff *to_free = NULL;
-				int err;
-
-				err = qdisc_enqueue(skb, q->qdisc, &to_free);
-				kfree_skb_list(to_free);
-				if (err != NET_XMIT_SUCCESS) {
-					if (net_xmit_drop_count(err))
-						qdisc_qstats_drop(sch);
-					sch->qstats.backlog -= pkt_len;
-					sch->q.qlen--;
-					qdisc_tree_reduce_backlog(sch, 1, pkt_len);
-				}
-				goto tfifo_dequeue;
-			}
-			sch->q.qlen--;
-			goto deliver;
-		}
-
-		if (q->qdisc) {
-			skb = q->qdisc->ops->dequeue(q->qdisc);
-			if (skb) {
+			struct sk_buff *to_free = NULL;
+			unsigned int pkt_len;
+			int err;
+
+			skb = netem_pull_tfifo(q, sch);
+			netem_slot_account(q, skb, now);
+
+			pkt_len = qdisc_pkt_len(skb);
+			err = qdisc_enqueue(skb, q->qdisc, &to_free);
+			kfree_skb_list(to_free);
+			if (err != NET_XMIT_SUCCESS) {
+				if (net_xmit_drop_count(err))
+					qdisc_qstats_drop(sch);
+				sch->qstats.backlog -= pkt_len;
 				sch->q.qlen--;
-				goto deliver;
+				qdisc_tree_reduce_backlog(sch, 1, pkt_len);
 			}
 		}
-
-		qdisc_watchdog_schedule_ns(&q->watchdog,
-					   max(time_to_send,
-					       q->slot.slot_next));
 	}
 
-	if (q->qdisc) {
-		skb = q->qdisc->ops->dequeue(q->qdisc);
-		if (skb) {
-			sch->q.qlen--;
-			goto deliver;
-		}
+	skb = q->qdisc->ops->dequeue(q->qdisc);
+	if (skb)
+		sch->q.qlen--;
+
+	return skb;
+}
+
+/* Dequeue directly from the tfifo when no child qdisc is configured. */
+static struct sk_buff *netem_dequeue_direct(struct Qdisc *sch)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+	u64 time_to_send;
+	u64 now;
+
+	skb = netem_peek(q);
+	if (!skb)
+		return NULL;
+
+	now = ktime_get_ns();
+	time_to_send = netem_skb_cb(skb)->time_to_send;
+
+	if (q->slot.slot_next && q->slot.slot_next < time_to_send)
+		get_slot_next(q, now);
+
+	if (time_to_send > now || q->slot.slot_next > now)
+		return NULL;
+
+	skb = netem_pull_tfifo(q, sch);
+	netem_slot_account(q, skb, now);
+	sch->q.qlen--;
+
+	return skb;
+}
+
+static struct sk_buff *netem_dequeue(struct Qdisc *sch)
+{
+	struct netem_sched_data *q = qdisc_priv(sch);
+	struct sk_buff *skb;
+
+	/* First check the reorder queue */
+	skb = __qdisc_dequeue_head(&sch->q);
+	if (skb)
+		goto deliver;
+
+	if (q->qdisc)
+		skb = netem_dequeue_child(sch);
+	else
+		skb = netem_dequeue_direct(sch);
+
+	if (skb)
+		goto deliver;
+
+	/* Nothing ready — schedule watchdog for next packet */
+	skb = netem_peek(q);
+	if (skb) {
+		u64 time_to_send = netem_skb_cb(skb)->time_to_send;
+
+		qdisc_watchdog_schedule_ns(&q->watchdog,
+					   max(time_to_send, q->slot.slot_next));
 	}
 	return NULL;
+
+deliver:
+	qdisc_qstats_backlog_dec(sch, skb);
+	qdisc_bstats_update(sch, skb);
+	return skb;
 }
 
 static void netem_reset(struct Qdisc *sch)
-- 
2.53.0


^ permalink raw reply related

* [PATTCH net v5 5/8] net/sched: netem: batch-transfer ready packets to avoid child re-entrancy
From: Stephen Hemminger @ 2026-04-11  5:15 UTC (permalink / raw)
  To: netdev
  Cc: Stephen Hemminger, Jamal Hadi Salim, Jiri Pirko, David S. Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman,
	open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>

netem_dequeue_child() previously transferred one packet from the tfifo
to the child qdisc per dequeue call. Parents like HFSC that track
class active/inactive state on qlen transitions could see an enqueue
during dequeue, causing double-insertion into the eltree
(CVE-2025-37890, CVE-2025-38001). Non-work-conserving children like
TBF could also refuse to return a just-enqueued packet, making netem
return NULL despite having backlog, which causes parents like DRR to
incorrectly deactivate the class.

Move all time-ready packets into the child before calling its dequeue.
This separates the enqueue and dequeue phases so the parent sees
consistent qlen transitions.

Fixes: 50612537e9ab ("netem: fix classful handling")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 net/sched/sch_netem.c | 48 +++++++++++++++++++++++--------------------
 1 file changed, 26 insertions(+), 22 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index e264f7aefb97..98931bb4354b 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -734,8 +734,10 @@ static void netem_slot_account(struct netem_sched_data *q,
 }
 
 /*
- * Transfer time-ready packets from the tfifo into the child qdisc,
- * then dequeue from the child.
+ * Transfer all time-ready packets from the tfifo into the child qdisc,
+ * then dequeue from the child.  Batching the transfers avoids calling
+ * qdisc_enqueue() inside the parent's dequeue path, which confuses
+ * parents that track active/inactive state on qlen transitions (HFSC).
  */
 static struct sk_buff *netem_dequeue_child(struct Qdisc *sch)
 {
@@ -743,31 +745,33 @@ static struct sk_buff *netem_dequeue_child(struct Qdisc *sch)
 	u64 now = ktime_get_ns();
 	struct sk_buff *skb;
 
-	skb = netem_peek(q);
-	if (skb) {
-		u64 time_to_send = netem_skb_cb(skb)->time_to_send;
+	while ((skb = netem_peek(q)) != NULL) {
+		struct sk_buff *to_free = NULL;
+		unsigned int pkt_len;
+		u64 time_to_send;
+		int err;
 
+		time_to_send = netem_skb_cb(skb)->time_to_send;
 		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
 			get_slot_next(q, now);
 
-		if (time_to_send <= now && q->slot.slot_next <= now) {
-			struct sk_buff *to_free = NULL;
-			unsigned int pkt_len;
-			int err;
-
-			skb = netem_pull_tfifo(q, sch);
-			netem_slot_account(q, skb, now);
+		if (time_to_send > now)
+			break;
+		if (q->slot.slot_next > now)
+			break;
 
-			pkt_len = qdisc_pkt_len(skb);
-			err = qdisc_enqueue(skb, q->qdisc, &to_free);
-			kfree_skb_list(to_free);
-			if (err != NET_XMIT_SUCCESS) {
-				if (net_xmit_drop_count(err))
-					qdisc_qstats_drop(sch);
-				sch->qstats.backlog -= pkt_len;
-				sch->q.qlen--;
-				qdisc_tree_reduce_backlog(sch, 1, pkt_len);
-			}
+		skb = netem_pull_tfifo(q, sch);
+		netem_slot_account(q, skb, now);
+
+		pkt_len = qdisc_pkt_len(skb);
+		err = qdisc_enqueue(skb, q->qdisc, &to_free);
+		kfree_skb_list(to_free);
+		if (unlikely(err != NET_XMIT_SUCCESS)) {
+			if (net_xmit_drop_count(err))
+				qdisc_qstats_drop(sch);
+			sch->qstats.backlog -= pkt_len;
+			sch->q.qlen--;
+			qdisc_tree_reduce_backlog(sch, 1, pkt_len);
 		}
 	}
 
-- 
2.53.0


^ permalink raw reply related

* [PATTCH net v5 6/8] net/sched: netem: null-terminate tfifo linear queue tail
From: Stephen Hemminger @ 2026-04-11  5:15 UTC (permalink / raw)
  To: netdev
  Cc: Stephen Hemminger, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Peter Oskolkov, open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>

When tfifo_enqueue() appends a packet to the linear queue tail,
nskb->next is never set to NULL. The list terminates correctly
only by accident if the skb arrived with next already NULL.

Explicitly null-terminate the tail to prevent list corruption.

Fixes: d66280b12bd7 ("net: netem: use a list in addition to rbtree")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 net/sched/sch_netem.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 98931bb4354b..14d22fc7365d 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -398,6 +398,7 @@ static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 			q->t_tail->next = nskb;
 		else
 			q->t_head = nskb;
+		nskb->next = NULL;
 		q->t_tail = nskb;
 	} else {
 		struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
-- 
2.53.0


^ permalink raw reply related

* [PATTCH net v5 7/8] net/sched: netem: check for invalid slot range
From: Stephen Hemminger @ 2026-04-11  5:15 UTC (permalink / raw)
  To: netdev
  Cc: Stephen Hemminger, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Neal Cardwell, Yousuk Seung, open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>

Reject slot configuration where min_delay exceeds max_delay.
The delay range computation in get_slot_next() underflows in
this case, producing bogus results.

Fixes: 0a9fe5c375b5 ("netem: slotting with non-uniform distribution")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 net/sched/sch_netem.c | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 14d22fc7365d..ef4965f20f17 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -888,6 +888,18 @@ static int get_dist_table(struct disttable **tbl, const struct nlattr *attr)
 	return 0;
 }
 
+static int validate_slot(const struct nlattr *attr,
+			 struct netlink_ext_ack *extack)
+{
+	const struct tc_netem_slot *c = nla_data(attr);
+
+	if (c->min_delay > c->max_delay) {
+		NL_SET_ERR_MSG(extack, "slot min delay greater than max delay");
+		return -EINVAL;
+	}
+	return 0;
+}
+
 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
 {
 	const struct tc_netem_slot *c = nla_data(attr);
@@ -1101,6 +1113,12 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 			goto table_free;
 	}
 
+	if (tb[TCA_NETEM_SLOT]) {
+		ret = validate_slot(tb[TCA_NETEM_SLOT], extack);
+		if (ret)
+			goto table_free;
+	}
+
 	sch_tree_lock(sch);
 	/* backup q->clg and q->loss_model */
 	old_clg = q->clg;
-- 
2.53.0


^ permalink raw reply related

* [PATTCH net v5 8/8] net/sched: netem: fix slot delay calculation overflow
From: Stephen Hemminger @ 2026-04-11  5:15 UTC (permalink / raw)
  To: netdev
  Cc: Stephen Hemminger, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Neal Cardwell, Yousuk Seung, open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>

get_slot_next() computes a random delay between min_delay and
max_delay using:

  get_random_u32() * (max_delay - min_delay) >> 32

This overflows signed 64-bit arithmetic when the delay range exceeds
approximately 2.1 seconds (2^31 nanoseconds), producing a negative
result that effectively disables slot-based pacing. This is a
realistic configuration for WAN emulation (e.g., slot 1s 5s).

Use mul_u64_u32_shr() which handles the widening multiply without
overflow.

Fixes: 0a9fe5c375b5 ("netem: slotting with non-uniform distribution")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
 net/sched/sch_netem.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index ef4965f20f17..6a09627bafa0 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -660,9 +660,8 @@ static void get_slot_next(struct netem_sched_data *q, u64 now)
 
 	if (!q->slot_dist)
 		next_delay = q->slot_config.min_delay +
-				(get_random_u32() *
-				 (q->slot_config.max_delay -
-				  q->slot_config.min_delay) >> 32);
+			mul_u64_u32_shr(q->slot_config.max_delay - q->slot_config.min_delay,
+					get_random_u32(), 32);
 	else
 		next_delay = tabledist(q->slot_config.dist_delay,
 				       (s32)(q->slot_config.dist_jitter),
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH] net: Optimize flush calculation in inet_gro_receive()
From: Kuniyuki Iwashima @ 2026-04-11  5:19 UTC (permalink / raw)
  To: deller; +Cc: davem, dsahern, linux-kernel, linux-parisc, netdev, edumazet
In-Reply-To: <adkMqgP6QeBPgqP9@p100>

From: Helge Deller <deller@kernel.org>
Date: Fri, 10 Apr 2026 16:43:54 +0200
> For the calculation of the flush variable, use the get_unaligned_xxx() helpers
> to access only relevant bits of the IP header.
> 
> Note: Since I don't know the network details, I'm not sure if "& ~IP_DF"
> (& ~0x4000) is correct, or if "& IP_OFFSET" (& 0x1FFF) should be used instead

~IP_DF is correct (MF bit needs to be checked), see

commit db8caf3dbc77599dc90f4ea0a803cd1d97116f30
Author: Eric Dumazet <edumazet@google.com>
Date:   Fri May 31 11:18:10 2013

    gro: should aggregate frames without DF


> (which I believe would be more correct). Instead of possibly breaking things I
> left it as is, but maybe some expert can check?
> 
> Signed-off-by: Helge Deller <deller@gmx.de>
> 
> diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
> index c7731e300a44..58cad2687c2c 100644
> --- a/net/ipv4/af_inet.c
> +++ b/net/ipv4/af_inet.c
> @@ -1479,7 +1479,7 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
>  	struct sk_buff *p;
>  	unsigned int hlen;
>  	unsigned int off;
> -	int flush = 1;
> +	u16 flush = 1;
>  	int proto;
>  
>  	off = skb_gro_offset(skb);
> @@ -1504,7 +1504,8 @@ struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb)
>  		goto out;
>  
>  	NAPI_GRO_CB(skb)->proto = proto;
> -	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (ntohl(*(__be32 *)&iph->id) & ~IP_DF));
> +	flush = (get_unaligned_be16(&iph->tot_len) ^ skb_gro_len(skb)) |
> +	        (get_unaligned_be16(&iph->frag_off) & ~IP_DF);

I think here we intentionally use 32-bit loads:

commit 1075f3f65d0e0f49351b7d4310e9f94483972a51
Author: Herbert Xu <herbert@gondor.apana.org.au>
Date:   Tue May 26 18:50:29 2009

    ipv4: Use 32-bit loads for ID and length in GRO


Before your patch, 32-bit load + bswap are used while
16-bit load + rol 8 after the change.

I feel the 4-byte aligned load + bswap is faster than
misaligned access + 8 times shift (Is this internally
optimised like xchg for a single word size ?)

Do you have some numbers ?


Before:
	flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb))
mov    edx,DWORD PTR [rcx]
bswap  edx
	return skb->len - NAPI_GRO_CB(skb)->data_offset;
mov    r8d,DWORD PTR [rsi+0x38]
mov    r9d,DWORD PTR [rsi+0x70]
sub    r9d,r8d
xor    r9d,edx
	| (ntohl(*(__be32 *)&iph->id) & ~IP_DF));
mov    ebp,0xffbfffff
and    ebp,DWORD PTR [rcx+0x4]
bswap  ebp
or     ebp,r9d


After:
	flush = (get_unaligned_be16(&iph->tot_len) ^ skb_gro_len(skb))
movzx  edx,WORD PTR [rcx+0x2]
rol    dx,0x8
	return skb->len - NAPI_GRO_CB(skb)->data_offset;
mov    r8d,DWORD PTR [rsi+0x38]
mov    r9d,DWORD PTR [rsi+0x70]
sub    r9d,r8d
xor    r9d,edx
	| (get_unaligned_be16(&iph->frag_off) & ~IP_DF);
movzx  ebp,WORD PTR [rcx+0x6]
and    ebp,0xffffffbf
rol    bp,0x8
or     ebp,r9d

^ permalink raw reply

* Re: [PATCH iwl-next 2/2] idpf: implement pci error handlers
From: Lukas Wunner @ 2026-04-11  5:43 UTC (permalink / raw)
  To: Emil Tantilov
  Cc: intel-wired-lan, netdev, przemyslaw.kitszel, jay.bhat,
	ivan.d.barrera, aleksandr.loktionov, larysa.zaremba,
	anthony.l.nguyen, andrew+netdev, davem, edumazet, kuba, pabeni,
	aleksander.lobakin, linux-pci, madhu.chittim, decot, willemb,
	sheenamo
In-Reply-To: <20260411003959.30959-3-emil.s.tantilov@intel.com>

On Fri, Apr 10, 2026 at 05:39:59PM -0700, Emil Tantilov wrote:
> +static pci_ers_result_t
> +idpf_pci_err_slot_reset(struct pci_dev *pdev)
> +{
> +	struct idpf_adapter *adapter = pci_get_drvdata(pdev);
> +
> +	pci_restore_state(pdev);
> +	pci_set_master(pdev);
> +	pci_wake_from_d3(pdev, false);
> +	if (readl(adapter->reset_reg.rstat) != 0xFFFFFFFF) {
> +		pci_save_state(pdev);
> +		return PCI_ERS_RESULT_RECOVERED;
> +	}

The pci_save_state() is no longer necessary here, please drop it.
See commits a2f1e22390ac and 383d89699c50 for details.

Thanks,

Lukas

^ permalink raw reply

* Re: [PATCH net] netrom: do some basic forms of validation on incoming frames
From: Greg KH @ 2026-04-11  5:50 UTC (permalink / raw)
  To: hugh
  Cc: Kuniyuki Iwashima, kuba, davem, edumazet, horms, linux-hams,
	linux-kernel, netdev, pabeni, stable, workflows, yizhe
In-Reply-To: <4f5810a7-c792-4d6b-9f7c-6c6b289def19@blemings.org>

On Sat, Apr 11, 2026 at 08:25:19AM +1000, Hugh Blemings wrote:
> 
> On 11/4/2026 08:11, Kuniyuki Iwashima wrote:
> > From: Jakub Kicinski <kuba@kernel.org>
> > Date: Fri, 10 Apr 2026 14:54:48 -0700
> > > On Fri, 10 Apr 2026 14:30:42 -0700 Jakub Kicinski wrote:
> > > > On Fri, 10 Apr 2026 07:24:36 +0200 Greg Kroah-Hartman wrote:
> > > > > On Thu, Apr 09, 2026 at 08:32:35PM -0700, Jakub Kicinski wrote:
> > > > > > Or for simplicity we could also be testing against skb_headlen()
> > > > > > since we don't expect any legit non-linear frames here? Dunno.
> > > > > I'll be glad to change this either way, your call.  Given that this is
> > > > > an obsolete protocol that seems to only be a target for drive-by fuzzers
> > > > > to attack, whatever the simplest thing to do to quiet them up I'll be
> > > > > glad to implement.
> > > > > 
> > > > > Or can we just delete this stuff entirely?  :)
> > > > Yes.
> > > > 
> > > > My thinking is to delete hamradio, nfc, atm, caif.. [more to come]
> > > > Create GH repos which provide them as OOT modules.
> > > > Hopefully we can convince any existing users to switch to that.
> > > > 
> > > > The only thing stopping me is the concern that this is just the softest
> > > > target and the LLMs will find something else to focus on which we can't
> > > > delete. I suspect any PCIe driver can be flooded with "aren't you
> > > > trusting the HW to provide valid responses here?" bullshit.
> > > > 
> > > > But hey, let's try. I'll post a patch nuking all of hamradio later
> > > > today.
> > > Well, either we "expunge" this code to OOT repos, or we mark it
> > > as broken and tell everyone that we don't take security fixes
> > > for anything that depends on BROKEN. I'd personally rather expunge.
> > +1 for "expunge" to prevent LLM-based patch flood.
> > 
> > IIRC, we did that recently for one driver only used by OpenWRT ?
> > 
> > 
> If the main concern here is ongoing maintenance of these Ham Radio related
> protocols/drivers, can we pause for a moment on anything as dramatic as
> removing from the tree entirely ?

Sure, but:

> There is a good cohort of capable kernel folks that either are or were ham
> radio operators who I believe, upon realising that things have got to this
> point, will be happy to redouble efforts to ensure this code maintained and
> tested to a satisfactory standard.

We need this code to be maintained, because as is being shown, there are
reported problems with it that will affect these devices/networks that
you all are using.  So all we need is a maintainer for this to be able
to take reports that we get and fix things up as needed.  I know you
have that experience, want to come back to kernel development, we've
missed you :)

thanks,

greg k-h

^ permalink raw reply

* [PATCH net] openvswitch: fix kernel panic from oversized vport upcall PID arrays
From: Weiming Shi @ 2026-04-11  5:59 UTC (permalink / raw)
  To: Aaron Conole, Eelco Chaudron, Ilya Maximets, David S . Miller,
	Eric Dumazet, Jakub Kicinski, Paolo Abeni
  Cc: Simon Horman, Pravin B Shelar, Flavio Leitner, Mark Gray, netdev,
	dev, linux-kernel, Xiang Mei, Weiming Shi

The vport netlink reply helpers allocate a fixed-size skb with
nlmsg_new(NLMSG_DEFAULT_SIZE, ...) but serialize the full upcall PID
array via ovs_vport_get_upcall_portids(). Since
ovs_vport_set_upcall_portids() accepts any non-zero multiple of
sizeof(u32) with no upper bound, a CAP_NET_ADMIN user can install a
PID array large enough to overflow the reply buffer. When the
subsequent nla_put() fails with -EMSGSIZE, five BUG_ON(err < 0) sites
fire and panic the kernel. On systems with unprivileged user namespaces
enabled (e.g., Ubuntu default), this is reachable via unshare -Urn.

 kernel BUG at net/openvswitch/datapath.c:2414!
 Oops: invalid opcode: 0000 [#1] SMP KASAN NOPTI
 CPU: 1 UID: 0 PID: 65 Comm: poc Not tainted 7.0.0-rc7-00195-geb216e422044 #1
 RIP: 0010:ovs_vport_cmd_set (net/openvswitch/datapath.c:2414 (discriminator 1))
 Call Trace:
  <TASK>
  genl_family_rcv_msg_doit (net/netlink/genetlink.c:1116)
  genl_rcv_msg (net/netlink/genetlink.c:1194 net/netlink/genetlink.c:1209)
  netlink_rcv_skb (net/netlink/af_netlink.c:2550)
  genl_rcv (net/netlink/genetlink.c:1219)
  netlink_unicast (net/netlink/af_netlink.c:1319 net/netlink/af_netlink.c:1344)
  netlink_sendmsg (net/netlink/af_netlink.c:1894)
  __sys_sendto (net/socket.c:2206 (discriminator 1))
  __x64_sys_sendto (net/socket.c:2209)
  do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1))
  entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130)
  </TASK>
 Kernel panic - not syncing: Fatal exception

Dynamically compute the reply skb size based on the vport's actual PID
array length instead of using a fixed NLMSG_DEFAULT_SIZE, and replace
the BUG_ON() calls with WARN_ON_ONCE() plus graceful error returns.

Fixes: b83d23a2a38b ("openvswitch: Introduce per-cpu upcall dispatch")
Reported-by: Xiang Mei <xmei5@asu.edu>
Signed-off-by: Weiming Shi <bestswngs@gmail.com>
---
 net/openvswitch/datapath.c | 95 ++++++++++++++++++++++++++------------
 1 file changed, 66 insertions(+), 29 deletions(-)

diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index e209099218b4..3649a1f2a3f5 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -2184,9 +2184,17 @@ static int ovs_vport_cmd_fill_info(struct vport *vport, struct sk_buff *skb,
 	return err;
 }
 
-static struct sk_buff *ovs_vport_cmd_alloc_info(void)
+/* Must be called with ovs_mutex or rcu_read_lock. */
+static size_t ovs_vport_cmd_msg_size(const struct vport *vport)
 {
-	return nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	const struct vport_portids *ids;
+	size_t msgsize = NLMSG_DEFAULT_SIZE;
+
+	ids = rcu_dereference_ovsl(vport->upcall_portids);
+	if (ids && (vport->dp->user_features & OVS_DP_F_VPORT_PIDS))
+		msgsize += ids->n_ids * sizeof(u32);
+
+	return msgsize;
 }
 
 /* Called with ovs_mutex, only via ovs_dp_notify_wq(). */
@@ -2196,13 +2204,16 @@ struct sk_buff *ovs_vport_cmd_build_info(struct vport *vport, struct net *net,
 	struct sk_buff *skb;
 	int retval;
 
-	skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+	skb = nlmsg_new(ovs_vport_cmd_msg_size(vport), GFP_KERNEL);
 	if (!skb)
 		return ERR_PTR(-ENOMEM);
 
 	retval = ovs_vport_cmd_fill_info(vport, skb, net, portid, seq, 0, cmd,
 					 GFP_KERNEL);
-	BUG_ON(retval < 0);
+	if (WARN_ON_ONCE(retval < 0)) {
+		kfree_skb(skb);
+		return ERR_PTR(retval);
+	}
 
 	return skb;
 }
@@ -2303,7 +2314,8 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
 	if (port_no >= DP_MAX_PORTS)
 		return -EFBIG;
 
-	reply = ovs_vport_cmd_alloc_info();
+	reply = genlmsg_new(NLMSG_DEFAULT_SIZE +
+			    nla_len(a[OVS_VPORT_ATTR_UPCALL_PID]), GFP_KERNEL);
 	if (!reply)
 		return -ENOMEM;
 
@@ -2358,7 +2370,9 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
 	else
 		netdev_set_rx_headroom(vport->dev, dp->max_headroom);
 
-	BUG_ON(err < 0);
+	if (WARN_ON_ONCE(err < 0))
+		goto exit_unlock_free;
+
 	ovs_unlock();
 
 	ovs_notify(&dp_vport_genl_family, reply, info);
@@ -2377,49 +2391,52 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
 	struct vport *vport;
 	int err;
 
-	reply = ovs_vport_cmd_alloc_info();
-	if (!reply)
-		return -ENOMEM;
-
 	ovs_lock();
 	vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a);
 	err = PTR_ERR(vport);
 	if (IS_ERR(vport))
-		goto exit_unlock_free;
+		goto exit_unlock;
 
 	if (a[OVS_VPORT_ATTR_TYPE] &&
 	    nla_get_u32(a[OVS_VPORT_ATTR_TYPE]) != vport->ops->type) {
 		err = -EINVAL;
-		goto exit_unlock_free;
+		goto exit_unlock;
 	}
 
 	if (a[OVS_VPORT_ATTR_OPTIONS]) {
 		err = ovs_vport_set_options(vport, a[OVS_VPORT_ATTR_OPTIONS]);
 		if (err)
-			goto exit_unlock_free;
+			goto exit_unlock;
 	}
 
-
 	if (a[OVS_VPORT_ATTR_UPCALL_PID]) {
 		struct nlattr *ids = a[OVS_VPORT_ATTR_UPCALL_PID];
 
 		err = ovs_vport_set_upcall_portids(vport, ids);
 		if (err)
-			goto exit_unlock_free;
+			goto exit_unlock;
+	}
+
+	reply = genlmsg_new(ovs_vport_cmd_msg_size(vport), GFP_KERNEL);
+	if (!reply) {
+		err = -ENOMEM;
+		goto exit_unlock;
 	}
 
 	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
 				      info->snd_portid, info->snd_seq, 0,
 				      OVS_VPORT_CMD_SET, GFP_KERNEL);
-	BUG_ON(err < 0);
+	if (WARN_ON_ONCE(err < 0)) {
+		kfree_skb(reply);
+		goto exit_unlock;
+	}
 
 	ovs_unlock();
 	ovs_notify(&dp_vport_genl_family, reply, info);
 	return 0;
 
-exit_unlock_free:
+exit_unlock:
 	ovs_unlock();
-	kfree_skb(reply);
 	return err;
 }
 
@@ -2433,25 +2450,30 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	unsigned int new_headroom;
 	int err;
 
-	reply = ovs_vport_cmd_alloc_info();
-	if (!reply)
-		return -ENOMEM;
-
 	ovs_lock();
 	vport = lookup_vport(sock_net(skb->sk), genl_info_userhdr(info), a);
 	err = PTR_ERR(vport);
 	if (IS_ERR(vport))
-		goto exit_unlock_free;
+		goto exit_unlock;
 
 	if (vport->port_no == OVSP_LOCAL) {
 		err = -EINVAL;
-		goto exit_unlock_free;
+		goto exit_unlock;
+	}
+
+	reply = genlmsg_new(ovs_vport_cmd_msg_size(vport), GFP_KERNEL);
+	if (!reply) {
+		err = -ENOMEM;
+		goto exit_unlock;
 	}
 
 	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
 				      info->snd_portid, info->snd_seq, 0,
 				      OVS_VPORT_CMD_DEL, GFP_KERNEL);
-	BUG_ON(err < 0);
+	if (WARN_ON_ONCE(err < 0)) {
+		kfree_skb(reply);
+		goto exit_unlock;
+	}
 
 	/* the vport deletion may trigger dp headroom update */
 	dp = vport->dp;
@@ -2472,9 +2494,8 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
 	ovs_notify(&dp_vport_genl_family, reply, info);
 	return 0;
 
-exit_unlock_free:
+exit_unlock:
 	ovs_unlock();
-	kfree_skb(reply);
 	return err;
 }
 
@@ -2484,9 +2505,20 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	struct ovs_header *ovs_header = genl_info_userhdr(info);
 	struct sk_buff *reply;
 	struct vport *vport;
+	size_t msg_size;
 	int err;
 
-	reply = ovs_vport_cmd_alloc_info();
+	rcu_read_lock();
+	vport = lookup_vport(sock_net(skb->sk), ovs_header, a);
+	err = PTR_ERR(vport);
+	if (IS_ERR(vport)) {
+		rcu_read_unlock();
+		return err;
+	}
+	msg_size = ovs_vport_cmd_msg_size(vport);
+	rcu_read_unlock();
+
+	reply = genlmsg_new(msg_size, GFP_KERNEL);
 	if (!reply)
 		return -ENOMEM;
 
@@ -2495,12 +2527,17 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
 	err = PTR_ERR(vport);
 	if (IS_ERR(vport))
 		goto exit_unlock_free;
+
 	err = ovs_vport_cmd_fill_info(vport, reply, genl_info_net(info),
 				      info->snd_portid, info->snd_seq, 0,
 				      OVS_VPORT_CMD_GET, GFP_ATOMIC);
-	BUG_ON(err < 0);
 	rcu_read_unlock();
 
+	if (err < 0) {
+		kfree_skb(reply);
+		return err;
+	}
+
 	return genlmsg_reply(reply, info);
 
 exit_unlock_free:
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net-next 1/3] psp: add crypt-offset and spi-threshold get/set attributes
From: Willem de Bruijn @ 2026-04-11  6:11 UTC (permalink / raw)
  To: Jakub Kicinski, Akhilesh Samineni
  Cc: Willem de Bruijn, davem, edumazet, pabeni, andrew+netdev, horms,
	willemb, daniel.zahka, netdev, linux-kernel,
	jayakrishnan.udayavarma, ajit.khaparde, kiran.kella, sachin.suman
In-Reply-To: <20260410135736.4bc7ed46@kernel.org>

Jakub Kicinski wrote:
> On Sat, 11 Apr 2026 01:06:06 +0530 Akhilesh Samineni wrote:
> > On Wed, Apr 8, 2026 at 6:34 AM Jakub Kicinski <kuba@kernel.org> wrote:
> > > On Tue, 07 Apr 2026 17:37:41 -0400 Willem de Bruijn wrote:  
> > > > PSP defines a 6-bit field in 4 octet units. Does this need bounds checking?  
> > >
> > > More fundamentally, were we to support this -- is it a device property
> > > or an assoc property?  
> > 
> > It's a device property. All associations under the device will share
> > the same crypt-offset.
> 
> I don't think there's anything in the spec that says the crypto
> offset is device level.
> At the very least every L4 proto may want to have a different offset.
> We should probably hold off adding this until a real user appears.

On how it is configured, the spec says "The crypt offset can be
specified by the transmit descriptor or by configuration".

So some devices might indeed selectively set it per-packet, e.g.,
for specific protocols.

One real use case is network telemetry, exposing the inner transport
protocol ports. For that to be useful it would have to be enabled on
most if not all packets.

^ permalink raw reply

* Re: [PATTCH net v5 6/8] net/sched: netem: null-terminate tfifo linear queue tail
From: Eric Dumazet @ 2026-04-11  6:38 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: netdev, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
	David S. Miller, Jakub Kicinski, Paolo Abeni, Peter Oskolkov,
	open list
In-Reply-To: <20260411051700.311679-7-stephen@networkplumber.org>

On Fri, Apr 10, 2026 at 10:17 PM Stephen Hemminger
<stephen@networkplumber.org> wrote:
>
> When tfifo_enqueue() appends a packet to the linear queue tail,
> nskb->next is never set to NULL. The list terminates correctly
> only by accident if the skb arrived with next already NULL.
>
> Explicitly null-terminate the tail to prevent list corruption.
>
> Fixes: d66280b12bd7 ("net: netem: use a list in addition to rbtree")
> Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
> Reviewed-by: Simon Horman <horms@kernel.org>

Can you explain how skb->next could be not NULL ?

This would be a bug in the upper stack.

Only TCQ_F_NOLOCK qdiscs (pfifo_fast) can possibly get such skbs, and
it would not care.

Other qdiscs already get skbs with skb_mark_not_on_list(skb).

^ permalink raw reply

* Re: [PATCH net] netrom: do some basic forms of validation on incoming frames
From: Hugh Blemings @ 2026-04-11  7:24 UTC (permalink / raw)
  To: Greg KH, hugh
  Cc: Kuniyuki Iwashima, kuba, davem, edumazet, horms, linux-hams,
	linux-kernel, netdev, pabeni, stable, workflows, yizhe
In-Reply-To: <2026041135-shindig-trekker-5d06@gregkh>


On 11/4/2026 15:50, Greg KH wrote:
> On Sat, Apr 11, 2026 at 08:25:19AM +1000, Hugh Blemings wrote:
>> On 11/4/2026 08:11, Kuniyuki Iwashima wrote:
>>> From: Jakub Kicinski <kuba@kernel.org>
>>> Date: Fri, 10 Apr 2026 14:54:48 -0700
>>>> On Fri, 10 Apr 2026 14:30:42 -0700 Jakub Kicinski wrote:
>>>>> On Fri, 10 Apr 2026 07:24:36 +0200 Greg Kroah-Hartman wrote:
>>>>>> On Thu, Apr 09, 2026 at 08:32:35PM -0700, Jakub Kicinski wrote:
>>>>>>> Or for simplicity we could also be testing against skb_headlen()
>>>>>>> since we don't expect any legit non-linear frames here? Dunno.
>>>>>> I'll be glad to change this either way, your call.  Given that this is
>>>>>> an obsolete protocol that seems to only be a target for drive-by fuzzers
>>>>>> to attack, whatever the simplest thing to do to quiet them up I'll be
>>>>>> glad to implement.
>>>>>>
>>>>>> Or can we just delete this stuff entirely?  :)
>>>>> Yes.
>>>>>
>>>>> My thinking is to delete hamradio, nfc, atm, caif.. [more to come]
>>>>> Create GH repos which provide them as OOT modules.
>>>>> Hopefully we can convince any existing users to switch to that.
>>>>>
>>>>> The only thing stopping me is the concern that this is just the softest
>>>>> target and the LLMs will find something else to focus on which we can't
>>>>> delete. I suspect any PCIe driver can be flooded with "aren't you
>>>>> trusting the HW to provide valid responses here?" bullshit.
>>>>>
>>>>> But hey, let's try. I'll post a patch nuking all of hamradio later
>>>>> today.
>>>> Well, either we "expunge" this code to OOT repos, or we mark it
>>>> as broken and tell everyone that we don't take security fixes
>>>> for anything that depends on BROKEN. I'd personally rather expunge.
>>> +1 for "expunge" to prevent LLM-based patch flood.
>>>
>>> IIRC, we did that recently for one driver only used by OpenWRT ?
>>>
>>>
>> If the main concern here is ongoing maintenance of these Ham Radio related
>> protocols/drivers, can we pause for a moment on anything as dramatic as
>> removing from the tree entirely ?
> Sure, but:
>
>> There is a good cohort of capable kernel folks that either are or were ham
>> radio operators who I believe, upon realising that things have got to this
>> point, will be happy to redouble efforts to ensure this code maintained and
>> tested to a satisfactory standard.
> We need this code to be maintained, because as is being shown, there are
> reported problems with it that will affect these devices/networks that
> you all are using.  So all we need is a maintainer for this to be able
> to take reports that we get and fix things up as needed.  I know you
> have that experience, want to come back to kernel development, we've
> missed you :)

That's most kind Greg, thank you, have missed all you cool kids too :)

More seriously though - I'd be up for doing it, but I think there may be 
others better placed than I who haven't yet realised we have this 
conundrum. I'm nudging a few folks offline on this front.

I've also kicked off a thread in linux-hams to discuss some of the 
broader questions raised about staying in tree, going to out of tree or 
looking at userspace solutions instead.

We'll try get a cohesive picture back over next few days.

Cheers,
Hugh

-- 
I am slowly moving to hugh@blemings.id.au as my main email address.
If you're using hugh@blemings.org please update your address book accordingly.
Thank you :)


^ permalink raw reply

* [PATCH v2 0/6] bus: mhi: host: mhi_phc: Add support for PHC over MHI
From: Krishna Chaitanya Chundru @ 2026-04-11  8:12 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Richard Cochran
  Cc: mhi, linux-arm-msm, linux-kernel, netdev,
	Krishna Chaitanya Chundru, Vivek Pernamitta, Sivareddy Surasani,
	Vivek Pernamitta, Imran Shaik, Taniya Das

This series introduces the MHI PHC (PTP Hardware Clock) driver, which
registers a PTP (Precision Time Protocol) clock and communicates with
the MHI core to get the device side timestamps. These timestamps are
then exposed to the PTP subsystem, enabling precise time synchronization
between the host and the device.

The device exposes these through MHI time sync capability registers.

The following diagram illustrates the architecture and data flow:

 +-------------+    +--------------------+    +--------------+
 |Userspace App|<-->|Kernel PTP framework|<-->|MHI PHC Driver|
 +-------------+    +--------------------+    +--------------+
                                                     |
                                                     v
 +-------------------------------+         +-----------------+
 | MHI Device (Timestamp source) |<------->| MHI Core Driver |
 +-------------------------------+         +-----------------+

- User space applications use the standard Linux PTP interface.
- The PTP subsystem routes IOCTLs to the MHI PHC driver.
- The MHI PHC driver communicates with the MHI core to fetch timestamps.
- The MHI core interacts with the device to retrieve accurate time data.

Signed-off-by: Krishna Chaitanya Chundru <krishna.chundru@oss.qualcomm.com>
---
Changes in V2:
- Rebases to the latest code.
- Patch 1 (bus: mhi: host: Add support to read MHI capabilities) which
is dependent change was brought it here as the series which this patch
is part need to re-designed and will take time https://lore.kernel.org/all/CAFEp6-0ik4B20cRyid9w0f+UgibGciPof9HCWTJ=uBOPvHG35Q@mail.gmail.com/.
- Link to v1: https://lore.kernel.org/all/20250818-tsc_time_sync-v1-0-2747710693ba@oss.qualcomm.com/

---
Imran Shaik (1):
      bus: mhi: host: mhi_phc: Add support for PHC over MHI

Krishna Chaitanya Chundru (3):
      bus: mhi: host: Add support for 64bit register reads and writes
      bus: mhi: pci_generic: Add support for 64 bit register read & write
      bus: mhi: host: Update the Time sync logic to read 64 bit register value

Vivek Pernamitta (2):
      bus: mhi: host: Add support to read MHI capabilities
      bus: mhi: host: Add support for non-posted TSC timesync feature

 drivers/bus/mhi/common.h           |  15 ++++
 drivers/bus/mhi/host/Kconfig       |   8 ++
 drivers/bus/mhi/host/Makefile      |   1 +
 drivers/bus/mhi/host/init.c        |  60 +++++++++++++++
 drivers/bus/mhi/host/internal.h    |   9 +++
 drivers/bus/mhi/host/main.c        |  97 ++++++++++++++++++++++++
 drivers/bus/mhi/host/mhi_phc.c     | 150 +++++++++++++++++++++++++++++++++++++
 drivers/bus/mhi/host/mhi_phc.h     |  28 +++++++
 drivers/bus/mhi/host/pci_generic.c |  46 ++++++++++++
 include/linux/mhi.h                |  43 +++++++++++
 10 files changed, 457 insertions(+)
---
base-commit: e774d5f1bc27a85f858bce7688509e866f8e8a4e
change-id: 20260411-tsc_timesync-f877a0394393

Best regards,
-- 
Krishna Chaitanya Chundru <krishna.chundru@oss.qualcomm.com>


^ permalink raw reply

* [PATCH v2 1/6] bus: mhi: host: Add support to read MHI capabilities
From: Krishna Chaitanya Chundru @ 2026-04-11  8:12 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Richard Cochran
  Cc: mhi, linux-arm-msm, linux-kernel, netdev,
	Krishna Chaitanya Chundru, Vivek Pernamitta, Sivareddy Surasani
In-Reply-To: <20260411-tsc_timesync-v2-0-6f25f72987b3@oss.qualcomm.com>

From: Vivek Pernamitta <vivek.pernamitta@oss.qualcomm.com>

As per MHI spec v1.2,sec 6.6, MHI has capability registers which are
located after the ERDB array. The location of this group of registers is
indicated by the MISCOFF register. Each capability has a capability ID to
determine which functionality is supported and each capability will point
to the next capability supported.

Add a basic function to read those capabilities offsets.

Signed-off-by: Vivek Pernamitta <vivek.pernamitta@oss.qualcomm.com>
Signed-off-by: Sivareddy Surasani <sivareddy.surasani@oss.qualcomm.com>
Signed-off-by: Krishna Chaitanya Chundru <krishna.chundru@oss.qualcomm.com>
---
 drivers/bus/mhi/common.h    | 11 +++++++++++
 drivers/bus/mhi/host/init.c | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+)

diff --git a/drivers/bus/mhi/common.h b/drivers/bus/mhi/common.h
index dda340aaed95a5573a2ec776ca712e11a1ed0b52..4c316f3d5a68beb01f15cf575b03747096fdcf2c 100644
--- a/drivers/bus/mhi/common.h
+++ b/drivers/bus/mhi/common.h
@@ -16,6 +16,7 @@
 #define MHICFG				0x10
 #define CHDBOFF				0x18
 #define ERDBOFF				0x20
+#define MISCOFF				0x24
 #define BHIOFF				0x28
 #define BHIEOFF				0x2c
 #define DEBUGOFF			0x30
@@ -113,6 +114,9 @@
 #define MHISTATUS_MHISTATE_MASK		GENMASK(15, 8)
 #define MHISTATUS_SYSERR_MASK		BIT(2)
 #define MHISTATUS_READY_MASK		BIT(0)
+#define MISC_CAP_MASK			GENMASK(31, 0)
+#define CAP_CAPID_MASK			GENMASK(31, 24)
+#define CAP_NEXT_CAP_MASK		GENMASK(23, 12)
 
 /* Command Ring Element macros */
 /* No operation command */
@@ -204,6 +208,13 @@
 #define MHI_RSCTRE_DATA_DWORD1		cpu_to_le32(FIELD_PREP(GENMASK(23, 16), \
 							       MHI_PKT_TYPE_COALESCING))
 
+#define MHI_CAP_ID_INTX			0x1
+#define MHI_CAP_ID_TIME_SYNC		0x2
+#define MHI_CAP_ID_BW_SCALE		0x3
+#define MHI_CAP_ID_TSC_TIME_SYNC	0x4
+#define MHI_CAP_ID_MAX_TRB_LEN		0x5
+#define MHI_CAP_ID_MAX			0x6
+
 enum mhi_pkt_type {
 	MHI_PKT_TYPE_INVALID = 0x0,
 	MHI_PKT_TYPE_NOOP_CMD = 0x1,
diff --git a/drivers/bus/mhi/host/init.c b/drivers/bus/mhi/host/init.c
index 0a728ca2c494836b0e0ce4c3f4aea41794c0868b..c2162aa04e810e45ccfbedd20aaa62f892420d31 100644
--- a/drivers/bus/mhi/host/init.c
+++ b/drivers/bus/mhi/host/init.c
@@ -466,6 +466,38 @@ static int mhi_init_dev_ctxt(struct mhi_controller *mhi_cntrl)
 	return ret;
 }
 
+static int mhi_find_capability(struct mhi_controller *mhi_cntrl, u32 capability)
+{
+	u32 val, cur_cap, next_offset, cur_offset;
+	int ret;
+
+	/* Get the first supported capability offset */
+	ret = mhi_read_reg_field(mhi_cntrl, mhi_cntrl->regs, MISCOFF, MISC_CAP_MASK, &cur_offset);
+	if (ret)
+		return 0;
+
+	do {
+		if (cur_offset >= mhi_cntrl->reg_len)
+			return 0;
+
+		ret = mhi_read_reg(mhi_cntrl, mhi_cntrl->regs, cur_offset, &val);
+		if (ret)
+			return 0;
+
+		cur_cap = FIELD_GET(CAP_CAPID_MASK, val);
+		next_offset = FIELD_GET(CAP_NEXT_CAP_MASK, val);
+		if (cur_cap >= MHI_CAP_ID_MAX)
+			return 0;
+
+		if (cur_cap == capability)
+			return cur_offset;
+
+		cur_offset = next_offset;
+	} while (next_offset);
+
+	return 0;
+}
+
 int mhi_init_mmio(struct mhi_controller *mhi_cntrl)
 {
 	u32 val;

-- 
2.34.1


^ permalink raw reply related

* [PATCH v2 2/6] bus: mhi: host: Add support for non-posted TSC timesync feature
From: Krishna Chaitanya Chundru @ 2026-04-11  8:12 UTC (permalink / raw)
  To: Manivannan Sadhasivam, Richard Cochran
  Cc: mhi, linux-arm-msm, linux-kernel, netdev,
	Krishna Chaitanya Chundru, Vivek Pernamitta
In-Reply-To: <20260411-tsc_timesync-v2-0-6f25f72987b3@oss.qualcomm.com>

From: Vivek Pernamitta <quic_vpernami@quicinc.com>

Implement non-posted time synchronization as described in section 5.1.1
of the MHI v1.2 specification. The host disables low-power link states
to minimize latency, reads the local time, issues a MMIO read to the
device's TIME register.

Add support for initializing this feature and export a function to be
used by the drivers which does the time synchronization.

MHI reads the device time registers in the MMIO address space pointed to
by the capability register after disabling all low power modes and keeping
MHI in M0. Before and after MHI reads, the local time is captured
and shared for processing.

Signed-off-by: Vivek Pernamitta <quic_vpernami@quicinc.com>
Signed-off-by: Krishna Chaitanya Chundru <krishna.chundru@oss.qualcomm.com>
---
 drivers/bus/mhi/common.h        |  4 +++
 drivers/bus/mhi/host/init.c     | 28 ++++++++++++++++
 drivers/bus/mhi/host/internal.h |  9 +++++
 drivers/bus/mhi/host/main.c     | 74 +++++++++++++++++++++++++++++++++++++++++
 include/linux/mhi.h             | 37 +++++++++++++++++++++
 5 files changed, 152 insertions(+)

diff --git a/drivers/bus/mhi/common.h b/drivers/bus/mhi/common.h
index 4c316f3d5a68beb01f15cf575b03747096fdcf2c..64f9b2b94387a112bb6b5e20c634c3ba8d6bc78e 100644
--- a/drivers/bus/mhi/common.h
+++ b/drivers/bus/mhi/common.h
@@ -118,6 +118,10 @@
 #define CAP_CAPID_MASK			GENMASK(31, 24)
 #define CAP_NEXT_CAP_MASK		GENMASK(23, 12)
 
+/* MHI TSC Timesync */
+#define TSC_TIMESYNC_TIME_LOW_OFFSET	(0x8)
+#define TSC_TIMESYNC_TIME_HIGH_OFFSET	(0xC)
+
 /* Command Ring Element macros */
 /* No operation command */
 #define MHI_TRE_CMD_NOOP_PTR		0
diff --git a/drivers/bus/mhi/host/init.c b/drivers/bus/mhi/host/init.c
index c2162aa04e810e45ccfbedd20aaa62f892420d31..eb720f671726d919646cbc450cd54bda655a1060 100644
--- a/drivers/bus/mhi/host/init.c
+++ b/drivers/bus/mhi/host/init.c
@@ -498,6 +498,30 @@ static int mhi_find_capability(struct mhi_controller *mhi_cntrl, u32 capability)
 	return 0;
 }
 
+static int mhi_init_tsc_timesync(struct mhi_controller *mhi_cntrl)
+{
+	struct device *dev = &mhi_cntrl->mhi_dev->dev;
+	struct mhi_timesync *mhi_tsc_tsync;
+	u32 time_offset;
+	int ret;
+
+	time_offset = mhi_find_capability(mhi_cntrl, MHI_CAP_ID_TSC_TIME_SYNC);
+	if (!time_offset)
+		return -ENXIO;
+
+	mhi_tsc_tsync = devm_kzalloc(dev, sizeof(*mhi_tsc_tsync), GFP_KERNEL);
+	if (!mhi_tsc_tsync)
+		return -ENOMEM;
+
+	mhi_cntrl->tsc_timesync = mhi_tsc_tsync;
+	mutex_init(&mhi_tsc_tsync->ts_mutex);
+
+	/* save time_offset for obtaining time via MMIO register reads */
+	mhi_tsc_tsync->time_reg = mhi_cntrl->regs + time_offset;
+
+	return 0;
+}
+
 int mhi_init_mmio(struct mhi_controller *mhi_cntrl)
 {
 	u32 val;
@@ -635,6 +659,10 @@ int mhi_init_mmio(struct mhi_controller *mhi_cntrl)
 		return ret;
 	}
 
+	ret = mhi_init_tsc_timesync(mhi_cntrl);
+	if (ret)
+		dev_dbg(dev, "TSC Time synchronization init failure\n");
+
 	return 0;
 }
 
diff --git a/drivers/bus/mhi/host/internal.h b/drivers/bus/mhi/host/internal.h
index 7b0ee5e3a12dd585064169b7b884750bf4d8c8db..a0e729e7a1198c1b82c70b6bfe3bc2ee24331229 100644
--- a/drivers/bus/mhi/host/internal.h
+++ b/drivers/bus/mhi/host/internal.h
@@ -15,6 +15,15 @@ extern const struct bus_type mhi_bus_type;
 #define MHI_SOC_RESET_REQ_OFFSET			0xb0
 #define MHI_SOC_RESET_REQ				BIT(0)
 
+/*
+ * With ASPM enabled, the link may enter a low power state, requiring
+ * a wake-up sequence. Use a short burst of back-to-back reads to
+ * transition the link to the active state. Based on testing,
+ * 4 iterations are necessary to ensure reliable wake-up without
+ * excess latency.
+ */
+#define MHI_NUM_BACK_TO_BACK_READS			4
+
 struct mhi_ctxt {
 	struct mhi_event_ctxt *er_ctxt;
 	struct mhi_chan_ctxt *chan_ctxt;
diff --git a/drivers/bus/mhi/host/main.c b/drivers/bus/mhi/host/main.c
index 53c0ffe300702bcc3caa8fd9ea8086203c75b186..b7a727b1a5d1f20b570c62707a991ec5b85bfec7 100644
--- a/drivers/bus/mhi/host/main.c
+++ b/drivers/bus/mhi/host/main.c
@@ -1626,3 +1626,77 @@ int mhi_get_channel_doorbell_offset(struct mhi_controller *mhi_cntrl, u32 *chdb_
 	return 0;
 }
 EXPORT_SYMBOL_GPL(mhi_get_channel_doorbell_offset);
+
+static int mhi_get_remote_time(struct mhi_controller *mhi_cntrl, struct mhi_timesync *mhi_tsync,
+			       struct mhi_timesync_info *time)
+{
+	struct device *dev = &mhi_cntrl->mhi_dev->dev;
+	int ret, i;
+
+	if (!mhi_tsync && !mhi_tsync->time_reg) {
+		dev_err(dev, "Time sync is not supported\n");
+		return -EINVAL;
+	}
+
+	if (unlikely(MHI_PM_IN_ERROR_STATE(mhi_cntrl->pm_state))) {
+		dev_err(dev, "MHI is not in active state, pm_state:%s\n",
+			to_mhi_pm_state_str(mhi_cntrl->pm_state));
+		return -EIO;
+	}
+
+	/* bring to M0 state */
+	ret = mhi_device_get_sync(mhi_cntrl->mhi_dev);
+	if (ret)
+		return ret;
+
+	guard(mutex)(&mhi_tsync->ts_mutex);
+	mhi_cntrl->runtime_get(mhi_cntrl);
+
+	/*
+	 * time critical code to fetch device time, delay between these two steps
+	 * should be deterministic as possible.
+	 */
+	preempt_disable();
+	local_irq_disable();
+
+	time->t_host_pre = ktime_get_real();
+
+	/*
+	 * To ensure the PCIe link is in L0 when ASPM is enabled, perform series
+	 * of back-to-back reads. This is necessary because the link may be in a
+	 * low-power state (e.g., L1 or L1ss), and need to be forced it to
+	 * transition to L0.
+	 */
+	for (i = 0; i < MHI_NUM_BACK_TO_BACK_READS; i++) {
+		ret = mhi_read_reg(mhi_cntrl, mhi_tsync->time_reg,
+				   TSC_TIMESYNC_TIME_LOW_OFFSET, &time->t_dev_lo);
+
+		ret = mhi_read_reg(mhi_cntrl, mhi_tsync->time_reg,
+				   TSC_TIMESYNC_TIME_HIGH_OFFSET, &time->t_dev_hi);
+	}
+
+	time->t_host_post = ktime_get_real();
+
+	local_irq_enable();
+	preempt_enable();
+
+	mhi_cntrl->runtime_put(mhi_cntrl);
+
+	mhi_device_put(mhi_cntrl->mhi_dev);
+
+	return 0;
+}
+
+int mhi_get_remote_tsc_time_sync(struct mhi_device *mhi_dev, struct mhi_timesync_info *time)
+{
+	struct mhi_controller *mhi_cntrl = mhi_dev->mhi_cntrl;
+	struct mhi_timesync *mhi_tsc_tsync = mhi_cntrl->tsc_timesync;
+	int ret;
+
+	ret = mhi_get_remote_time(mhi_cntrl, mhi_tsc_tsync, time);
+	if (ret)
+		dev_err(&mhi_dev->dev, "Failed to get TSC Time Sync value:%d\n", ret);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(mhi_get_remote_tsc_time_sync);
diff --git a/include/linux/mhi.h b/include/linux/mhi.h
index 88ccb3e14f481d6b85c2a314eb74ba960c2d4c81..f39c8ca7c251954f2d83c1227d206b600b88c75f 100644
--- a/include/linux/mhi.h
+++ b/include/linux/mhi.h
@@ -286,6 +286,30 @@ struct mhi_controller_config {
 	bool m2_no_db;
 };
 
+/**
+ * struct mhi_timesync - MHI time synchronization structure
+ * @time_reg: Points to address of Timesync register
+ * @ts_mutex: Mutex for synchronization
+ */
+struct mhi_timesync {
+	void __iomem *time_reg;
+	struct mutex ts_mutex;
+};
+
+/**
+ * struct mhi_timesync_info - MHI time sync info structure
+ * @t_host_pre: Pre host soc time
+ * @t_host_post: Post host soc time
+ * @t_dev_lo: Mhi device time of lower dword
+ * @t_dev_hi: Mhi device time of higher dword
+ */
+struct mhi_timesync_info {
+	ktime_t t_host_pre;
+	ktime_t t_host_post;
+	u32 t_dev_lo;
+	u32 t_dev_hi;
+};
+
 /**
  * struct mhi_controller - Master MHI controller structure
  * @name: Device name of the MHI controller
@@ -323,6 +347,7 @@ struct mhi_controller_config {
  * @mhi_event: MHI event ring configurations table
  * @mhi_cmd: MHI command ring configurations table
  * @mhi_ctxt: MHI device context, shared memory between host and device
+ * @tsc_timesync: MHI TSC timesync
  * @pm_mutex: Mutex for suspend/resume operation
  * @pm_lock: Lock for protecting MHI power management state
  * @timeout_ms: Timeout in ms for state transitions
@@ -401,6 +426,8 @@ struct mhi_controller {
 	struct mhi_cmd *mhi_cmd;
 	struct mhi_ctxt *mhi_ctxt;
 
+	struct mhi_timesync *tsc_timesync;
+
 	struct mutex pm_mutex;
 	rwlock_t pm_lock;
 	u32 timeout_ms;
@@ -795,4 +822,14 @@ bool mhi_queue_is_full(struct mhi_device *mhi_dev, enum dma_data_direction dir);
  */
 int mhi_get_channel_doorbell_offset(struct mhi_controller *mhi_cntrl, u32 *chdb_offset);
 
+/**
+ * mhi_get_remote_tsc_time_sync - get external soc time relative to local soc
+ * time pre and post using MMIO method.
+ * @mhi_dev: Device associated with the channels
+ * @time: mhi_timesync_info to get device time details
+ *
+ * Returns:
+ * 0 for success, error code for failure
+ */
+int mhi_get_remote_tsc_time_sync(struct mhi_device *mhi_dev, struct mhi_timesync_info *time);
 #endif /* _MHI_H_ */

-- 
2.34.1


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox