* [PATTCH net v5 4/8] net/sched: netem: refactor dequeue into helper functions
From: Stephen Hemminger @ 2026-04-11 5:15 UTC (permalink / raw)
To: netdev
Cc: Stephen Hemminger, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>
Extract the tfifo removal, slot accounting, and child/direct dequeue
paths from the monolithic netem_dequeue() into separate helpers:
netem_pull_tfifo() - remove head packet from tfifo
netem_slot_account() - update slot pacing counters
netem_dequeue_child() - enqueue to child, then dequeue from child
netem_dequeue_direct()- dequeue from tfifo when no child
This replaces the goto-based control flow with straightforward function
calls, making the code easier to follow and modify.
No functional change intended.
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
net/sched/sch_netem.c | 190 +++++++++++++++++++++++++++---------------
1 file changed, 123 insertions(+), 67 deletions(-)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 556f9747f0e7..e264f7aefb97 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -689,99 +689,155 @@ static struct sk_buff *netem_peek(struct netem_sched_data *q)
return q->t_head;
}
-static void netem_erase_head(struct netem_sched_data *q, struct sk_buff *skb)
+/*
+ * Pop the head packet from the tfifo and prepare it for delivery.
+ * skb->dev shares the rbnode area and must be restored after removal.
+ */
+static struct sk_buff *netem_pull_tfifo(struct netem_sched_data *q,
+ struct Qdisc *sch)
{
- if (skb == q->t_head) {
+ struct sk_buff *skb;
+
+ if (q->t_head) {
+ skb = q->t_head;
q->t_head = skb->next;
if (!q->t_head)
q->t_tail = NULL;
} else {
- rb_erase(&skb->rbnode, &q->t_root);
+ struct rb_node *p = rb_first(&q->t_root);
+
+ if (!p)
+ return NULL;
+ skb = rb_to_skb(p);
+ rb_erase(p, &q->t_root);
}
+
+ q->t_len--;
+ skb->next = NULL;
+ skb->prev = NULL;
+ skb->dev = qdisc_dev(sch);
+
+ return skb;
}
-static struct sk_buff *netem_dequeue(struct Qdisc *sch)
+/* Update slot pacing counters after releasing a packet */
+static void netem_slot_account(struct netem_sched_data *q,
+ const struct sk_buff *skb, u64 now)
+{
+ if (!q->slot.slot_next)
+ return;
+
+ q->slot.packets_left--;
+ q->slot.bytes_left -= qdisc_pkt_len(skb);
+ if (q->slot.packets_left <= 0 || q->slot.bytes_left <= 0)
+ get_slot_next(q, now);
+}
+
+/*
+ * Transfer time-ready packets from the tfifo into the child qdisc,
+ * then dequeue from the child.
+ */
+static struct sk_buff *netem_dequeue_child(struct Qdisc *sch)
{
struct netem_sched_data *q = qdisc_priv(sch);
+ u64 now = ktime_get_ns();
struct sk_buff *skb;
-tfifo_dequeue:
- skb = __qdisc_dequeue_head(&sch->q);
- if (skb) {
-deliver:
- qdisc_qstats_backlog_dec(sch, skb);
- qdisc_bstats_update(sch, skb);
- return skb;
- }
skb = netem_peek(q);
if (skb) {
- u64 time_to_send;
- u64 now = ktime_get_ns();
+ u64 time_to_send = netem_skb_cb(skb)->time_to_send;
- /* if more time remaining? */
- time_to_send = netem_skb_cb(skb)->time_to_send;
if (q->slot.slot_next && q->slot.slot_next < time_to_send)
get_slot_next(q, now);
if (time_to_send <= now && q->slot.slot_next <= now) {
- netem_erase_head(q, skb);
- q->t_len--;
- skb->next = NULL;
- skb->prev = NULL;
- /* skb->dev shares skb->rbnode area,
- * we need to restore its value.
- */
- skb->dev = qdisc_dev(sch);
-
- if (q->slot.slot_next) {
- q->slot.packets_left--;
- q->slot.bytes_left -= qdisc_pkt_len(skb);
- if (q->slot.packets_left <= 0 ||
- q->slot.bytes_left <= 0)
- get_slot_next(q, now);
- }
-
- if (q->qdisc) {
- unsigned int pkt_len = qdisc_pkt_len(skb);
- struct sk_buff *to_free = NULL;
- int err;
-
- err = qdisc_enqueue(skb, q->qdisc, &to_free);
- kfree_skb_list(to_free);
- if (err != NET_XMIT_SUCCESS) {
- if (net_xmit_drop_count(err))
- qdisc_qstats_drop(sch);
- sch->qstats.backlog -= pkt_len;
- sch->q.qlen--;
- qdisc_tree_reduce_backlog(sch, 1, pkt_len);
- }
- goto tfifo_dequeue;
- }
- sch->q.qlen--;
- goto deliver;
- }
-
- if (q->qdisc) {
- skb = q->qdisc->ops->dequeue(q->qdisc);
- if (skb) {
+ struct sk_buff *to_free = NULL;
+ unsigned int pkt_len;
+ int err;
+
+ skb = netem_pull_tfifo(q, sch);
+ netem_slot_account(q, skb, now);
+
+ pkt_len = qdisc_pkt_len(skb);
+ err = qdisc_enqueue(skb, q->qdisc, &to_free);
+ kfree_skb_list(to_free);
+ if (err != NET_XMIT_SUCCESS) {
+ if (net_xmit_drop_count(err))
+ qdisc_qstats_drop(sch);
+ sch->qstats.backlog -= pkt_len;
sch->q.qlen--;
- goto deliver;
+ qdisc_tree_reduce_backlog(sch, 1, pkt_len);
}
}
-
- qdisc_watchdog_schedule_ns(&q->watchdog,
- max(time_to_send,
- q->slot.slot_next));
}
- if (q->qdisc) {
- skb = q->qdisc->ops->dequeue(q->qdisc);
- if (skb) {
- sch->q.qlen--;
- goto deliver;
- }
+ skb = q->qdisc->ops->dequeue(q->qdisc);
+ if (skb)
+ sch->q.qlen--;
+
+ return skb;
+}
+
+/* Dequeue directly from the tfifo when no child qdisc is configured. */
+static struct sk_buff *netem_dequeue_direct(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+ u64 time_to_send;
+ u64 now;
+
+ skb = netem_peek(q);
+ if (!skb)
+ return NULL;
+
+ now = ktime_get_ns();
+ time_to_send = netem_skb_cb(skb)->time_to_send;
+
+ if (q->slot.slot_next && q->slot.slot_next < time_to_send)
+ get_slot_next(q, now);
+
+ if (time_to_send > now || q->slot.slot_next > now)
+ return NULL;
+
+ skb = netem_pull_tfifo(q, sch);
+ netem_slot_account(q, skb, now);
+ sch->q.qlen--;
+
+ return skb;
+}
+
+static struct sk_buff *netem_dequeue(struct Qdisc *sch)
+{
+ struct netem_sched_data *q = qdisc_priv(sch);
+ struct sk_buff *skb;
+
+ /* First check the reorder queue */
+ skb = __qdisc_dequeue_head(&sch->q);
+ if (skb)
+ goto deliver;
+
+ if (q->qdisc)
+ skb = netem_dequeue_child(sch);
+ else
+ skb = netem_dequeue_direct(sch);
+
+ if (skb)
+ goto deliver;
+
+ /* Nothing ready — schedule watchdog for next packet */
+ skb = netem_peek(q);
+ if (skb) {
+ u64 time_to_send = netem_skb_cb(skb)->time_to_send;
+
+ qdisc_watchdog_schedule_ns(&q->watchdog,
+ max(time_to_send, q->slot.slot_next));
}
return NULL;
+
+deliver:
+ qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_bstats_update(sch, skb);
+ return skb;
}
static void netem_reset(struct Qdisc *sch)
--
2.53.0
^ permalink raw reply related
* [PATTCH net v5 3/8] net/sched: netem: only reseed PRNG when seed is explicitly provided
From: Stephen Hemminger @ 2026-04-11 5:15 UTC (permalink / raw)
To: netdev
Cc: Stephen Hemminger, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
François Michel, open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>
netem_change() unconditionally reseeds the PRNG on every tc change
command. If TCA_NETEM_PRNG_SEED is not specified, a new random seed
is generated, destroying reproducibility for users who set a
deterministic seed on a previous change.
Move the initial random seed generation to netem_init() and only
reseed in netem_change() when TCA_NETEM_PRNG_SEED is explicitly
provided by the user.
Fixes: 4072d97ddc44 ("netem: add prng attribute to netem_sched_data")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
net/sched/sch_netem.c | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index d400a730eadd..556f9747f0e7 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -1112,11 +1112,10 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
/* capping jitter to the range acceptable by tabledist() */
q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
- if (tb[TCA_NETEM_PRNG_SEED])
+ if (tb[TCA_NETEM_PRNG_SEED]) {
q->prng.seed = nla_get_u64(tb[TCA_NETEM_PRNG_SEED]);
- else
- q->prng.seed = get_random_u64();
- prandom_seed_state(&q->prng.prng_state, q->prng.seed);
+ prandom_seed_state(&q->prng.prng_state, q->prng.seed);
+ }
unlock:
sch_tree_unlock(sch);
@@ -1139,6 +1138,9 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt,
return -EINVAL;
q->loss_model = CLG_RANDOM;
+ q->prng.seed = get_random_u64();
+ prandom_seed_state(&q->prng.prng_state, q->prng.seed);
+
ret = netem_change(sch, opt, extack);
if (ret)
pr_info("netem: change failed\n");
--
2.53.0
^ permalink raw reply related
* [PATTCH net v5 2/8] net/sched: netem: fix queue limit check to include reordered packets
From: Stephen Hemminger @ 2026-04-11 5:15 UTC (permalink / raw)
To: netdev
Cc: Stephen Hemminger, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>
The queue limit check in netem_enqueue() uses q->t_len which only
counts packets in the internal tfifo. Packets placed in sch->q by
the reorder path (__qdisc_enqueue_head) are not counted, allowing
the total queue occupancy to exceed sch->limit under reordering.
Include sch->q.qlen in the limit check.
Fixes: 50612537e9ab ("netem: fix classful handling")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
net/sched/sch_netem.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 8ee72cac1faf..d400a730eadd 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -524,7 +524,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1 << get_random_u32_below(8);
}
- if (unlikely(q->t_len >= sch->limit)) {
+ if (unlikely(sch->q.qlen >= sch->limit)) {
/* re-link segs, so that qdisc_drop_all() frees them all */
skb->next = segs;
qdisc_drop_all(skb, sch, to_free);
--
2.53.0
^ permalink raw reply related
* [PATTCH net v5 1/8] net/sched: netem: fix probability gaps in 4-state loss model
From: Stephen Hemminger @ 2026-04-11 5:15 UTC (permalink / raw)
To: netdev
Cc: Stephen Hemminger, Simon Horman, Jamal Hadi Salim, Jiri Pirko,
David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
open list
In-Reply-To: <20260411051700.311679-1-stephen@networkplumber.org>
The 4-state Markov chain in loss_4state() has gaps at the boundaries
between transition probability ranges. The comparisons use:
if (rnd < a4)
else if (a4 < rnd && rnd < a1 + a4)
When rnd equals a boundary value exactly, neither branch matches and
no state transition occurs. The redundant lower-bound check (a4 < rnd)
is already implied by being in the else branch.
Remove the unnecessary lower-bound comparisons so the ranges are
contiguous and every random value produces a transition, matching
the GI (General and Intuitive) loss model specification.
This bug goes back to original implementation of this model.
Fixes: 661b79725fea ("netem: revised correlated loss generator")
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
Reviewed-by: Simon Horman <horms@kernel.org>
---
net/sched/sch_netem.c | 8 ++++----
1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 20df1c08b1e9..8ee72cac1faf 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -227,10 +227,10 @@ static bool loss_4state(struct netem_sched_data *q)
if (rnd < clg->a4) {
clg->state = LOST_IN_GAP_PERIOD;
return true;
- } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
+ } else if (rnd < clg->a1 + clg->a4) {
clg->state = LOST_IN_BURST_PERIOD;
return true;
- } else if (clg->a1 + clg->a4 < rnd) {
+ } else {
clg->state = TX_IN_GAP_PERIOD;
}
@@ -247,9 +247,9 @@ static bool loss_4state(struct netem_sched_data *q)
case LOST_IN_BURST_PERIOD:
if (rnd < clg->a3)
clg->state = TX_IN_BURST_PERIOD;
- else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
+ else if (rnd < clg->a2 + clg->a3) {
clg->state = TX_IN_GAP_PERIOD;
- } else if (clg->a2 + clg->a3 < rnd) {
+ } else {
clg->state = LOST_IN_BURST_PERIOD;
return true;
}
--
2.53.0
^ permalink raw reply related
* [PATTCH net v5 0/8] net/sched: netem bug fixes
From: Stephen Hemminger @ 2026-04-11 5:15 UTC (permalink / raw)
To: netdev; +Cc: Stephen Hemminger
These bugs were identified while using AI-assisted code review of
sch_netem.c to analyze the packet duplication re-entrancy problem
(CVE-2025-37890, CVE-2025-38001), which are addressed in a separate
series.
The review uncovered several additional issues:
- probability gaps in the 4-state Markov loss model where
boundary values produce no state transition
- queue limit check not accounting for reordered packets
- PRNG reseeded on every tc change, breaking reproducibility
- the core dequeue re-entrancy issue with child qdiscs
causing HFSC eltree corruption and DRR class stalls
- missing NULL termination on the tfifo linear list tail
- slot delay configuration not validated for inverted ranges
- slot delay arithmetic overflow for ranges above ~2.1 seconds
v5 - fix slot dynamics in the dequeue change
v4 - split refactoring and fix for dequeue into two patches
Stephen Hemminger (8):
net/sched: netem: fix probability gaps in 4-state loss model
net/sched: netem: fix queue limit check to include reordered packets
net/sched: netem: only reseed PRNG when seed is explicitly provided
net/sched: netem: refactor dequeue into helper functions
net/sched: netem: batch-transfer ready packets to avoid child
re-entrancy
net/sched: netem: null-terminate tfifo linear queue tail
net/sched: netem: check for invalid slot range
net/sched: netem: fix slot delay calculation overflow
net/sched/sch_netem.c | 238 ++++++++++++++++++++++++++++--------------
1 file changed, 159 insertions(+), 79 deletions(-)
--
2.53.0
^ permalink raw reply
* Re: [PATCH net-next 4/5] net/sched: netem: add per-impairment extended statistics
From: Stephen Hemminger @ 2026-04-11 5:08 UTC (permalink / raw)
To: Paolo Abeni
Cc: netdev, Jamal Hadi Salim, Jiri Pirko, David S. Miller,
Eric Dumazet, Jakub Kicinski, Simon Horman, open list
In-Reply-To: <36e86b68-f7d9-4455-a0b5-613f717e670a@redhat.com>
On Thu, 9 Apr 2026 11:30:00 +0200
Paolo Abeni <pabeni@redhat.com> wrote:
> On 4/4/26 12:52 AM, Stephen Hemminger wrote:
> > diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> > index 66e8072f44df..fada10cb9b7b 100644
> > --- a/include/uapi/linux/pkt_sched.h
> > +++ b/include/uapi/linux/pkt_sched.h
> > @@ -569,6 +569,15 @@ struct tc_netem_gemodel {
> > #define NETEM_DIST_SCALE 8192
> > #define NETEM_DIST_MAX 16384
> >
> > +struct tc_netem_xstats {
> > + __u32 delayed; /* packets delayed */
> > + __u32 dropped; /* packets dropped by loss model */
> > + __u32 corrupted; /* packets with bit errors injected */
> > + __u32 duplicated; /* duplicate packets generated */
> > + __u32 reordered; /* packets sent out of order */
> > + __u32 ecn_marked; /* packets ECN CE-marked (not dropped)*/
> > +};
>
> Sashiko notes that the counters size will be set in stone by the uAPI,
> and u32 can wraparound very quickly (especially for unconditional delay).
>
> I see other qdiscs generally use __u32, but some have __u64 too, so I
> assume there are no architectural blocker to larger counter.
>
> Could you please move use __u64 above?
>
> Thanks,
>
> Paolo
>
Sure larger counters are fine, mostly just following the herd.
I assume don't need to need about 32 bit tearing when reading these?
^ permalink raw reply
* Re: [PATCH bpf-next v2 2/3] bpf: Use kmalloc_nolock() universally in local storage
From: Alexei Starovoitov @ 2026-04-11 4:39 UTC (permalink / raw)
To: bot+bpf-ci
Cc: Amery Hung, bpf, Network Development, Andrii Nakryiko,
Daniel Borkmann, Martin KaFai Lau, Kumar Kartikeya Dwivedi,
Kernel Team, Alexei Starovoitov, Eduard, Yonghong Song,
Chris Mason, Ihor Solodrai
In-Reply-To: <efdb35c2d8151489fb031ad63ce8bd6000b8bbaa2bf5a523c927907399c93ab0@mail.kernel.org>
On Fri, Apr 10, 2026 at 7:36 PM <bot+bpf-ci@kernel.org> wrote:
>
>
> This allows value sizes up to ~65KB. Before this patch, socket and
> inode storage used bpf_map_kzalloc() (backed by regular kmalloc)
> which could handle those large sizes. After this patch, any
> elem_size above KMALLOC_MAX_CACHE_SIZE will silently fail: the map
> creation succeeds via bpf_local_storage_map_alloc_check() but every
> element allocation returns NULL.
>
> Should BPF_LOCAL_STORAGE_MAX_VALUE_SIZE be updated to use
> KMALLOC_MAX_CACHE_SIZE instead of KMALLOC_MAX_SIZE now that all
> storage types go through kmalloc_nolock()?
>
> Slava Imameev raised the same concern for task storage in
> https://lore.kernel.org/bpf/20260410014341.47043-1-slava.imameev@crowdstrike.com/
Right. Let's update it, but I don't think it's a regression.
On a loaded system kmalloc_large() rarely succeeds for order 2+.
That's why kmalloc_nolock() doesn't attempt to bridge that gap.
One or two contiguous physical pages is the best one can expect.
In early bpf days we picked KMALLOC_MAX_SIZE assuming that
it's a realistic max for kmalloc().
It turned out to be wishful thinking.
kmalloc_large concept should really be removed.
It deceives users into thinking that it's usable.
^ permalink raw reply
* Re: [PATCH bpf-next v2 0/3] Use kmalloc_nolock() universally in BPF local storage
From: patchwork-bot+netdevbpf @ 2026-04-11 4:30 UTC (permalink / raw)
To: Amery Hung
Cc: bpf, netdev, alexei.starovoitov, andrii, daniel, martin.lau,
memxor, kernel-team
In-Reply-To: <20260411015419.114016-1-ameryhung@gmail.com>
Hello:
This series was applied to bpf/bpf-next.git (master)
by Alexei Starovoitov <ast@kernel.org>:
On Fri, 10 Apr 2026 18:54:15 -0700 you wrote:
> Socket local storage did not convert to use kmalloc_nolock() since there
> were observable performance degredation due to kfree_nolock() hitting the
> slow path and the lack of kfree_rcu()-like batching freeing. Now that
> these concern were addressed in slub, convert all remaining local storage
> flavors to use kmalloc_nolock().
>
> v1 -> v2:
> - Fix build (CI, Alexei)
>
> [...]
Here is the summary with links:
- [bpf-next,v2,1/3] selftests/bpf: Remove kmalloc tracing from local storage create bench
https://git.kernel.org/bpf/bpf-next/c/78ee02a966ad
- [bpf-next,v2,2/3] bpf: Use kmalloc_nolock() universally in local storage
https://git.kernel.org/bpf/bpf-next/c/5063e7758899
- [bpf-next,v2,3/3] bpf: Remove gfp_flags plumbing from bpf_local_storage_update()
https://git.kernel.org/bpf/bpf-next/c/136deea435dc
You are awesome, thank you!
--
Deet-doot-dot, I am a bot.
https://korg.docs.kernel.org/patchwork/pwbot.html
^ permalink raw reply
* [PATCH net-next] r8169: Use napi_schedule_irqoff()
From: Matt Vollrath @ 2026-04-11 4:16 UTC (permalink / raw)
To: netdev; +Cc: Matt Vollrath
napi_schedule() masks hard interrupts while doing its work, which is
redundant when called from an interrupt handler where hard interrupts
are already masked. Use napi_schedule_irqoff() instead to bypass this
redundant masking. This is an optimization.
Tested on a Lenovo RTL8168h/8111h.
Signed-off-by: Matt Vollrath <tactii@gmail.com>
---
drivers/net/ethernet/realtek/r8169_main.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 791277e750ba..4c0ad0de3410 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -4873,7 +4873,7 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance)
phy_mac_interrupt(tp->phydev);
rtl_irq_disable(tp);
- napi_schedule(&tp->napi);
+ napi_schedule_irqoff(&tp->napi);
out:
rtl_ack_events(tp, status);
--
2.43.0
^ permalink raw reply related
* Re: [PATCH net v4 1/2] flow_dissector: do not dissect PPPoE PFC frames
From: Qingfang Deng @ 2026-04-11 3:56 UTC (permalink / raw)
To: Simon Horman
Cc: linux-ppp, David S. Miller, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Guillaume Nault, Wojciech Drewek, Tony Nguyen,
linux-kernel, netdev, Paul Mackerras, Jaco Kroon, James Carlson,
Marcin Szycik
In-Reply-To: <20260410171056.GD469338@kernel.org>
Hi,
On 4/11/2026 1:10 AM, Simon Horman wrote:
> On Fri, Apr 10, 2026 at 11:36:20AM +0800, Qingfang Deng wrote:
>> @@ -1361,7 +1376,7 @@ bool __skb_flow_dissect(const struct net *net,
>> struct pppoe_hdr hdr;
>> __be16 proto;
>> } *hdr, _hdr;
>> - u16 ppp_proto;
>> + __be16 ppp_proto;
>
> I'm unclear of the relationship between changing the type of ppp_proto
> and the problem described in the patch description. And it
> is creating a log of churn in this patch. I suggest dropping it.
The intention is to restore the original behavior before the blamed
commit. If you find it too verbose for a fix, I can drop it and then
repost that part later to net-next.
>> @@ -1374,27 +1389,19 @@ bool __skb_flow_dissect(const struct net *net,
>> break;
>> }
>>
>> - /* least significant bit of the most significant octet
>> - * indicates if protocol field was compressed
>> - */
>> - ppp_proto = ntohs(hdr->proto);
>> - if (ppp_proto & 0x0100) {
>> - ppp_proto = ppp_proto >> 8;
>> - nhoff += PPPOE_SES_HLEN - 1;
>> - } else {
>> - nhoff += PPPOE_SES_HLEN;
>> - }
>
> Could we go for something like this?
>
> ppp_proto = ntohs(hdr->proto);
> nhoff += PPPOE_SES_HLEN;
>
> /* Explanation of what is going on */
> if (ppp_proto & 0x0100)
> ppp_proto = some invalid value like 0
>
I think it is redundant. ppp_proto_is_valid() already requires
uncompressed frames.
>> + ppp_proto = hdr->proto;
>> + nhoff += PPPOE_SES_HLEN;
>>
>> - if (ppp_proto == PPP_IP) {
>> + if (ppp_proto == htons(PPP_IP)) {
>> proto = htons(ETH_P_IP);
>> fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
>> - } else if (ppp_proto == PPP_IPV6) {
>> + } else if (ppp_proto == htons(PPP_IPV6)) {
>> proto = htons(ETH_P_IPV6);
>> fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
>> - } else if (ppp_proto == PPP_MPLS_UC) {
>> + } else if (ppp_proto == htons(PPP_MPLS_UC)) {
>> proto = htons(ETH_P_MPLS_UC);
>> fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
>> - } else if (ppp_proto == PPP_MPLS_MC) {
>> + } else if (ppp_proto == htons(PPP_MPLS_MC)) {
>> proto = htons(ETH_P_MPLS_MC);
>> fdret = FLOW_DISSECT_RET_PROTO_AGAIN;
>> } else if (ppp_proto_is_valid(ppp_proto)) {
^ permalink raw reply
* [PATCH net-next v05 6/6] hinic3: Remove unneeded coalesce parameters
From: Fan Gong @ 2026-04-11 3:37 UTC (permalink / raw)
To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
Ioana Ciornei, Mohsin Bashir
Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775711066.git.zhuyikai1@h-partners.com>
Remove unneeded coalesce parameters in irq handling.
Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
drivers/net/ethernet/huawei/hinic3/hinic3_irq.c | 6 +-----
drivers/net/ethernet/huawei/hinic3/hinic3_rx.h | 3 ---
2 files changed, 1 insertion(+), 8 deletions(-)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
index d3b3927b5408..42464c007174 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
@@ -156,13 +156,9 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
err = hinic3_set_interrupt_cfg(nic_dev->hwdev, info);
- if (err) {
+ if (err)
netdev_err(netdev,
"Failed to modify moderation for Queue: %u\n", q_id);
- } else {
- nic_dev->rxqs[q_id].last_coalesc_timer_cfg = coalesc_timer_cfg;
- nic_dev->rxqs[q_id].last_pending_limit = pending_limit;
- }
return err;
}
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
index c11d080408a7..2ab691ed11a9 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
@@ -111,9 +111,6 @@ struct hinic3_rxq {
dma_addr_t cqe_start_paddr;
struct dim dim;
-
- u8 last_coalesc_timer_cfg;
- u8 last_pending_limit;
} ____cacheline_aligned;
struct hinic3_dyna_rxq_res {
--
2.43.0
^ permalink raw reply related
* [PATCH net-next v05 4/6] hinic3: Add ethtool rss ops
From: Fan Gong @ 2026-04-11 3:37 UTC (permalink / raw)
To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
Ioana Ciornei, Mohsin Bashir
Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775711066.git.zhuyikai1@h-partners.com>
Implement following ethtool callback function:
.get_rxnfc
.set_rxnfc
.get_channels
.set_channels
.get_rxfh_indir_size
.get_rxfh_key_size
.get_rxfh
.set_rxfh
These callbacks allow users to utilize ethtool for detailed
RSS parameters configuration and monitoring.
Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
.../ethernet/huawei/hinic3/hinic3_ethtool.c | 9 +
.../huawei/hinic3/hinic3_mgmt_interface.h | 2 +
| 487 +++++++++++++++++-
| 19 +
4 files changed, 515 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
index f0fb9a30840b..69663ee70cbd 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
@@ -15,6 +15,7 @@
#include "hinic3_hw_comm.h"
#include "hinic3_nic_dev.h"
#include "hinic3_nic_cfg.h"
+#include "hinic3_rss.h"
#define HINIC3_MGMT_VERSION_MAX_LEN 32
/* Coalesce time properties in microseconds */
@@ -1231,6 +1232,14 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
.get_pause_stats = hinic3_get_pause_stats,
.get_coalesce = hinic3_get_coalesce,
.set_coalesce = hinic3_set_coalesce,
+ .get_rxnfc = hinic3_get_rxnfc,
+ .set_rxnfc = hinic3_set_rxnfc,
+ .get_channels = hinic3_get_channels,
+ .set_channels = hinic3_set_channels,
+ .get_rxfh_indir_size = hinic3_get_rxfh_indir_size,
+ .get_rxfh_key_size = hinic3_get_rxfh_key_size,
+ .get_rxfh = hinic3_get_rxfh,
+ .set_rxfh = hinic3_set_rxfh,
};
void hinic3_set_ethtool_ops(struct net_device *netdev)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h b/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
index 76c691f82703..3c1263ff99ff 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
@@ -282,6 +282,7 @@ enum l2nic_cmd {
L2NIC_CMD_SET_VLAN_FILTER_EN = 26,
L2NIC_CMD_SET_RX_VLAN_OFFLOAD = 27,
L2NIC_CMD_CFG_RSS = 60,
+ L2NIC_CMD_GET_RSS_CTX_TBL = 62,
L2NIC_CMD_CFG_RSS_HASH_KEY = 63,
L2NIC_CMD_CFG_RSS_HASH_ENGINE = 64,
L2NIC_CMD_SET_RSS_CTX_TBL = 65,
@@ -301,6 +302,7 @@ enum l2nic_ucode_cmd {
L2NIC_UCODE_CMD_MODIFY_QUEUE_CTX = 0,
L2NIC_UCODE_CMD_CLEAN_QUEUE_CTX = 1,
L2NIC_UCODE_CMD_SET_RSS_INDIR_TBL = 4,
+ L2NIC_UCODE_CMD_GET_RSS_INDIR_TBL = 6,
};
/* hilink mac group command */
--git a/drivers/net/ethernet/huawei/hinic3/hinic3_rss.c b/drivers/net/ethernet/huawei/hinic3/hinic3_rss.c
index 25db74d8c7dd..b40d5fa885c2 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rss.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rss.c
@@ -155,7 +155,7 @@ static int hinic3_set_rss_type(struct hinic3_hwdev *hwdev,
L2NIC_CMD_SET_RSS_CTX_TBL, &msg_params);
if (ctx_tbl.msg_head.status == MGMT_STATUS_CMD_UNSUPPORTED) {
- return MGMT_STATUS_CMD_UNSUPPORTED;
+ return -EOPNOTSUPP;
} else if (err || ctx_tbl.msg_head.status) {
dev_err(hwdev->dev, "mgmt Failed to set rss context offload, err: %d, status: 0x%x\n",
err, ctx_tbl.msg_head.status);
@@ -165,6 +165,39 @@ static int hinic3_set_rss_type(struct hinic3_hwdev *hwdev,
return 0;
}
+static int hinic3_get_rss_type(struct hinic3_hwdev *hwdev,
+ struct hinic3_rss_type *rss_type)
+{
+ struct l2nic_cmd_rss_ctx_tbl ctx_tbl = {};
+ struct mgmt_msg_params msg_params = {};
+ int err;
+
+ ctx_tbl.func_id = hinic3_global_func_id(hwdev);
+
+ mgmt_msg_params_init_default(&msg_params, &ctx_tbl, sizeof(ctx_tbl));
+
+ err = hinic3_send_mbox_to_mgmt(hwdev, MGMT_MOD_L2NIC,
+ L2NIC_CMD_GET_RSS_CTX_TBL,
+ &msg_params);
+ if (err || ctx_tbl.msg_head.status) {
+ dev_err(hwdev->dev, "Failed to get hash type, err: %d, status: 0x%x\n",
+ err, ctx_tbl.msg_head.status);
+ return -EINVAL;
+ }
+
+ rss_type->ipv4 = L2NIC_RSS_TYPE_GET(ctx_tbl.context, IPV4);
+ rss_type->ipv6 = L2NIC_RSS_TYPE_GET(ctx_tbl.context, IPV6);
+ rss_type->ipv6_ext = L2NIC_RSS_TYPE_GET(ctx_tbl.context, IPV6_EXT);
+ rss_type->tcp_ipv4 = L2NIC_RSS_TYPE_GET(ctx_tbl.context, TCP_IPV4);
+ rss_type->tcp_ipv6 = L2NIC_RSS_TYPE_GET(ctx_tbl.context, TCP_IPV6);
+ rss_type->tcp_ipv6_ext = L2NIC_RSS_TYPE_GET(ctx_tbl.context,
+ TCP_IPV6_EXT);
+ rss_type->udp_ipv4 = L2NIC_RSS_TYPE_GET(ctx_tbl.context, UDP_IPV4);
+ rss_type->udp_ipv6 = L2NIC_RSS_TYPE_GET(ctx_tbl.context, UDP_IPV6);
+
+ return 0;
+}
+
static int hinic3_rss_cfg_hash_type(struct hinic3_hwdev *hwdev, u8 opcode,
enum hinic3_rss_hash_type *type)
{
@@ -264,7 +297,8 @@ static int hinic3_set_hw_rss_parameters(struct net_device *netdev, u8 rss_en)
if (err)
return err;
- hinic3_fillout_indir_tbl(netdev, nic_dev->rss_indir);
+ if (!netif_is_rxfh_configured(netdev))
+ hinic3_fillout_indir_tbl(netdev, nic_dev->rss_indir);
err = hinic3_config_rss_hw_resource(netdev, nic_dev->rss_indir);
if (err)
@@ -334,3 +368,452 @@ void hinic3_try_to_enable_rss(struct net_device *netdev)
clear_bit(HINIC3_RSS_ENABLE, &nic_dev->flags);
nic_dev->q_params.num_qps = nic_dev->max_qps;
}
+
+static int hinic3_set_l4_rss_hash_ops(const struct ethtool_rxnfc *cmd,
+ struct hinic3_rss_type *rss_type)
+{
+ u8 rss_l4_en;
+
+ switch (cmd->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
+ case 0:
+ rss_l4_en = 0;
+ break;
+ case (RXH_L4_B_0_1 | RXH_L4_B_2_3):
+ rss_l4_en = 1;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ switch (cmd->flow_type) {
+ case TCP_V4_FLOW:
+ rss_type->tcp_ipv4 = rss_l4_en;
+ break;
+ case TCP_V6_FLOW:
+ rss_type->tcp_ipv6 = rss_l4_en;
+ break;
+ case UDP_V4_FLOW:
+ rss_type->udp_ipv4 = rss_l4_en;
+ break;
+ case UDP_V6_FLOW:
+ rss_type->udp_ipv6 = rss_l4_en;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int hinic3_update_rss_hash_opts(struct net_device *netdev,
+ struct ethtool_rxnfc *cmd,
+ struct hinic3_rss_type *rss_type)
+{
+ int err;
+
+ switch (cmd->flow_type) {
+ case TCP_V4_FLOW:
+ case TCP_V6_FLOW:
+ case UDP_V4_FLOW:
+ case UDP_V6_FLOW:
+ err = hinic3_set_l4_rss_hash_ops(cmd, rss_type);
+ if (err)
+ return err;
+
+ break;
+ case IPV4_FLOW:
+ rss_type->ipv4 = 1;
+ break;
+ case IPV6_FLOW:
+ rss_type->ipv6 = 1;
+ break;
+ default:
+ netdev_err(netdev, "Unsupported flow type\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int hinic3_set_rss_hash_opts(struct net_device *netdev,
+ struct ethtool_rxnfc *cmd)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct hinic3_rss_type rss_type;
+ int err;
+
+ if (!test_bit(HINIC3_RSS_ENABLE, &nic_dev->flags)) {
+ cmd->data = 0;
+ netdev_err(netdev, "RSS is disable, not support to set flow-hash\n");
+ return -EOPNOTSUPP;
+ }
+
+ /* RSS only supports hashing of IP addresses and L4 ports */
+ if (cmd->data & ~(RXH_IP_SRC | RXH_IP_DST |
+ RXH_L4_B_0_1 | RXH_L4_B_2_3))
+ return -EINVAL;
+
+ /* Both IP addresses must be part of the hash tuple */
+ if (!(cmd->data & RXH_IP_SRC) || !(cmd->data & RXH_IP_DST))
+ return -EINVAL;
+
+ err = hinic3_get_rss_type(nic_dev->hwdev, &rss_type);
+ if (err) {
+ netdev_err(netdev, "Failed to get rss type\n");
+ return err;
+ }
+
+ err = hinic3_update_rss_hash_opts(netdev, cmd, &rss_type);
+ if (err)
+ return err;
+
+ err = hinic3_set_rss_type(nic_dev->hwdev, rss_type);
+ if (err) {
+ netdev_err(netdev, "Failed to set rss type\n");
+ return err;
+ }
+
+ nic_dev->rss_type = rss_type;
+
+ return 0;
+}
+
+static void convert_rss_type(u8 rss_opt, struct ethtool_rxnfc *cmd)
+{
+ if (rss_opt)
+ cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+}
+
+static int hinic3_convert_rss_type(struct net_device *netdev,
+ struct hinic3_rss_type *rss_type,
+ struct ethtool_rxnfc *cmd)
+{
+ cmd->data = RXH_IP_SRC | RXH_IP_DST;
+ switch (cmd->flow_type) {
+ case TCP_V4_FLOW:
+ convert_rss_type(rss_type->tcp_ipv4, cmd);
+ break;
+ case TCP_V6_FLOW:
+ convert_rss_type(rss_type->tcp_ipv6, cmd);
+ break;
+ case UDP_V4_FLOW:
+ convert_rss_type(rss_type->udp_ipv4, cmd);
+ break;
+ case UDP_V6_FLOW:
+ convert_rss_type(rss_type->udp_ipv6, cmd);
+ break;
+ case IPV4_FLOW:
+ case IPV6_FLOW:
+ break;
+ default:
+ netdev_err(netdev, "Unsupported flow type\n");
+ cmd->data = 0;
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int hinic3_get_rss_hash_opts(struct net_device *netdev,
+ struct ethtool_rxnfc *cmd)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct hinic3_rss_type rss_type;
+ int err;
+
+ cmd->data = 0;
+
+ if (!test_bit(HINIC3_RSS_ENABLE, &nic_dev->flags))
+ return 0;
+
+ err = hinic3_get_rss_type(nic_dev->hwdev, &rss_type);
+ if (err) {
+ netdev_err(netdev, "Failed to get rss type\n");
+ return err;
+ }
+
+ return hinic3_convert_rss_type(netdev, &rss_type, cmd);
+}
+
+int hinic3_get_rxnfc(struct net_device *netdev,
+ struct ethtool_rxnfc *cmd, u32 *rule_locs)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ int err = 0;
+
+ switch (cmd->cmd) {
+ case ETHTOOL_GRXRINGS:
+ cmd->data = nic_dev->q_params.num_qps;
+ break;
+ case ETHTOOL_GRXFH:
+ err = hinic3_get_rss_hash_opts(netdev, cmd);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
+int hinic3_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd)
+{
+ int err;
+
+ switch (cmd->cmd) {
+ case ETHTOOL_SRXFH:
+ err = hinic3_set_rss_hash_opts(netdev, cmd);
+ break;
+ default:
+ err = -EOPNOTSUPP;
+ break;
+ }
+
+ return err;
+}
+
+static u16 hinic3_max_channels(struct net_device *netdev)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ u8 tcs = netdev_get_num_tc(netdev);
+
+ return tcs ? nic_dev->max_qps / tcs : nic_dev->max_qps;
+}
+
+static u16 hinic3_curr_channels(struct net_device *netdev)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+
+ if (netif_running(netdev))
+ return nic_dev->q_params.num_qps ?
+ nic_dev->q_params.num_qps : 1;
+ else
+ return min_t(u16, hinic3_max_channels(netdev),
+ nic_dev->q_params.num_qps);
+}
+
+void hinic3_get_channels(struct net_device *netdev,
+ struct ethtool_channels *channels)
+{
+ channels->max_rx = 0;
+ channels->max_tx = 0;
+ channels->max_other = 0;
+ /* report maximum channels */
+ channels->max_combined = hinic3_max_channels(netdev);
+ channels->rx_count = 0;
+ channels->tx_count = 0;
+ channels->other_count = 0;
+ /* report flow director queues as maximum channels */
+ channels->combined_count = hinic3_curr_channels(netdev);
+}
+
+static int
+hinic3_validate_channel_parameter(struct net_device *netdev,
+ const struct ethtool_channels *channels)
+{
+ u16 max_channel = hinic3_max_channels(netdev);
+ unsigned int count = channels->combined_count;
+
+ if (!count) {
+ netdev_err(netdev, "Unsupported combined_count=0\n");
+ return -EINVAL;
+ }
+
+ if (channels->tx_count || channels->rx_count || channels->other_count) {
+ netdev_err(netdev, "Setting rx/tx/other count not supported\n");
+ return -EINVAL;
+ }
+
+ if (count > max_channel) {
+ netdev_err(netdev, "Combined count %u exceed limit %u\n", count,
+ max_channel);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+int hinic3_set_channels(struct net_device *netdev,
+ struct ethtool_channels *channels)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ unsigned int count = channels->combined_count;
+ struct hinic3_dyna_txrxq_params q_params;
+ int err;
+
+ if (hinic3_validate_channel_parameter(netdev, channels))
+ return -EINVAL;
+
+ if (!test_bit(HINIC3_RSS_ENABLE, &nic_dev->flags)) {
+ netdev_err(netdev, "This function doesn't support RSS, only support 1 queue pair\n");
+ return -EOPNOTSUPP;
+ }
+
+ netdev_dbg(netdev, "Set max combined queue number from %u to %u\n",
+ nic_dev->q_params.num_qps, count);
+
+ if (netif_running(netdev)) {
+ q_params = nic_dev->q_params;
+ q_params.num_qps = (u16)count;
+ q_params.txqs_res = NULL;
+ q_params.rxqs_res = NULL;
+ q_params.irq_cfg = NULL;
+
+ err = hinic3_change_channel_settings(netdev, &q_params);
+ if (err) {
+ netdev_err(netdev, "Failed to change channel settings\n");
+ return err;
+ }
+ } else {
+ nic_dev->q_params.num_qps = (u16)count;
+ }
+
+ return 0;
+}
+
+u32 hinic3_get_rxfh_indir_size(struct net_device *netdev)
+{
+ return L2NIC_RSS_INDIR_SIZE;
+}
+
+static int hinic3_set_rss_rxfh(struct net_device *netdev,
+ const u32 *indir, u8 *key)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ int err;
+ u32 i;
+
+ if (indir) {
+ for (i = 0; i < L2NIC_RSS_INDIR_SIZE; i++)
+ nic_dev->rss_indir[i] = (u16)indir[i];
+
+ err = hinic3_rss_set_indir_tbl(nic_dev->hwdev,
+ nic_dev->rss_indir);
+ if (err) {
+ netdev_err(netdev, "Failed to set rss indir table\n");
+ return err;
+ }
+ }
+
+ if (key) {
+ err = hinic3_rss_set_hash_key(nic_dev->hwdev, key);
+ if (err) {
+ netdev_err(netdev, "Failed to set rss key\n");
+ return err;
+ }
+
+ memcpy(nic_dev->rss_hkey, key, L2NIC_RSS_KEY_SIZE);
+ }
+
+ return 0;
+}
+
+u32 hinic3_get_rxfh_key_size(struct net_device *netdev)
+{
+ return L2NIC_RSS_KEY_SIZE;
+}
+
+static int hinic3_rss_get_indir_tbl(struct hinic3_hwdev *hwdev,
+ u32 *indir_table)
+{
+ struct hinic3_cmd_buf_pair pair;
+ __le16 *indir_tbl = NULL;
+ int err, i;
+
+ err = hinic3_cmd_buf_pair_init(hwdev, &pair);
+ if (err) {
+ dev_err(hwdev->dev, "Failed to allocate cmd_buf.\n");
+ return err;
+ }
+
+ err = hinic3_cmdq_detail_resp(hwdev, MGMT_MOD_L2NIC,
+ L2NIC_UCODE_CMD_GET_RSS_INDIR_TBL,
+ pair.in, pair.out, NULL);
+ if (err) {
+ dev_err(hwdev->dev, "Failed to get rss indir table\n");
+ goto err_get_indir_tbl;
+ }
+
+ indir_tbl = (__le16 *)pair.out->buf;
+ for (i = 0; i < L2NIC_RSS_INDIR_SIZE; i++)
+ indir_table[i] = le16_to_cpu(*(indir_tbl + i));
+
+err_get_indir_tbl:
+ hinic3_cmd_buf_pair_uninit(hwdev, &pair);
+
+ return err;
+}
+
+int hinic3_get_rxfh(struct net_device *netdev,
+ struct ethtool_rxfh_param *rxfh)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ int err = 0;
+
+ if (!test_bit(HINIC3_RSS_ENABLE, &nic_dev->flags)) {
+ netdev_err(netdev, "Rss is disabled\n");
+ return -EOPNOTSUPP;
+ }
+
+ rxfh->hfunc =
+ nic_dev->rss_hash_type == HINIC3_RSS_HASH_ENGINE_TYPE_XOR ?
+ ETH_RSS_HASH_XOR : ETH_RSS_HASH_TOP;
+
+ if (rxfh->indir) {
+ err = hinic3_rss_get_indir_tbl(nic_dev->hwdev, rxfh->indir);
+ if (err)
+ return err;
+ }
+
+ if (rxfh->key)
+ memcpy(rxfh->key, nic_dev->rss_hkey, L2NIC_RSS_KEY_SIZE);
+
+ return err;
+}
+
+static int hinic3_update_hash_func_type(struct net_device *netdev, u8 hfunc)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ enum hinic3_rss_hash_type new_rss_hash_type;
+
+ switch (hfunc) {
+ case ETH_RSS_HASH_NO_CHANGE:
+ return 0;
+ case ETH_RSS_HASH_XOR:
+ new_rss_hash_type = HINIC3_RSS_HASH_ENGINE_TYPE_XOR;
+ break;
+ case ETH_RSS_HASH_TOP:
+ new_rss_hash_type = HINIC3_RSS_HASH_ENGINE_TYPE_TOEP;
+ break;
+ default:
+ netdev_err(netdev, "Unsupported hash func %u\n", hfunc);
+ return -EOPNOTSUPP;
+ }
+
+ if (new_rss_hash_type == nic_dev->rss_hash_type)
+ return 0;
+
+ nic_dev->rss_hash_type = new_rss_hash_type;
+ return hinic3_rss_set_hash_type(nic_dev->hwdev, nic_dev->rss_hash_type);
+}
+
+int hinic3_set_rxfh(struct net_device *netdev,
+ struct ethtool_rxfh_param *rxfh,
+ struct netlink_ext_ack *extack)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ int err;
+
+ if (!test_bit(HINIC3_RSS_ENABLE, &nic_dev->flags)) {
+ netdev_err(netdev, "Not support to set rss parameters when rss is disable\n");
+ return -EOPNOTSUPP;
+ }
+
+ err = hinic3_update_hash_func_type(netdev, rxfh->hfunc);
+ if (err)
+ return err;
+
+ err = hinic3_set_rss_rxfh(netdev, rxfh->indir, rxfh->key);
+
+ return err;
+}
--git a/drivers/net/ethernet/huawei/hinic3/hinic3_rss.h b/drivers/net/ethernet/huawei/hinic3/hinic3_rss.h
index 78d82c2aca06..9f1b77780cd4 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rss.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rss.h
@@ -5,10 +5,29 @@
#define _HINIC3_RSS_H_
#include <linux/netdevice.h>
+#include <linux/ethtool.h>
int hinic3_rss_init(struct net_device *netdev);
void hinic3_rss_uninit(struct net_device *netdev);
void hinic3_try_to_enable_rss(struct net_device *netdev);
void hinic3_clear_rss_config(struct net_device *netdev);
+int hinic3_get_rxnfc(struct net_device *netdev,
+ struct ethtool_rxnfc *cmd, u32 *rule_locs);
+int hinic3_set_rxnfc(struct net_device *netdev, struct ethtool_rxnfc *cmd);
+
+void hinic3_get_channels(struct net_device *netdev,
+ struct ethtool_channels *channels);
+int hinic3_set_channels(struct net_device *netdev,
+ struct ethtool_channels *channels);
+
+u32 hinic3_get_rxfh_indir_size(struct net_device *netdev);
+u32 hinic3_get_rxfh_key_size(struct net_device *netdev);
+
+int hinic3_get_rxfh(struct net_device *netdev,
+ struct ethtool_rxfh_param *rxfh);
+int hinic3_set_rxfh(struct net_device *netdev,
+ struct ethtool_rxfh_param *rxfh,
+ struct netlink_ext_ack *extack);
+
#endif
--
2.43.0
^ permalink raw reply related
* [PATCH net-next v05 5/6] hinic3: Configure netdev->watchdog_timeo to set nic tx timeout
From: Fan Gong @ 2026-04-11 3:37 UTC (permalink / raw)
To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
Ioana Ciornei, Mohsin Bashir
Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775711066.git.zhuyikai1@h-partners.com>
Configure netdev watchdog timeout to improve transmission reliability.
Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
drivers/net/ethernet/huawei/hinic3/hinic3_main.c | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
index 3b470978714a..7e09b4b2da9f 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
@@ -33,6 +33,8 @@
#define HINIC3_RX_PENDING_LIMIT_LOW 2
#define HINIC3_RX_PENDING_LIMIT_HIGH 8
+#define HINIC3_WATCHDOG_TIMEOUT 5
+
static void init_intr_coal_param(struct net_device *netdev)
{
struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
@@ -246,6 +248,8 @@ static void hinic3_assign_netdev_ops(struct net_device *netdev)
{
hinic3_set_netdev_ops(netdev);
hinic3_set_ethtool_ops(netdev);
+
+ netdev->watchdog_timeo = HINIC3_WATCHDOG_TIMEOUT * HZ;
}
static void netdev_feature_init(struct net_device *netdev)
--
2.43.0
^ permalink raw reply related
* [PATCH net-next v05 3/6] hinic3: Add ethtool coalesce ops
From: Fan Gong @ 2026-04-11 3:37 UTC (permalink / raw)
To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
Ioana Ciornei, Mohsin Bashir
Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775711066.git.zhuyikai1@h-partners.com>
Implement following ethtool callback function:
.get_coalesce
.set_coalesce
These callbacks allow users to utilize ethtool for detailed
RX coalesce configuration and monitoring.
Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
.../ethernet/huawei/hinic3/hinic3_ethtool.c | 232 +++++++++++++++++-
1 file changed, 230 insertions(+), 2 deletions(-)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
index be26698fc658..f0fb9a30840b 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
@@ -17,6 +17,11 @@
#include "hinic3_nic_cfg.h"
#define HINIC3_MGMT_VERSION_MAX_LEN 32
+/* Coalesce time properties in microseconds */
+#define COALESCE_PENDING_LIMIT_UNIT 8
+#define COALESCE_TIMER_CFG_UNIT 5
+#define COALESCE_MAX_PENDING_LIMIT (255 * COALESCE_PENDING_LIMIT_UNIT)
+#define COALESCE_MAX_TIMER_CFG (255 * COALESCE_TIMER_CFG_UNIT)
static void hinic3_get_drvinfo(struct net_device *netdev,
struct ethtool_drvinfo *info)
@@ -985,9 +990,230 @@ static void hinic3_get_pause_stats(struct net_device *netdev,
kfree(ps);
}
+static int hinic3_set_queue_coalesce(struct net_device *netdev, u16 q_id,
+ struct hinic3_intr_coal_info *coal)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct hinic3_intr_coal_info *intr_coal;
+ struct hinic3_interrupt_info info = {};
+ int err;
+
+ intr_coal = &nic_dev->intr_coalesce[q_id];
+
+ intr_coal->coalesce_timer_cfg = coal->coalesce_timer_cfg;
+ intr_coal->pending_limit = coal->pending_limit;
+ intr_coal->rx_pending_limit_low = coal->rx_pending_limit_low;
+ intr_coal->rx_pending_limit_high = coal->rx_pending_limit_high;
+
+ if (!test_bit(HINIC3_INTF_UP, &nic_dev->flags) ||
+ q_id >= nic_dev->q_params.num_qps || nic_dev->adaptive_rx_coal)
+ return 0;
+
+ info.msix_index = nic_dev->q_params.irq_cfg[q_id].msix_entry_idx;
+ info.interrupt_coalesc_set = 1;
+ info.coalesc_timer_cfg = intr_coal->coalesce_timer_cfg;
+ info.pending_limit = intr_coal->pending_limit;
+ info.resend_timer_cfg = intr_coal->resend_timer_cfg;
+ err = hinic3_set_interrupt_cfg(nic_dev->hwdev, info);
+ if (err) {
+ netdev_warn(netdev, "Failed to set queue%u coalesce\n", q_id);
+ return err;
+ }
+
+ return 0;
+}
+
+static int is_coalesce_exceed_limit(struct net_device *netdev,
+ const struct ethtool_coalesce *coal)
+{
+ const struct {
+ const char *name;
+ u32 value;
+ u32 limit;
+ } coalesce_limits[] = {
+ {"rx_coalesce_usecs",
+ coal->rx_coalesce_usecs,
+ COALESCE_MAX_TIMER_CFG},
+ {"rx_max_coalesced_frames",
+ coal->rx_max_coalesced_frames,
+ COALESCE_MAX_PENDING_LIMIT},
+ {"rx_max_coalesced_frames_low",
+ coal->rx_max_coalesced_frames_low,
+ COALESCE_MAX_PENDING_LIMIT},
+ {"rx_max_coalesced_frames_high",
+ coal->rx_max_coalesced_frames_high,
+ COALESCE_MAX_PENDING_LIMIT},
+ };
+
+ for (int i = 0; i < ARRAY_SIZE(coalesce_limits); i++) {
+ if (coalesce_limits[i].value > coalesce_limits[i].limit) {
+ netdev_err(netdev, "%s out of range %d-%d\n",
+ coalesce_limits[i].name, 0,
+ coalesce_limits[i].limit);
+ return -ERANGE;
+ }
+ }
+ return 0;
+}
+
+static int is_coalesce_legal(struct net_device *netdev,
+ const struct ethtool_coalesce *coal)
+{
+ int err;
+
+ err = is_coalesce_exceed_limit(netdev, coal);
+ if (err)
+ return err;
+
+ if (coal->rx_max_coalesced_frames_low >=
+ coal->rx_max_coalesced_frames_high) {
+ netdev_err(netdev, "invalid coalesce frame high %u, low %u, unit %d\n",
+ coal->rx_max_coalesced_frames_high,
+ coal->rx_max_coalesced_frames_low,
+ COALESCE_PENDING_LIMIT_UNIT);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void check_coalesce_align(struct net_device *netdev,
+ u32 item, u32 unit, const char *str)
+{
+ if (item % unit)
+ netdev_warn(netdev, "%s in %d units, change to %u\n",
+ str, unit, item - item % unit);
+}
+
+#define CHECK_COALESCE_ALIGN(member, unit) \
+ check_coalesce_align(netdev, member, unit, #member)
+
+static void check_coalesce_changed(struct net_device *netdev,
+ u32 item, u32 unit, u32 ori_val,
+ const char *obj_str, const char *str)
+{
+ if ((item / unit) != ori_val)
+ netdev_dbg(netdev, "Change %s from %d to %u %s\n",
+ str, ori_val * unit, item - item % unit, obj_str);
+}
+
+#define CHECK_COALESCE_CHANGED(member, unit, ori_val, obj_str) \
+ check_coalesce_changed(netdev, member, unit, ori_val, obj_str, #member)
+
+static int hinic3_set_hw_coal_param(struct net_device *netdev,
+ struct hinic3_intr_coal_info *intr_coal)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ int err;
+ u16 i;
+
+ for (i = 0; i < nic_dev->max_qps; i++) {
+ err = hinic3_set_queue_coalesce(netdev, i, intr_coal);
+ if (err)
+ return err;
+ }
+
+ return 0;
+}
+
+static int hinic3_get_coalesce(struct net_device *netdev,
+ struct ethtool_coalesce *coal,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct hinic3_intr_coal_info *interrupt_info;
+
+ interrupt_info = &nic_dev->intr_coalesce[0];
+
+ /* TX/RX uses the same interrupt.
+ * So we only declare RX ethtool_coalesce parameters.
+ */
+ coal->rx_coalesce_usecs = interrupt_info->coalesce_timer_cfg *
+ COALESCE_TIMER_CFG_UNIT;
+ coal->rx_max_coalesced_frames = interrupt_info->pending_limit *
+ COALESCE_PENDING_LIMIT_UNIT;
+
+ coal->use_adaptive_rx_coalesce = nic_dev->adaptive_rx_coal;
+
+ coal->rx_max_coalesced_frames_high =
+ interrupt_info->rx_pending_limit_high *
+ COALESCE_PENDING_LIMIT_UNIT;
+
+ coal->rx_max_coalesced_frames_low =
+ interrupt_info->rx_pending_limit_low *
+ COALESCE_PENDING_LIMIT_UNIT;
+
+ return 0;
+}
+
+static int hinic3_set_coalesce(struct net_device *netdev,
+ struct ethtool_coalesce *coal,
+ struct kernel_ethtool_coalesce *kernel_coal,
+ struct netlink_ext_ack *extack)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct hinic3_intr_coal_info *ori_intr_coal;
+ struct hinic3_intr_coal_info intr_coal = {};
+ char obj_str[32];
+ int err;
+
+ err = is_coalesce_legal(netdev, coal);
+ if (err)
+ return err;
+
+ CHECK_COALESCE_ALIGN(coal->rx_coalesce_usecs, COALESCE_TIMER_CFG_UNIT);
+ CHECK_COALESCE_ALIGN(coal->rx_max_coalesced_frames,
+ COALESCE_PENDING_LIMIT_UNIT);
+ CHECK_COALESCE_ALIGN(coal->rx_max_coalesced_frames_high,
+ COALESCE_PENDING_LIMIT_UNIT);
+ CHECK_COALESCE_ALIGN(coal->rx_max_coalesced_frames_low,
+ COALESCE_PENDING_LIMIT_UNIT);
+
+ ori_intr_coal = &nic_dev->intr_coalesce[0];
+ snprintf(obj_str, sizeof(obj_str), "for netdev");
+
+ CHECK_COALESCE_CHANGED(coal->rx_coalesce_usecs, COALESCE_TIMER_CFG_UNIT,
+ ori_intr_coal->coalesce_timer_cfg, obj_str);
+ CHECK_COALESCE_CHANGED(coal->rx_max_coalesced_frames,
+ COALESCE_PENDING_LIMIT_UNIT,
+ ori_intr_coal->pending_limit, obj_str);
+ CHECK_COALESCE_CHANGED(coal->rx_max_coalesced_frames_high,
+ COALESCE_PENDING_LIMIT_UNIT,
+ ori_intr_coal->rx_pending_limit_high, obj_str);
+ CHECK_COALESCE_CHANGED(coal->rx_max_coalesced_frames_low,
+ COALESCE_PENDING_LIMIT_UNIT,
+ ori_intr_coal->rx_pending_limit_low, obj_str);
+
+ intr_coal.coalesce_timer_cfg =
+ (u8)(coal->rx_coalesce_usecs / COALESCE_TIMER_CFG_UNIT);
+ intr_coal.pending_limit = (u8)(coal->rx_max_coalesced_frames /
+ COALESCE_PENDING_LIMIT_UNIT);
+
+ nic_dev->adaptive_rx_coal = coal->use_adaptive_rx_coalesce;
+
+ intr_coal.rx_pending_limit_high =
+ (u8)(coal->rx_max_coalesced_frames_high /
+ COALESCE_PENDING_LIMIT_UNIT);
+
+ intr_coal.rx_pending_limit_low =
+ (u8)(coal->rx_max_coalesced_frames_low /
+ COALESCE_PENDING_LIMIT_UNIT);
+
+ /* coalesce timer or pending set to zero will disable coalesce */
+ if (!nic_dev->adaptive_rx_coal &&
+ (!intr_coal.coalesce_timer_cfg || !intr_coal.pending_limit))
+ netdev_warn(netdev, "Coalesce will be disabled\n");
+
+ return hinic3_set_hw_coal_param(netdev, &intr_coal);
+}
+
static const struct ethtool_ops hinic3_ethtool_ops = {
- .supported_coalesce_params = ETHTOOL_COALESCE_USECS |
- ETHTOOL_COALESCE_PKT_RATE_RX_USECS,
+ .supported_coalesce_params = ETHTOOL_COALESCE_RX_USECS |
+ ETHTOOL_COALESCE_RX_MAX_FRAMES |
+ ETHTOOL_COALESCE_USE_ADAPTIVE_RX |
+ ETHTOOL_COALESCE_RX_MAX_FRAMES_LOW |
+ ETHTOOL_COALESCE_RX_MAX_FRAMES_HIGH,
.get_link_ksettings = hinic3_get_link_ksettings,
.get_drvinfo = hinic3_get_drvinfo,
.get_msglevel = hinic3_get_msglevel,
@@ -1003,6 +1229,8 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
.get_eth_ctrl_stats = hinic3_get_eth_ctrl_stats,
.get_rmon_stats = hinic3_get_rmon_stats,
.get_pause_stats = hinic3_get_pause_stats,
+ .get_coalesce = hinic3_get_coalesce,
+ .set_coalesce = hinic3_set_coalesce,
};
void hinic3_set_ethtool_ops(struct net_device *netdev)
--
2.43.0
^ permalink raw reply related
* [PATCH net-next v05 2/6] hinic3: Add ethtool statistic ops
From: Fan Gong @ 2026-04-11 3:37 UTC (permalink / raw)
To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
Ioana Ciornei, Mohsin Bashir
Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775711066.git.zhuyikai1@h-partners.com>
Add PF/VF statistics functions in TX and RX processing.
Implement following ethtool callback function:
.get_sset_count
.get_ethtool_stats
.get_strings
.get_eth_phy_stats
.get_eth_mac_stats
.get_eth_ctrl_stats
.get_rmon_stats
.get_pause_stats
These callbacks allow users to utilize ethtool for detailed
TX and RX netdev stats monitoring.
Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
.../ethernet/huawei/hinic3/hinic3_ethtool.c | 485 ++++++++++++++++++
.../ethernet/huawei/hinic3/hinic3_hw_intf.h | 13 +-
.../huawei/hinic3/hinic3_mgmt_interface.h | 37 ++
.../ethernet/huawei/hinic3/hinic3_nic_cfg.c | 64 +++
.../ethernet/huawei/hinic3/hinic3_nic_cfg.h | 109 ++++
.../net/ethernet/huawei/hinic3/hinic3_rx.c | 59 ++-
.../net/ethernet/huawei/hinic3/hinic3_rx.h | 15 +-
.../net/ethernet/huawei/hinic3/hinic3_tx.c | 71 ++-
.../net/ethernet/huawei/hinic3/hinic3_tx.h | 2 +
9 files changed, 845 insertions(+), 10 deletions(-)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
index e47c3f43e7b9..be26698fc658 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
@@ -508,6 +508,483 @@ static int hinic3_set_ringparam(struct net_device *netdev,
return 0;
}
+struct hinic3_stats {
+ char name[ETH_GSTRING_LEN];
+ u32 size;
+ int offset;
+};
+
+#define HINIC3_RXQ_STAT(_stat_item) { \
+ .name = "rxq%d_"#_stat_item, \
+ .size = sizeof_field(struct hinic3_rxq_stats, _stat_item), \
+ .offset = offsetof(struct hinic3_rxq_stats, _stat_item) \
+}
+
+#define HINIC3_TXQ_STAT(_stat_item) { \
+ .name = "txq%d_"#_stat_item, \
+ .size = sizeof_field(struct hinic3_txq_stats, _stat_item), \
+ .offset = offsetof(struct hinic3_txq_stats, _stat_item) \
+}
+
+static struct hinic3_stats hinic3_rx_queue_stats[] = {
+ HINIC3_RXQ_STAT(csum_errors),
+ HINIC3_RXQ_STAT(other_errors),
+ HINIC3_RXQ_STAT(rx_buf_empty),
+ HINIC3_RXQ_STAT(alloc_skb_err),
+ HINIC3_RXQ_STAT(alloc_rx_buf_err),
+};
+
+static struct hinic3_stats hinic3_tx_queue_stats[] = {
+ HINIC3_TXQ_STAT(busy),
+ HINIC3_TXQ_STAT(skb_pad_err),
+ HINIC3_TXQ_STAT(frag_len_overflow),
+ HINIC3_TXQ_STAT(offload_cow_skb_err),
+ HINIC3_TXQ_STAT(map_frag_err),
+ HINIC3_TXQ_STAT(unknown_tunnel_pkt),
+ HINIC3_TXQ_STAT(frag_size_err),
+};
+
+#define HINIC3_FUNC_STAT(_stat_item) { \
+ .name = #_stat_item, \
+ .size = sizeof_field(struct l2nic_vport_stats, _stat_item), \
+ .offset = offsetof(struct l2nic_vport_stats, _stat_item) \
+}
+
+static struct hinic3_stats hinic3_function_stats[] = {
+ HINIC3_FUNC_STAT(tx_unicast_pkts_vport),
+ HINIC3_FUNC_STAT(tx_unicast_bytes_vport),
+ HINIC3_FUNC_STAT(tx_multicast_pkts_vport),
+ HINIC3_FUNC_STAT(tx_multicast_bytes_vport),
+ HINIC3_FUNC_STAT(tx_broadcast_pkts_vport),
+ HINIC3_FUNC_STAT(tx_broadcast_bytes_vport),
+
+ HINIC3_FUNC_STAT(rx_unicast_pkts_vport),
+ HINIC3_FUNC_STAT(rx_unicast_bytes_vport),
+ HINIC3_FUNC_STAT(rx_multicast_pkts_vport),
+ HINIC3_FUNC_STAT(rx_multicast_bytes_vport),
+ HINIC3_FUNC_STAT(rx_broadcast_pkts_vport),
+ HINIC3_FUNC_STAT(rx_broadcast_bytes_vport),
+
+ HINIC3_FUNC_STAT(tx_discard_vport),
+ HINIC3_FUNC_STAT(rx_discard_vport),
+ HINIC3_FUNC_STAT(tx_err_vport),
+ HINIC3_FUNC_STAT(rx_err_vport),
+};
+
+#define HINIC3_PORT_STAT(_stat_item) { \
+ .name = #_stat_item, \
+ .size = sizeof_field(struct mag_cmd_port_stats, _stat_item), \
+ .offset = offsetof(struct mag_cmd_port_stats, _stat_item) \
+}
+
+static struct hinic3_stats hinic3_port_stats[] = {
+ HINIC3_PORT_STAT(mac_tx_fragment_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_undersize_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_undermin_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_1519_max_bad_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_1519_max_good_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_oversize_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_jabber_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_bad_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_bad_oct_num),
+ HINIC3_PORT_STAT(mac_tx_good_oct_num),
+ HINIC3_PORT_STAT(mac_tx_total_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_uni_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_pfc_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_pfc_pri0_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_pfc_pri1_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_pfc_pri2_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_pfc_pri3_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_pfc_pri4_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_pfc_pri5_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_pfc_pri6_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_pfc_pri7_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_err_all_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_from_app_good_pkt_num),
+ HINIC3_PORT_STAT(mac_tx_from_app_bad_pkt_num),
+
+ HINIC3_PORT_STAT(mac_rx_undermin_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_1519_max_bad_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_1519_max_good_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_bad_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_bad_oct_num),
+ HINIC3_PORT_STAT(mac_rx_good_oct_num),
+ HINIC3_PORT_STAT(mac_rx_total_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_uni_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_pfc_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_pfc_pri0_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_pfc_pri1_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_pfc_pri2_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_pfc_pri3_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_pfc_pri4_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_pfc_pri5_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_pfc_pri6_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_pfc_pri7_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_send_app_good_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_send_app_bad_pkt_num),
+ HINIC3_PORT_STAT(mac_rx_unfilter_pkt_num),
+};
+
+static int hinic3_get_sset_count(struct net_device *netdev, int sset)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ int count, q_num;
+
+ switch (sset) {
+ case ETH_SS_STATS:
+ q_num = nic_dev->q_params.num_qps;
+ count = ARRAY_SIZE(hinic3_function_stats) +
+ (ARRAY_SIZE(hinic3_tx_queue_stats) +
+ ARRAY_SIZE(hinic3_rx_queue_stats)) *
+ q_num;
+
+ if (!HINIC3_IS_VF(nic_dev->hwdev))
+ count += ARRAY_SIZE(hinic3_port_stats);
+
+ return count;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+
+static u64 get_val_of_ptr(u32 size, const void *ptr)
+{
+ u64 ret = size == sizeof(u64) ? *(u64 *)ptr :
+ size == sizeof(u32) ? *(u32 *)ptr :
+ size == sizeof(u16) ? *(u16 *)ptr :
+ *(u8 *)ptr;
+
+ return ret;
+}
+
+static void hinic3_get_drv_queue_stats(struct net_device *netdev, u64 *data)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct hinic3_txq_stats txq_stats = {};
+ struct hinic3_rxq_stats rxq_stats = {};
+ u16 i = 0, j, qid;
+ char *p;
+
+ u64_stats_init(&txq_stats.syncp);
+ u64_stats_init(&rxq_stats.syncp);
+
+ for (qid = 0; qid < nic_dev->q_params.num_qps; qid++) {
+ if (!nic_dev->txqs)
+ break;
+
+ hinic3_txq_get_stats(&nic_dev->txqs[qid], &txq_stats);
+ for (j = 0; j < ARRAY_SIZE(hinic3_tx_queue_stats); j++, i++) {
+ p = (char *)&txq_stats +
+ hinic3_tx_queue_stats[j].offset;
+ data[i] = get_val_of_ptr(hinic3_tx_queue_stats[j].size,
+ p);
+ }
+ }
+
+ for (qid = 0; qid < nic_dev->q_params.num_qps; qid++) {
+ if (!nic_dev->rxqs)
+ break;
+
+ hinic3_rxq_get_stats(&nic_dev->rxqs[qid], &rxq_stats);
+ for (j = 0; j < ARRAY_SIZE(hinic3_rx_queue_stats); j++, i++) {
+ p = (char *)&rxq_stats +
+ hinic3_rx_queue_stats[j].offset;
+ data[i] = get_val_of_ptr(hinic3_rx_queue_stats[j].size,
+ p);
+ }
+ }
+}
+
+static u16 hinic3_get_ethtool_port_stats(struct net_device *netdev, u64 *data)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct mag_cmd_port_stats *ps;
+ u16 i = 0, j;
+ char *p;
+ int err;
+
+ ps = kmalloc_obj(*ps);
+ if (!ps)
+ goto err_zero_stats;
+
+ err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+ if (err) {
+ kfree(ps);
+ netdev_err(netdev, "Failed to get port stats from fw\n");
+ goto err_zero_stats;
+ }
+
+ for (j = 0; j < ARRAY_SIZE(hinic3_port_stats); j++, i++) {
+ p = (char *)ps + hinic3_port_stats[j].offset;
+ data[i] = get_val_of_ptr(hinic3_port_stats[j].size, p);
+ }
+
+ kfree(ps);
+
+ return i;
+
+err_zero_stats:
+ memset(&data[i], 0, ARRAY_SIZE(hinic3_port_stats) * sizeof(*data));
+
+ return i + ARRAY_SIZE(hinic3_port_stats);
+}
+
+static void hinic3_get_ethtool_stats(struct net_device *netdev,
+ struct ethtool_stats *stats, u64 *data)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct l2nic_vport_stats vport_stats = {};
+ u16 i = 0, j;
+ char *p;
+ int err;
+
+ err = hinic3_get_vport_stats(nic_dev->hwdev,
+ hinic3_global_func_id(nic_dev->hwdev),
+ &vport_stats);
+ if (err)
+ netdev_err(netdev, "Failed to get function stats from fw\n");
+
+ for (j = 0; j < ARRAY_SIZE(hinic3_function_stats); j++, i++) {
+ p = (char *)&vport_stats + hinic3_function_stats[j].offset;
+ data[i] = get_val_of_ptr(hinic3_function_stats[j].size, p);
+ }
+
+ if (!HINIC3_IS_VF(nic_dev->hwdev))
+ i += hinic3_get_ethtool_port_stats(netdev, data + i);
+
+ hinic3_get_drv_queue_stats(netdev, data + i);
+}
+
+static u16 hinic3_get_hw_stats_strings(struct net_device *netdev, char *p)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ u16 i, cnt = 0;
+
+ for (i = 0; i < ARRAY_SIZE(hinic3_function_stats); i++) {
+ memcpy(p, hinic3_function_stats[i].name, ETH_GSTRING_LEN);
+ p += ETH_GSTRING_LEN;
+ cnt++;
+ }
+
+ if (!HINIC3_IS_VF(nic_dev->hwdev)) {
+ for (i = 0; i < ARRAY_SIZE(hinic3_port_stats); i++) {
+ memcpy(p, hinic3_port_stats[i].name, ETH_GSTRING_LEN);
+ p += ETH_GSTRING_LEN;
+ cnt++;
+ }
+ }
+
+ return cnt;
+}
+
+static void hinic3_get_qp_stats_strings(struct net_device *netdev, char *p)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ u8 *data = p;
+ u16 i, j;
+
+ for (i = 0; i < nic_dev->q_params.num_qps; i++) {
+ for (j = 0; j < ARRAY_SIZE(hinic3_tx_queue_stats); j++)
+ ethtool_sprintf(&data,
+ hinic3_tx_queue_stats[j].name, i);
+ }
+
+ for (i = 0; i < nic_dev->q_params.num_qps; i++) {
+ for (j = 0; j < ARRAY_SIZE(hinic3_rx_queue_stats); j++)
+ ethtool_sprintf(&data,
+ hinic3_rx_queue_stats[j].name, i);
+ }
+}
+
+static void hinic3_get_strings(struct net_device *netdev,
+ u32 stringset, u8 *data)
+{
+ char *p = (char *)data;
+ u16 offset;
+
+ switch (stringset) {
+ case ETH_SS_STATS:
+ offset = hinic3_get_hw_stats_strings(netdev, p);
+ hinic3_get_qp_stats_strings(netdev,
+ p + offset * ETH_GSTRING_LEN);
+
+ return;
+ default:
+ netdev_err(netdev, "Invalid string set %u.\n", stringset);
+ return;
+ }
+}
+
+static void hinic3_get_eth_phy_stats(struct net_device *netdev,
+ struct ethtool_eth_phy_stats *phy_stats)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct mag_cmd_port_stats *ps;
+ int err;
+
+ ps = kmalloc_obj(*ps);
+ if (!ps)
+ return;
+
+ err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+ if (err) {
+ kfree(ps);
+ netdev_err(netdev, "Failed to get eth phy stats from fw\n");
+ return;
+ }
+
+ phy_stats->SymbolErrorDuringCarrier = ps->mac_rx_sym_err_pkt_num;
+
+ kfree(ps);
+}
+
+static void hinic3_get_eth_mac_stats(struct net_device *netdev,
+ struct ethtool_eth_mac_stats *mac_stats)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct mag_cmd_port_stats *ps;
+ int err;
+
+ ps = kmalloc_obj(*ps);
+ if (!ps)
+ return;
+
+ err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+ if (err) {
+ kfree(ps);
+ netdev_err(netdev, "Failed to get eth mac stats from fw\n");
+ return;
+ }
+
+ mac_stats->FramesTransmittedOK = ps->mac_tx_good_pkt_num;
+ mac_stats->FramesReceivedOK = ps->mac_rx_good_pkt_num;
+ mac_stats->FrameCheckSequenceErrors = ps->mac_rx_fcs_err_pkt_num;
+ mac_stats->OctetsTransmittedOK = ps->mac_tx_total_oct_num;
+ mac_stats->OctetsReceivedOK = ps->mac_rx_total_oct_num;
+ mac_stats->MulticastFramesXmittedOK = ps->mac_tx_multi_pkt_num;
+ mac_stats->BroadcastFramesXmittedOK = ps->mac_tx_broad_pkt_num;
+ mac_stats->MulticastFramesReceivedOK = ps->mac_rx_multi_pkt_num;
+ mac_stats->BroadcastFramesReceivedOK = ps->mac_rx_broad_pkt_num;
+
+ kfree(ps);
+}
+
+static void hinic3_get_eth_ctrl_stats(struct net_device *netdev,
+ struct ethtool_eth_ctrl_stats *ctrl_stats)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct mag_cmd_port_stats *ps;
+ int err;
+
+ ps = kmalloc_obj(*ps);
+ if (!ps)
+ return;
+
+ err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+ if (err) {
+ kfree(ps);
+ netdev_err(netdev, "Failed to get eth ctrl stats from fw\n");
+ return;
+ }
+
+ ctrl_stats->MACControlFramesTransmitted = ps->mac_tx_control_pkt_num;
+ ctrl_stats->MACControlFramesReceived = ps->mac_rx_control_pkt_num;
+
+ kfree(ps);
+}
+
+static const struct ethtool_rmon_hist_range hinic3_rmon_ranges[] = {
+ { 0, 64 },
+ { 65, 127 },
+ { 128, 255 },
+ { 256, 511 },
+ { 512, 1023 },
+ { 1024, 1518 },
+ { 1519, 2047 },
+ { 2048, 4095 },
+ { 4096, 8191 },
+ { 8192, 9216 },
+ { 9217, 12287 },
+ {}
+};
+
+static void hinic3_get_rmon_stats(struct net_device *netdev,
+ struct ethtool_rmon_stats *rmon_stats,
+ const struct ethtool_rmon_hist_range **ranges)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct mag_cmd_port_stats *ps;
+ int err;
+
+ ps = kmalloc_obj(*ps);
+ if (!ps)
+ return;
+
+ err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+ if (err) {
+ kfree(ps);
+ netdev_err(netdev, "Failed to get eth rmon stats from fw\n");
+ return;
+ }
+
+ rmon_stats->undersize_pkts = ps->mac_rx_undersize_pkt_num;
+ rmon_stats->oversize_pkts = ps->mac_rx_oversize_pkt_num;
+ rmon_stats->fragments = ps->mac_rx_fragment_pkt_num;
+ rmon_stats->jabbers = ps->mac_rx_jabber_pkt_num;
+
+ rmon_stats->hist[0] = ps->mac_rx_64_oct_pkt_num;
+ rmon_stats->hist[1] = ps->mac_rx_65_127_oct_pkt_num;
+ rmon_stats->hist[2] = ps->mac_rx_128_255_oct_pkt_num;
+ rmon_stats->hist[3] = ps->mac_rx_256_511_oct_pkt_num;
+ rmon_stats->hist[4] = ps->mac_rx_512_1023_oct_pkt_num;
+ rmon_stats->hist[5] = ps->mac_rx_1024_1518_oct_pkt_num;
+ rmon_stats->hist[6] = ps->mac_rx_1519_2047_oct_pkt_num;
+ rmon_stats->hist[7] = ps->mac_rx_2048_4095_oct_pkt_num;
+ rmon_stats->hist[8] = ps->mac_rx_4096_8191_oct_pkt_num;
+ rmon_stats->hist[9] = ps->mac_rx_8192_9216_oct_pkt_num;
+ rmon_stats->hist[10] = ps->mac_rx_9217_12287_oct_pkt_num;
+
+ rmon_stats->hist_tx[0] = ps->mac_tx_64_oct_pkt_num;
+ rmon_stats->hist_tx[1] = ps->mac_tx_65_127_oct_pkt_num;
+ rmon_stats->hist_tx[2] = ps->mac_tx_128_255_oct_pkt_num;
+ rmon_stats->hist_tx[3] = ps->mac_tx_256_511_oct_pkt_num;
+ rmon_stats->hist_tx[4] = ps->mac_tx_512_1023_oct_pkt_num;
+ rmon_stats->hist_tx[5] = ps->mac_tx_1024_1518_oct_pkt_num;
+ rmon_stats->hist_tx[6] = ps->mac_tx_1519_2047_oct_pkt_num;
+ rmon_stats->hist_tx[7] = ps->mac_tx_2048_4095_oct_pkt_num;
+ rmon_stats->hist_tx[8] = ps->mac_tx_4096_8191_oct_pkt_num;
+ rmon_stats->hist_tx[9] = ps->mac_tx_8192_9216_oct_pkt_num;
+ rmon_stats->hist_tx[10] = ps->mac_tx_9217_12287_oct_pkt_num;
+
+ *ranges = hinic3_rmon_ranges;
+
+ kfree(ps);
+}
+
+static void hinic3_get_pause_stats(struct net_device *netdev,
+ struct ethtool_pause_stats *pause_stats)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct mag_cmd_port_stats *ps;
+ int err;
+
+ ps = kmalloc_obj(*ps);
+ if (!ps)
+ return;
+
+ err = hinic3_get_phy_port_stats(nic_dev->hwdev, ps);
+ if (err) {
+ kfree(ps);
+ netdev_err(netdev, "Failed to get eth pause stats from fw\n");
+ return;
+ }
+
+ pause_stats->tx_pause_frames = ps->mac_tx_pause_num;
+ pause_stats->rx_pause_frames = ps->mac_rx_pause_num;
+
+ kfree(ps);
+}
+
static const struct ethtool_ops hinic3_ethtool_ops = {
.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
ETHTOOL_COALESCE_PKT_RATE_RX_USECS,
@@ -518,6 +995,14 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
.get_link = ethtool_op_get_link,
.get_ringparam = hinic3_get_ringparam,
.set_ringparam = hinic3_set_ringparam,
+ .get_sset_count = hinic3_get_sset_count,
+ .get_ethtool_stats = hinic3_get_ethtool_stats,
+ .get_strings = hinic3_get_strings,
+ .get_eth_phy_stats = hinic3_get_eth_phy_stats,
+ .get_eth_mac_stats = hinic3_get_eth_mac_stats,
+ .get_eth_ctrl_stats = hinic3_get_eth_ctrl_stats,
+ .get_rmon_stats = hinic3_get_rmon_stats,
+ .get_pause_stats = hinic3_get_pause_stats,
};
void hinic3_set_ethtool_ops(struct net_device *netdev)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h b/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h
index cfc9daa3034f..0b2ebef04c02 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_hw_intf.h
@@ -51,7 +51,18 @@ static inline void mgmt_msg_params_init_default(struct mgmt_msg_params *msg_para
msg_params->in_size = buf_size;
msg_params->expected_out_size = buf_size;
msg_params->timeout_ms = 0;
-}
+};
+
+static inline void
+mgmt_msg_params_init_in_out(struct mgmt_msg_params *msg_params, void *in_buf,
+ void *out_buf, u32 in_buf_size, u32 out_buf_size)
+{
+ msg_params->buf_in = in_buf;
+ msg_params->buf_out = out_buf;
+ msg_params->in_size = in_buf_size;
+ msg_params->expected_out_size = out_buf_size;
+ msg_params->timeout_ms = 0;
+};
enum cfg_cmd {
CFG_CMD_GET_DEV_CAP = 0,
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h b/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
index c5bca3c4af96..76c691f82703 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_mgmt_interface.h
@@ -143,6 +143,41 @@ struct l2nic_cmd_set_dcb_state {
u8 rsvd[7];
};
+struct l2nic_port_stats_info {
+ struct mgmt_msg_head msg_head;
+ u16 func_id;
+ u16 rsvd1;
+};
+
+struct l2nic_vport_stats {
+ u64 tx_unicast_pkts_vport;
+ u64 tx_unicast_bytes_vport;
+ u64 tx_multicast_pkts_vport;
+ u64 tx_multicast_bytes_vport;
+ u64 tx_broadcast_pkts_vport;
+ u64 tx_broadcast_bytes_vport;
+
+ u64 rx_unicast_pkts_vport;
+ u64 rx_unicast_bytes_vport;
+ u64 rx_multicast_pkts_vport;
+ u64 rx_multicast_bytes_vport;
+ u64 rx_broadcast_pkts_vport;
+ u64 rx_broadcast_bytes_vport;
+
+ u64 tx_discard_vport;
+ u64 rx_discard_vport;
+ u64 tx_err_vport;
+ u64 rx_err_vport;
+};
+
+struct l2nic_cmd_vport_stats {
+ struct mgmt_msg_head msg_head;
+ u32 stats_size;
+ u32 rsvd1;
+ struct l2nic_vport_stats stats;
+ u64 rsvd2[6];
+};
+
struct l2nic_cmd_lro_config {
struct mgmt_msg_head msg_head;
u16 func_id;
@@ -234,6 +269,7 @@ enum l2nic_cmd {
L2NIC_CMD_SET_VPORT_ENABLE = 6,
L2NIC_CMD_SET_RX_MODE = 7,
L2NIC_CMD_SET_SQ_CI_ATTR = 8,
+ L2NIC_CMD_GET_VPORT_STAT = 9,
L2NIC_CMD_CLEAR_QP_RESOURCE = 11,
L2NIC_CMD_CFG_RX_LRO = 13,
L2NIC_CMD_CFG_LRO_TIMER = 14,
@@ -272,6 +308,7 @@ enum mag_cmd {
MAG_CMD_SET_PORT_ENABLE = 6,
MAG_CMD_GET_LINK_STATUS = 7,
+ MAG_CMD_GET_PORT_STAT = 151,
MAG_CMD_GET_PORT_INFO = 153,
};
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c
index de5a7984d2cb..1b14dc824ce1 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.c
@@ -639,6 +639,42 @@ int hinic3_get_link_status(struct hinic3_hwdev *hwdev, bool *link_status_up)
return 0;
}
+int hinic3_get_phy_port_stats(struct hinic3_hwdev *hwdev,
+ struct mag_cmd_port_stats *stats)
+{
+ struct mag_cmd_port_stats_info stats_info = {};
+ struct mag_cmd_get_port_stat *ps;
+ struct mgmt_msg_params msg_params = {};
+ int err;
+
+ ps = kzalloc_obj(*ps);
+ if (!ps)
+ return -ENOMEM;
+
+ stats_info.port_id = hinic3_physical_port_id(hwdev);
+
+ mgmt_msg_params_init_in_out(&msg_params, &stats_info, ps,
+ sizeof(stats_info), sizeof(*ps));
+
+ err = hinic3_send_mbox_to_mgmt(hwdev, MGMT_MOD_HILINK,
+ MAG_CMD_GET_PORT_STAT, &msg_params);
+
+ if (err || ps->head.status) {
+ dev_err(hwdev->dev,
+ "Failed to get port statistics, err: %d, status: 0x%x\n",
+ err, ps->head.status);
+ err = -EFAULT;
+ goto out;
+ }
+
+ memcpy(stats, &ps->counter, sizeof(*stats));
+
+out:
+ kfree(ps);
+
+ return err;
+}
+
int hinic3_get_port_info(struct hinic3_hwdev *hwdev,
struct hinic3_nic_port_info *port_info)
{
@@ -738,3 +774,31 @@ int hinic3_get_pause_info(struct hinic3_nic_dev *nic_dev,
return hinic3_cfg_hw_pause(nic_dev->hwdev, MGMT_MSG_CMD_OP_GET,
nic_pause);
}
+
+int hinic3_get_vport_stats(struct hinic3_hwdev *hwdev, u16 func_id,
+ struct l2nic_vport_stats *stats)
+{
+ struct l2nic_cmd_vport_stats vport_stats = {};
+ struct l2nic_port_stats_info stats_info = {};
+ struct mgmt_msg_params msg_params = {};
+ int err;
+
+ stats_info.func_id = func_id;
+
+ mgmt_msg_params_init_in_out(&msg_params, &stats_info, &vport_stats,
+ sizeof(stats_info), sizeof(vport_stats));
+
+ err = hinic3_send_mbox_to_mgmt(hwdev, MGMT_MOD_L2NIC,
+ L2NIC_CMD_GET_VPORT_STAT, &msg_params);
+
+ if (err || vport_stats.msg_head.status) {
+ dev_err(hwdev->dev,
+ "Failed to get function statistics, err: %d, status: 0x%x\n",
+ err, vport_stats.msg_head.status);
+ return -EFAULT;
+ }
+
+ memcpy(stats, &vport_stats.stats, sizeof(*stats));
+
+ return 0;
+}
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h
index 5d52202a8d4e..80573c121539 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_cfg.h
@@ -129,6 +129,110 @@ struct mag_cmd_get_xsfp_present {
u8 rsvd[2];
};
+struct mag_cmd_port_stats {
+ u64 mac_tx_fragment_pkt_num;
+ u64 mac_tx_undersize_pkt_num;
+ u64 mac_tx_undermin_pkt_num;
+ u64 mac_tx_64_oct_pkt_num;
+ u64 mac_tx_65_127_oct_pkt_num;
+ u64 mac_tx_128_255_oct_pkt_num;
+ u64 mac_tx_256_511_oct_pkt_num;
+ u64 mac_tx_512_1023_oct_pkt_num;
+ u64 mac_tx_1024_1518_oct_pkt_num;
+ u64 mac_tx_1519_2047_oct_pkt_num;
+ u64 mac_tx_2048_4095_oct_pkt_num;
+ u64 mac_tx_4096_8191_oct_pkt_num;
+ u64 mac_tx_8192_9216_oct_pkt_num;
+ u64 mac_tx_9217_12287_oct_pkt_num;
+ u64 mac_tx_12288_16383_oct_pkt_num;
+ u64 mac_tx_1519_max_bad_pkt_num;
+ u64 mac_tx_1519_max_good_pkt_num;
+ u64 mac_tx_oversize_pkt_num;
+ u64 mac_tx_jabber_pkt_num;
+ u64 mac_tx_bad_pkt_num;
+ u64 mac_tx_bad_oct_num;
+ u64 mac_tx_good_pkt_num;
+ u64 mac_tx_good_oct_num;
+ u64 mac_tx_total_pkt_num;
+ u64 mac_tx_total_oct_num;
+ u64 mac_tx_uni_pkt_num;
+ u64 mac_tx_multi_pkt_num;
+ u64 mac_tx_broad_pkt_num;
+ u64 mac_tx_pause_num;
+ u64 mac_tx_pfc_pkt_num;
+ u64 mac_tx_pfc_pri0_pkt_num;
+ u64 mac_tx_pfc_pri1_pkt_num;
+ u64 mac_tx_pfc_pri2_pkt_num;
+ u64 mac_tx_pfc_pri3_pkt_num;
+ u64 mac_tx_pfc_pri4_pkt_num;
+ u64 mac_tx_pfc_pri5_pkt_num;
+ u64 mac_tx_pfc_pri6_pkt_num;
+ u64 mac_tx_pfc_pri7_pkt_num;
+ u64 mac_tx_control_pkt_num;
+ u64 mac_tx_err_all_pkt_num;
+ u64 mac_tx_from_app_good_pkt_num;
+ u64 mac_tx_from_app_bad_pkt_num;
+
+ u64 mac_rx_fragment_pkt_num;
+ u64 mac_rx_undersize_pkt_num;
+ u64 mac_rx_undermin_pkt_num;
+ u64 mac_rx_64_oct_pkt_num;
+ u64 mac_rx_65_127_oct_pkt_num;
+ u64 mac_rx_128_255_oct_pkt_num;
+ u64 mac_rx_256_511_oct_pkt_num;
+ u64 mac_rx_512_1023_oct_pkt_num;
+ u64 mac_rx_1024_1518_oct_pkt_num;
+ u64 mac_rx_1519_2047_oct_pkt_num;
+ u64 mac_rx_2048_4095_oct_pkt_num;
+ u64 mac_rx_4096_8191_oct_pkt_num;
+ u64 mac_rx_8192_9216_oct_pkt_num;
+ u64 mac_rx_9217_12287_oct_pkt_num;
+ u64 mac_rx_12288_16383_oct_pkt_num;
+ u64 mac_rx_1519_max_bad_pkt_num;
+ u64 mac_rx_1519_max_good_pkt_num;
+ u64 mac_rx_oversize_pkt_num;
+ u64 mac_rx_jabber_pkt_num;
+ u64 mac_rx_bad_pkt_num;
+ u64 mac_rx_bad_oct_num;
+ u64 mac_rx_good_pkt_num;
+ u64 mac_rx_good_oct_num;
+ u64 mac_rx_total_pkt_num;
+ u64 mac_rx_total_oct_num;
+ u64 mac_rx_uni_pkt_num;
+ u64 mac_rx_multi_pkt_num;
+ u64 mac_rx_broad_pkt_num;
+ u64 mac_rx_pause_num;
+ u64 mac_rx_pfc_pkt_num;
+ u64 mac_rx_pfc_pri0_pkt_num;
+ u64 mac_rx_pfc_pri1_pkt_num;
+ u64 mac_rx_pfc_pri2_pkt_num;
+ u64 mac_rx_pfc_pri3_pkt_num;
+ u64 mac_rx_pfc_pri4_pkt_num;
+ u64 mac_rx_pfc_pri5_pkt_num;
+ u64 mac_rx_pfc_pri6_pkt_num;
+ u64 mac_rx_pfc_pri7_pkt_num;
+ u64 mac_rx_control_pkt_num;
+ u64 mac_rx_sym_err_pkt_num;
+ u64 mac_rx_fcs_err_pkt_num;
+ u64 mac_rx_send_app_good_pkt_num;
+ u64 mac_rx_send_app_bad_pkt_num;
+ u64 mac_rx_unfilter_pkt_num;
+};
+
+struct mag_cmd_port_stats_info {
+ struct mgmt_msg_head head;
+
+ u8 port_id;
+ u8 rsvd0[3];
+};
+
+struct mag_cmd_get_port_stat {
+ struct mgmt_msg_head head;
+
+ struct mag_cmd_port_stats counter;
+ u64 rsvd1[15];
+};
+
enum link_err_type {
LINK_ERR_MODULE_UNRECOGENIZED,
LINK_ERR_NUM,
@@ -209,6 +313,11 @@ int hinic3_get_port_info(struct hinic3_hwdev *hwdev,
struct hinic3_nic_port_info *port_info);
int hinic3_set_vport_enable(struct hinic3_hwdev *hwdev, u16 func_id,
bool enable);
+int hinic3_get_phy_port_stats(struct hinic3_hwdev *hwdev,
+ struct mag_cmd_port_stats *stats);
+int hinic3_get_vport_stats(struct hinic3_hwdev *hwdev, u16 func_id,
+ struct l2nic_vport_stats *stats);
+
int hinic3_add_vlan(struct hinic3_hwdev *hwdev, u16 vlan_id, u16 func_id);
int hinic3_del_vlan(struct hinic3_hwdev *hwdev, u16 vlan_id, u16 func_id);
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
index 309ab5901379..7fadb88ff722 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.c
@@ -29,7 +29,7 @@
#define HINIC3_LRO_PKT_HDR_LEN_IPV4 66
#define HINIC3_LRO_PKT_HDR_LEN_IPV6 86
#define HINIC3_LRO_PKT_HDR_LEN(cqe) \
- (RQ_CQE_OFFOLAD_TYPE_GET((cqe)->offload_type, IP_TYPE) == \
+ (RQ_CQE_OFFOLAD_TYPE_GET(le32_to_cpu((cqe)->offload_type), IP_TYPE) == \
HINIC3_RX_IPV6_PKT ? HINIC3_LRO_PKT_HDR_LEN_IPV6 : \
HINIC3_LRO_PKT_HDR_LEN_IPV4)
@@ -46,7 +46,6 @@ static void hinic3_rxq_clean_stats(struct hinic3_rxq_stats *rxq_stats)
rxq_stats->alloc_skb_err = 0;
rxq_stats->alloc_rx_buf_err = 0;
- rxq_stats->restore_drop_sge = 0;
u64_stats_update_end(&rxq_stats->syncp);
}
@@ -155,8 +154,12 @@ static u32 hinic3_rx_fill_buffers(struct hinic3_rxq *rxq)
err = rx_alloc_mapped_page(rxq->page_pool, rx_info,
rxq->buf_len);
- if (unlikely(err))
+ if (unlikely(err)) {
+ u64_stats_update_begin(&rxq->rxq_stats.syncp);
+ rxq->rxq_stats.alloc_rx_buf_err++;
+ u64_stats_update_end(&rxq->rxq_stats.syncp);
break;
+ }
dma_addr = page_pool_get_dma_addr(rx_info->page) +
rx_info->page_offset;
@@ -170,6 +173,10 @@ static u32 hinic3_rx_fill_buffers(struct hinic3_rxq *rxq)
rxq->next_to_update << HINIC3_NORMAL_RQ_WQE);
rxq->delta -= i;
rxq->next_to_alloc = rxq->next_to_update;
+ } else if (free_wqebbs == rxq->q_depth - 1) {
+ u64_stats_update_begin(&rxq->rxq_stats.syncp);
+ rxq->rxq_stats.rx_buf_empty++;
+ u64_stats_update_end(&rxq->rxq_stats.syncp);
}
return i;
@@ -330,11 +337,23 @@ static void hinic3_rx_csum(struct hinic3_rxq *rxq, u32 offload_type,
struct net_device *netdev = rxq->netdev;
bool l2_tunnel;
+ if (unlikely(csum_err == HINIC3_RX_CSUM_IPSU_OTHER_ERR)) {
+ u64_stats_update_begin(&rxq->rxq_stats.syncp);
+ rxq->rxq_stats.other_errors++;
+ u64_stats_update_end(&rxq->rxq_stats.syncp);
+ }
+
if (!(netdev->features & NETIF_F_RXCSUM))
return;
if (unlikely(csum_err)) {
/* pkt type is recognized by HW, and csum is wrong */
+ if (!(csum_err & (HINIC3_RX_CSUM_HW_CHECK_NONE |
+ HINIC3_RX_CSUM_IPSU_OTHER_ERR))) {
+ u64_stats_update_begin(&rxq->rxq_stats.syncp);
+ rxq->rxq_stats.csum_errors++;
+ u64_stats_update_end(&rxq->rxq_stats.syncp);
+ }
skb->ip_summed = CHECKSUM_NONE;
return;
}
@@ -387,8 +406,12 @@ static int recv_one_pkt(struct hinic3_rxq *rxq, struct hinic3_rq_cqe *rx_cqe,
u16 num_lro;
skb = hinic3_fetch_rx_buffer(rxq, pkt_len);
- if (unlikely(!skb))
+ if (unlikely(!skb)) {
+ u64_stats_update_begin(&rxq->rxq_stats.syncp);
+ rxq->rxq_stats.alloc_skb_err++;
+ u64_stats_update_end(&rxq->rxq_stats.syncp);
return -ENOMEM;
+ }
/* place header in linear portion of buffer */
if (skb_is_nonlinear(skb))
@@ -550,11 +573,28 @@ int hinic3_configure_rxqs(struct net_device *netdev, u16 num_rq,
return 0;
}
+void hinic3_rxq_get_stats(struct hinic3_rxq *rxq,
+ struct hinic3_rxq_stats *stats)
+{
+ struct hinic3_rxq_stats *rxq_stats = &rxq->rxq_stats;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin(&rxq_stats->syncp);
+ stats->csum_errors = rxq_stats->csum_errors;
+ stats->other_errors = rxq_stats->other_errors;
+ stats->rx_buf_empty = rxq_stats->rx_buf_empty;
+ stats->alloc_skb_err = rxq_stats->alloc_skb_err;
+ stats->alloc_rx_buf_err = rxq_stats->alloc_rx_buf_err;
+ } while (u64_stats_fetch_retry(&rxq_stats->syncp, start));
+}
+
int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget)
{
struct hinic3_nic_dev *nic_dev = netdev_priv(rxq->netdev);
u32 sw_ci, status, pkt_len, vlan_len;
struct hinic3_rq_cqe *rx_cqe;
+ u64 rx_bytes = 0;
u32 num_wqe = 0;
int nr_pkts = 0;
u16 num_lro;
@@ -574,10 +614,14 @@ int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget)
if (recv_one_pkt(rxq, rx_cqe, pkt_len, vlan_len, status))
break;
+ rx_bytes += pkt_len;
nr_pkts++;
num_lro = RQ_CQE_STATUS_GET(status, NUM_LRO);
- if (num_lro)
+ if (num_lro) {
+ rx_bytes += (num_lro - 1) *
+ HINIC3_LRO_PKT_HDR_LEN(rx_cqe);
num_wqe += hinic3_get_sge_num(rxq, pkt_len);
+ }
rx_cqe->status = 0;
@@ -588,5 +632,10 @@ int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget)
if (rxq->delta >= HINIC3_RX_BUFFER_WRITE)
hinic3_rx_fill_buffers(rxq);
+ u64_stats_update_begin(&rxq->rxq_stats.syncp);
+ rxq->rxq_stats.packets += (u64)nr_pkts;
+ rxq->rxq_stats.bytes += rx_bytes;
+ u64_stats_update_end(&rxq->rxq_stats.syncp);
+
return nr_pkts;
}
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
index 06d1b3299e7c..c11d080408a7 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_rx.h
@@ -8,6 +8,17 @@
#include <linux/dim.h>
#include <linux/netdevice.h>
+/* rx cqe checksum err */
+#define HINIC3_RX_CSUM_IP_CSUM_ERR BIT(0)
+#define HINIC3_RX_CSUM_TCP_CSUM_ERR BIT(1)
+#define HINIC3_RX_CSUM_UDP_CSUM_ERR BIT(2)
+#define HINIC3_RX_CSUM_IGMP_CSUM_ERR BIT(3)
+#define HINIC3_RX_CSUM_ICMPV4_CSUM_ERR BIT(4)
+#define HINIC3_RX_CSUM_ICMPV6_CSUM_ERR BIT(5)
+#define HINIC3_RX_CSUM_SCTP_CRC_ERR BIT(6)
+#define HINIC3_RX_CSUM_HW_CHECK_NONE BIT(7)
+#define HINIC3_RX_CSUM_IPSU_OTHER_ERR BIT(8)
+
#define RQ_CQE_OFFOLAD_TYPE_PKT_TYPE_MASK GENMASK(4, 0)
#define RQ_CQE_OFFOLAD_TYPE_IP_TYPE_MASK GENMASK(6, 5)
#define RQ_CQE_OFFOLAD_TYPE_TUNNEL_PKT_FORMAT_MASK GENMASK(11, 8)
@@ -39,7 +50,6 @@ struct hinic3_rxq_stats {
u64 rx_buf_empty;
u64 alloc_skb_err;
u64 alloc_rx_buf_err;
- u64 restore_drop_sge;
struct u64_stats_sync syncp;
};
@@ -123,6 +133,9 @@ void hinic3_free_rxqs_res(struct net_device *netdev, u16 num_rq,
u32 rq_depth, struct hinic3_dyna_rxq_res *rxqs_res);
int hinic3_configure_rxqs(struct net_device *netdev, u16 num_rq,
u32 rq_depth, struct hinic3_dyna_rxq_res *rxqs_res);
+
+void hinic3_rxq_get_stats(struct hinic3_rxq *rxq,
+ struct hinic3_rxq_stats *stats);
int hinic3_rx_poll(struct hinic3_rxq *rxq, int budget);
#endif
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
index 9306bf0020ca..3fbbfa5d96b6 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.c
@@ -97,8 +97,12 @@ static int hinic3_tx_map_skb(struct net_device *netdev, struct sk_buff *skb,
dma_info[0].dma = dma_map_single(&pdev->dev, skb->data,
skb_headlen(skb), DMA_TO_DEVICE);
- if (dma_mapping_error(&pdev->dev, dma_info[0].dma))
+ if (dma_mapping_error(&pdev->dev, dma_info[0].dma)) {
+ u64_stats_update_begin(&txq->txq_stats.syncp);
+ txq->txq_stats.map_frag_err++;
+ u64_stats_update_end(&txq->txq_stats.syncp);
return -EFAULT;
+ }
dma_info[0].len = skb_headlen(skb);
@@ -117,6 +121,9 @@ static int hinic3_tx_map_skb(struct net_device *netdev, struct sk_buff *skb,
skb_frag_size(frag),
DMA_TO_DEVICE);
if (dma_mapping_error(&pdev->dev, dma_info[idx].dma)) {
+ u64_stats_update_begin(&txq->txq_stats.syncp);
+ txq->txq_stats.map_frag_err++;
+ u64_stats_update_end(&txq->txq_stats.syncp);
err = -EFAULT;
goto err_unmap_page;
}
@@ -260,6 +267,9 @@ static int hinic3_tx_csum(struct hinic3_txq *txq, struct hinic3_sq_task *task,
if (l4_proto != IPPROTO_UDP ||
((struct udphdr *)skb_transport_header(skb))->dest !=
VXLAN_OFFLOAD_PORT_LE) {
+ u64_stats_update_begin(&txq->txq_stats.syncp);
+ txq->txq_stats.unknown_tunnel_pkt++;
+ u64_stats_update_end(&txq->txq_stats.syncp);
/* Unsupported tunnel packet, disable csum offload */
skb_checksum_help(skb);
return 0;
@@ -433,6 +443,27 @@ static u32 hinic3_tx_offload(struct sk_buff *skb, struct hinic3_sq_task *task,
return offload;
}
+static void hinic3_get_pkt_stats(struct hinic3_txq *txq, struct sk_buff *skb)
+{
+ u32 hdr_len, tx_bytes;
+ unsigned short pkts;
+
+ if (skb_is_gso(skb)) {
+ hdr_len = (skb_shinfo(skb)->gso_segs - 1) *
+ skb_tcp_all_headers(skb);
+ tx_bytes = skb->len + hdr_len;
+ pkts = skb_shinfo(skb)->gso_segs;
+ } else {
+ tx_bytes = skb->len > ETH_ZLEN ? skb->len : ETH_ZLEN;
+ pkts = 1;
+ }
+
+ u64_stats_update_begin(&txq->txq_stats.syncp);
+ txq->txq_stats.bytes += tx_bytes;
+ txq->txq_stats.packets += pkts;
+ u64_stats_update_end(&txq->txq_stats.syncp);
+}
+
static u16 hinic3_get_and_update_sq_owner(struct hinic3_io_queue *sq,
u16 curr_pi, u16 wqebb_cnt)
{
@@ -539,8 +570,12 @@ static netdev_tx_t hinic3_send_one_skb(struct sk_buff *skb,
int err;
if (unlikely(skb->len < MIN_SKB_LEN)) {
- if (skb_pad(skb, MIN_SKB_LEN - skb->len))
+ if (skb_pad(skb, MIN_SKB_LEN - skb->len)) {
+ u64_stats_update_begin(&txq->txq_stats.syncp);
+ txq->txq_stats.skb_pad_err++;
+ u64_stats_update_end(&txq->txq_stats.syncp);
goto err_out;
+ }
skb->len = MIN_SKB_LEN;
}
@@ -595,6 +630,7 @@ static netdev_tx_t hinic3_send_one_skb(struct sk_buff *skb,
txq->tx_stop_thrs,
txq->tx_start_thrs);
+ hinic3_get_pkt_stats(txq, skb);
hinic3_prepare_sq_ctrl(&wqe_combo, queue_info, num_sge, owner);
hinic3_write_db(txq->sq, 0, DB_CFLAG_DP_SQ,
hinic3_get_sq_local_pi(txq->sq));
@@ -604,6 +640,10 @@ static netdev_tx_t hinic3_send_one_skb(struct sk_buff *skb,
err_drop_pkt:
dev_kfree_skb_any(skb);
err_out:
+ u64_stats_update_begin(&txq->txq_stats.syncp);
+ txq->txq_stats.dropped++;
+ u64_stats_update_end(&txq->txq_stats.syncp);
+
return NETDEV_TX_OK;
}
@@ -611,12 +651,19 @@ netdev_tx_t hinic3_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
{
struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
u16 q_id = skb_get_queue_mapping(skb);
+ struct hinic3_txq *txq;
if (unlikely(!netif_carrier_ok(netdev)))
goto err_drop_pkt;
- if (unlikely(q_id >= nic_dev->q_params.num_qps))
+ if (unlikely(q_id >= nic_dev->q_params.num_qps)) {
+ txq = &nic_dev->txqs[0];
+ u64_stats_update_begin(&txq->txq_stats.syncp);
+ txq->txq_stats.dropped++;
+ u64_stats_update_end(&txq->txq_stats.syncp);
+
goto err_drop_pkt;
+ }
return hinic3_send_one_skb(skb, netdev, &nic_dev->txqs[q_id]);
@@ -754,6 +801,24 @@ int hinic3_configure_txqs(struct net_device *netdev, u16 num_sq,
return 0;
}
+void hinic3_txq_get_stats(struct hinic3_txq *txq,
+ struct hinic3_txq_stats *stats)
+{
+ struct hinic3_txq_stats *txq_stats = &txq->txq_stats;
+ unsigned int start;
+
+ do {
+ start = u64_stats_fetch_begin(&txq_stats->syncp);
+ stats->busy = txq_stats->busy;
+ stats->skb_pad_err = txq_stats->skb_pad_err;
+ stats->frag_len_overflow = txq_stats->frag_len_overflow;
+ stats->offload_cow_skb_err = txq_stats->offload_cow_skb_err;
+ stats->map_frag_err = txq_stats->map_frag_err;
+ stats->unknown_tunnel_pkt = txq_stats->unknown_tunnel_pkt;
+ stats->frag_size_err = txq_stats->frag_size_err;
+ } while (u64_stats_fetch_retry(&txq_stats->syncp, start));
+}
+
bool hinic3_tx_poll(struct hinic3_txq *txq, int budget)
{
struct net_device *netdev = txq->netdev;
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h
index 00194f2a1bcc..0a21c423618f 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_tx.h
@@ -157,6 +157,8 @@ int hinic3_configure_txqs(struct net_device *netdev, u16 num_sq,
u32 sq_depth, struct hinic3_dyna_txq_res *txqs_res);
netdev_tx_t hinic3_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
+void hinic3_txq_get_stats(struct hinic3_txq *txq,
+ struct hinic3_txq_stats *stats);
bool hinic3_tx_poll(struct hinic3_txq *txq, int budget);
void hinic3_flush_txqs(struct net_device *netdev);
--
2.43.0
^ permalink raw reply related
* [PATCH net-next v05 1/6] hinic3: Add ethtool queue ops
From: Fan Gong @ 2026-04-11 3:36 UTC (permalink / raw)
To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
Ioana Ciornei, Mohsin Bashir
Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
Shi Jing, Zheng Jiezhen, Maxime Chevallier
In-Reply-To: <cover.1775711066.git.zhuyikai1@h-partners.com>
Implement following ethtool callback function:
.get_ringparam
.set_ringparam
These callbacks allow users to utilize ethtool for detailed
queue depth configuration and monitoring.
Co-developed-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Zhu Yikai <zhuyikai1@h-partners.com>
Signed-off-by: Fan Gong <gongfan1@huawei.com>
---
.../ethernet/huawei/hinic3/hinic3_ethtool.c | 101 +++++++++++++++++
.../net/ethernet/huawei/hinic3/hinic3_irq.c | 10 +-
.../net/ethernet/huawei/hinic3/hinic3_main.c | 11 ++
.../huawei/hinic3/hinic3_netdev_ops.c | 104 +++++++++++++++++-
.../ethernet/huawei/hinic3/hinic3_nic_dev.h | 16 +++
.../ethernet/huawei/hinic3/hinic3_nic_io.h | 4 +
6 files changed, 241 insertions(+), 5 deletions(-)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
index 90fc16288de9..e47c3f43e7b9 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_ethtool.c
@@ -409,6 +409,105 @@ hinic3_get_link_ksettings(struct net_device *netdev,
return 0;
}
+static void hinic3_get_ringparam(struct net_device *netdev,
+ struct ethtool_ringparam *ring,
+ struct kernel_ethtool_ringparam *kernel_ring,
+ struct netlink_ext_ack *extack)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+
+ ring->rx_max_pending = HINIC3_MAX_RX_QUEUE_DEPTH;
+ ring->tx_max_pending = HINIC3_MAX_TX_QUEUE_DEPTH;
+ ring->rx_pending = nic_dev->rxqs[0].q_depth;
+ ring->tx_pending = nic_dev->txqs[0].q_depth;
+}
+
+static void hinic3_update_qp_depth(struct net_device *netdev,
+ u32 sq_depth, u32 rq_depth)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ u16 i;
+
+ nic_dev->q_params.sq_depth = sq_depth;
+ nic_dev->q_params.rq_depth = rq_depth;
+ for (i = 0; i < nic_dev->max_qps; i++) {
+ nic_dev->txqs[i].q_depth = sq_depth;
+ nic_dev->txqs[i].q_mask = sq_depth - 1;
+ nic_dev->rxqs[i].q_depth = rq_depth;
+ nic_dev->rxqs[i].q_mask = rq_depth - 1;
+ }
+}
+
+static int hinic3_check_ringparam_valid(struct net_device *netdev,
+ const struct ethtool_ringparam *ring)
+{
+ if (ring->rx_jumbo_pending || ring->rx_mini_pending) {
+ netdev_err(netdev, "Unsupported rx_jumbo_pending/rx_mini_pending\n");
+ return -EINVAL;
+ }
+
+ if (ring->tx_pending > HINIC3_MAX_TX_QUEUE_DEPTH ||
+ ring->tx_pending < HINIC3_MIN_QUEUE_DEPTH ||
+ ring->rx_pending > HINIC3_MAX_RX_QUEUE_DEPTH ||
+ ring->rx_pending < HINIC3_MIN_QUEUE_DEPTH) {
+ netdev_err(netdev,
+ "Queue depth out of range tx[%d-%d] rx[%d-%d]\n",
+ HINIC3_MIN_QUEUE_DEPTH, HINIC3_MAX_TX_QUEUE_DEPTH,
+ HINIC3_MIN_QUEUE_DEPTH, HINIC3_MAX_RX_QUEUE_DEPTH);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int hinic3_set_ringparam(struct net_device *netdev,
+ struct ethtool_ringparam *ring,
+ struct kernel_ethtool_ringparam *kernel_ring,
+ struct netlink_ext_ack *extack)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct hinic3_dyna_txrxq_params q_params = {};
+ u32 new_sq_depth, new_rq_depth;
+ int err;
+
+ err = hinic3_check_ringparam_valid(netdev, ring);
+ if (err)
+ return err;
+
+ new_sq_depth = 1U << ilog2(ring->tx_pending);
+ new_rq_depth = 1U << ilog2(ring->rx_pending);
+ if (new_sq_depth == nic_dev->q_params.sq_depth &&
+ new_rq_depth == nic_dev->q_params.rq_depth)
+ return 0;
+
+ if (new_sq_depth != ring->tx_pending)
+ netdev_info(netdev, "Requested Tx depth trimmed to %d\n",
+ new_sq_depth);
+ if (new_rq_depth != ring->rx_pending)
+ netdev_info(netdev, "Requested Rx depth trimmed to %d\n",
+ new_rq_depth);
+
+ netdev_info(netdev, "Change Tx/Rx ring depth from %u/%u to %u/%u\n",
+ nic_dev->q_params.sq_depth, nic_dev->q_params.rq_depth,
+ new_sq_depth, new_rq_depth);
+
+ if (!netif_running(netdev)) {
+ hinic3_update_qp_depth(netdev, new_sq_depth, new_rq_depth);
+ } else {
+ q_params = nic_dev->q_params;
+ q_params.sq_depth = new_sq_depth;
+ q_params.rq_depth = new_rq_depth;
+
+ err = hinic3_change_channel_settings(netdev, &q_params);
+ if (err) {
+ netdev_err(netdev, "Failed to change channel settings\n");
+ return err;
+ }
+ }
+
+ return 0;
+}
+
static const struct ethtool_ops hinic3_ethtool_ops = {
.supported_coalesce_params = ETHTOOL_COALESCE_USECS |
ETHTOOL_COALESCE_PKT_RATE_RX_USECS,
@@ -417,6 +516,8 @@ static const struct ethtool_ops hinic3_ethtool_ops = {
.get_msglevel = hinic3_get_msglevel,
.set_msglevel = hinic3_set_msglevel,
.get_link = ethtool_op_get_link,
+ .get_ringparam = hinic3_get_ringparam,
+ .set_ringparam = hinic3_set_ringparam,
};
void hinic3_set_ethtool_ops(struct net_device *netdev)
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
index e7d6c2033b45..d3b3927b5408 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_irq.c
@@ -135,10 +135,16 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
{
struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
struct hinic3_interrupt_info info = {};
+ unsigned long flags;
int err;
- if (q_id >= nic_dev->q_params.num_qps)
+ spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+
+ if (!HINIC3_CHANNEL_RES_VALID(nic_dev) ||
+ q_id >= nic_dev->q_params.num_qps) {
+ spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
return 0;
+ }
info.interrupt_coalesc_set = 1;
info.coalesc_timer_cfg = coalesc_timer_cfg;
@@ -147,6 +153,8 @@ static int hinic3_set_interrupt_moder(struct net_device *netdev, u16 q_id,
info.resend_timer_cfg =
nic_dev->intr_coalesce[q_id].resend_timer_cfg;
+ spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
err = hinic3_set_interrupt_cfg(nic_dev->hwdev, info);
if (err) {
netdev_err(netdev,
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
index 0a888fe4c975..3b470978714a 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_main.c
@@ -179,6 +179,8 @@ static int hinic3_sw_init(struct net_device *netdev)
int err;
mutex_init(&nic_dev->port_state_mutex);
+ mutex_init(&nic_dev->channel_cfg_lock);
+ spin_lock_init(&nic_dev->channel_res_lock);
nic_dev->q_params.sq_depth = HINIC3_SQ_DEPTH;
nic_dev->q_params.rq_depth = HINIC3_RQ_DEPTH;
@@ -314,6 +316,15 @@ static void hinic3_link_status_change(struct net_device *netdev,
bool link_status_up)
{
struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ unsigned long flags;
+ bool valid;
+
+ spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+ valid = HINIC3_CHANNEL_RES_VALID(nic_dev);
+ spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+ if (!valid)
+ return;
if (link_status_up) {
if (netif_carrier_ok(netdev))
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
index da73811641a9..d652a5ffdc2c 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_netdev_ops.c
@@ -428,6 +428,85 @@ static void hinic3_vport_down(struct net_device *netdev)
}
}
+int
+hinic3_change_channel_settings(struct net_device *netdev,
+ struct hinic3_dyna_txrxq_params *trxq_params)
+{
+ struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
+ struct hinic3_dyna_txrxq_params old_qp_params = {};
+ struct hinic3_dyna_qp_params new_qp_params = {};
+ struct hinic3_dyna_qp_params cur_qp_params = {};
+ bool need_teardown = false;
+ unsigned long flags;
+ int err;
+
+ mutex_lock(&nic_dev->channel_cfg_lock);
+
+ hinic3_config_num_qps(netdev, trxq_params);
+
+ err = hinic3_alloc_channel_resources(netdev, &new_qp_params,
+ trxq_params);
+ if (err) {
+ netdev_err(netdev, "Failed to alloc channel resources\n");
+ mutex_unlock(&nic_dev->channel_cfg_lock);
+ return err;
+ }
+
+ spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+ if (!test_and_set_bit(HINIC3_CHANGE_RES_INVALID, &nic_dev->flags))
+ need_teardown = true;
+ spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+ if (need_teardown) {
+ hinic3_vport_down(netdev);
+ hinic3_close_channel(netdev);
+ hinic3_uninit_qps(nic_dev, &cur_qp_params);
+ hinic3_free_channel_resources(netdev, &cur_qp_params,
+ &nic_dev->q_params);
+ }
+
+ if (nic_dev->num_qp_irq > trxq_params->num_qps)
+ hinic3_qp_irq_change(netdev, trxq_params->num_qps);
+
+ spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+ old_qp_params = nic_dev->q_params;
+ nic_dev->q_params = *trxq_params;
+ spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+ hinic3_init_qps(nic_dev, &new_qp_params);
+
+ err = hinic3_open_channel(netdev);
+ if (err)
+ goto err_uninit_qps;
+
+ err = hinic3_vport_up(netdev);
+ if (err)
+ goto err_close_channel;
+
+ spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+ clear_bit(HINIC3_CHANGE_RES_INVALID, &nic_dev->flags);
+ spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+ mutex_unlock(&nic_dev->channel_cfg_lock);
+
+ return 0;
+
+err_close_channel:
+ hinic3_close_channel(netdev);
+err_uninit_qps:
+ spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+ nic_dev->q_params = old_qp_params;
+ clear_bit(HINIC3_CHANGE_RES_INVALID, &nic_dev->flags);
+ spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+ hinic3_uninit_qps(nic_dev, &new_qp_params);
+ hinic3_free_channel_resources(netdev, &new_qp_params, trxq_params);
+
+ mutex_unlock(&nic_dev->channel_cfg_lock);
+
+ return err;
+}
+
static int hinic3_open(struct net_device *netdev)
{
struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
@@ -487,16 +566,33 @@ static int hinic3_close(struct net_device *netdev)
{
struct hinic3_nic_dev *nic_dev = netdev_priv(netdev);
struct hinic3_dyna_qp_params qp_params;
+ bool need_teardown = false;
+ unsigned long flags;
if (!test_and_clear_bit(HINIC3_INTF_UP, &nic_dev->flags)) {
netdev_dbg(netdev, "Netdev already close, do nothing\n");
return 0;
}
- hinic3_vport_down(netdev);
- hinic3_close_channel(netdev);
- hinic3_uninit_qps(nic_dev, &qp_params);
- hinic3_free_channel_resources(netdev, &qp_params, &nic_dev->q_params);
+ mutex_lock(&nic_dev->channel_cfg_lock);
+
+ spin_lock_irqsave(&nic_dev->channel_res_lock, flags);
+ if (!test_and_set_bit(HINIC3_CHANGE_RES_INVALID, &nic_dev->flags))
+ need_teardown = true;
+ spin_unlock_irqrestore(&nic_dev->channel_res_lock, flags);
+
+ if (need_teardown) {
+ hinic3_vport_down(netdev);
+ hinic3_close_channel(netdev);
+ hinic3_uninit_qps(nic_dev, &qp_params);
+ hinic3_free_channel_resources(netdev, &qp_params,
+ &nic_dev->q_params);
+ }
+
+ hinic3_free_nicio_res(nic_dev);
+ hinic3_destroy_num_qps(netdev);
+
+ mutex_unlock(&nic_dev->channel_cfg_lock);
return 0;
}
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
index 9502293ff710..55b280888ad8 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_dev.h
@@ -10,6 +10,9 @@
#include "hinic3_hw_cfg.h"
#include "hinic3_hwdev.h"
#include "hinic3_mgmt_interface.h"
+#include "hinic3_nic_io.h"
+#include "hinic3_tx.h"
+#include "hinic3_rx.h"
#define HINIC3_VLAN_BITMAP_BYTE_SIZE(nic_dev) (sizeof(*(nic_dev)->vlan_bitmap))
#define HINIC3_VLAN_BITMAP_SIZE(nic_dev) \
@@ -20,8 +23,13 @@ enum hinic3_flags {
HINIC3_MAC_FILTER_CHANGED,
HINIC3_RSS_ENABLE,
HINIC3_UPDATE_MAC_FILTER,
+ HINIC3_CHANGE_RES_INVALID,
};
+#define HINIC3_CHANNEL_RES_VALID(nic_dev) \
+ (test_bit(HINIC3_INTF_UP, &(nic_dev)->flags) && \
+ !test_bit(HINIC3_CHANGE_RES_INVALID, &(nic_dev)->flags))
+
enum hinic3_event_work_flags {
HINIC3_EVENT_WORK_TX_TIMEOUT,
};
@@ -129,6 +137,10 @@ struct hinic3_nic_dev {
struct work_struct rx_mode_work;
/* lock for enable/disable port */
struct mutex port_state_mutex;
+ /* lock for channel configuration */
+ struct mutex channel_cfg_lock;
+ /* lock for channel resources */
+ spinlock_t channel_res_lock;
struct list_head uc_filter_list;
struct list_head mc_filter_list;
@@ -143,6 +155,10 @@ struct hinic3_nic_dev {
void hinic3_set_netdev_ops(struct net_device *netdev);
int hinic3_set_hw_features(struct net_device *netdev);
+int
+hinic3_change_channel_settings(struct net_device *netdev,
+ struct hinic3_dyna_txrxq_params *trxq_params);
+
int hinic3_qps_irq_init(struct net_device *netdev);
void hinic3_qps_irq_uninit(struct net_device *netdev);
diff --git a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
index 12eefabcf1db..3791b9bc865b 100644
--- a/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
+++ b/drivers/net/ethernet/huawei/hinic3/hinic3_nic_io.h
@@ -14,6 +14,10 @@ struct hinic3_nic_dev;
#define HINIC3_RQ_WQEBB_SHIFT 3
#define HINIC3_SQ_WQEBB_SIZE BIT(HINIC3_SQ_WQEBB_SHIFT)
+#define HINIC3_MAX_TX_QUEUE_DEPTH 65536
+#define HINIC3_MAX_RX_QUEUE_DEPTH 16384
+#define HINIC3_MIN_QUEUE_DEPTH 128
+
/* ******************** RQ_CTRL ******************** */
enum hinic3_rq_wqe_type {
HINIC3_NORMAL_RQ_WQE = 1,
--
2.43.0
^ permalink raw reply related
* [PATCH net-next v05 0/6] net: hinic3: PF initialization
From: Fan Gong @ 2026-04-11 3:36 UTC (permalink / raw)
To: Fan Gong, Zhu Yikai, netdev, David S. Miller, Eric Dumazet,
Jakub Kicinski, Paolo Abeni, Simon Horman, Andrew Lunn,
Ioana Ciornei, Mohsin Bashir
Cc: linux-kernel, linux-doc, luosifu, Xin Guo, Zhou Shuai, Wu Like,
Shi Jing, Zheng Jiezhen, Maxime Chevallier
This is [3/3] part of hinic3 Ethernet driver second submission.
With this patch hinic3 becomes a complete Ethernet driver with
pf and vf.
Add 20 ethtool ops for information of queue, rss, coalesce and eth data.
Add MTU size validation
Config netdev watchdog timeout.
Remove unneed coalesce parameters.
Changes:
PATCH 03 V01: https://lore.kernel.org/netdev/cover.1773387649.git.zhuyikai1@h-partners.com/
* Add rmon/pause/phy/mac/ctrl stats (Ioana Ciornei)
PATCH 03 V02: https://lore.kernel.org/netdev/cover.1774684571.git.zhuyikai1@h-partners.com/
* Modify "return -EINVAL" intension problem (AI review)
* Use le16_to_cpu for rss_indir pair.out->buf (AI review)
* Use u32 instead of int in coalesce_limits to avoid overflow (AI review)
* Remove redundant u64_stats_update_begin/end when reading stats without
concurrent reader (AI review)
* Modify nic_dev->stats.syncp logic (AI review)
* Complete rxq/txq stats stats fileds in hinic3_rx/txq_get_stats (AI review)
* Remove statistics values in rtnl_link_stats64 from ethtool statistics
values (AI review)
* Add channel_cfg_lock & channel_res_lock to protect resources access (AI review)
* Remove OutOfRangeLengthField, FrameToolong and InRangeLengthErrors (Ioana Ciornei)
* Remove redundant mtu commit (Maxime Chevialler)
PATCH 03 V03: https://lore.kernel.org/netdev/cover.1774940117.git.zhuyikai1@h-partners.com/
* Change unnedd to unneeded (AI review)
* Remove packets,bytes,errors and dropped in hinic3_rx/tx_queue_stats (AI review)
* Remove duplicated entried in hinic3_port_stats[] (AI review)
* change stats_info.head.status to ps->head.status (AI review)
PATCH 03 V04: https://lore.kernel.org/netdev/cover.1775618797.git.zhuyikai1@h-partners.com/
* Remove restore_drop_sge in hinic3_rx_queue_stats (AI review)
* Remove hinic3_nic_stats (AI review)
* Use old_q_param to store old config and use it in error handling (Mohsin Bashir)
* Add netdev_info to inform the user that depth is trimmed (Mohsin Bashir)
* Remove const in hinic3_get_qp_stats_strings parameters (Mohsin Bashir)
* Change EOPNOTSUPP to ERANGE in is_coalesce_exceed_limit (Mohsin Bashir)
* Update nic_dev->rss_type after hinic3_set_rss_type (Mohsin Bashir)
* Modify MGMT_STATUS_CMD_UNSUPPORTED to EOPNOTSUPP for complying with the
error code specifications (Mohsin Bashir)
PATCH 03 V05:
* Clear HINIC3_CHANGE_RES_INVALID bit in error handling (AI review)
* Use low >= high to avoid low=high in is_coalesce_legal (AI review)
* As tx and rx share interrupts, we only use ETHTOOL_COALESCE_RX_USECS for
user setting to avoid user misunderstanding. So we do not add
ETHTOOL_COALESCE_TX_USECS. (Mohsin Bashir & AI review)
Fan Gong (6):
hinic3: Add ethtool queue ops
hinic3: Add ethtool statistic ops
hinic3: Add ethtool coalesce ops
hinic3: Add ethtool rss ops
hinic3: Configure netdev->watchdog_timeo to set nic tx timeout
hinic3: Remove unneeded coalesce parameters
.../ethernet/huawei/hinic3/hinic3_ethtool.c | 827 +++++++++++++++++-
.../ethernet/huawei/hinic3/hinic3_hw_intf.h | 13 +-
.../net/ethernet/huawei/hinic3/hinic3_irq.c | 16 +-
.../net/ethernet/huawei/hinic3/hinic3_main.c | 15 +
.../huawei/hinic3/hinic3_mgmt_interface.h | 39 +
.../huawei/hinic3/hinic3_netdev_ops.c | 104 ++-
.../ethernet/huawei/hinic3/hinic3_nic_cfg.c | 64 ++
.../ethernet/huawei/hinic3/hinic3_nic_cfg.h | 109 +++
.../ethernet/huawei/hinic3/hinic3_nic_dev.h | 16 +
.../ethernet/huawei/hinic3/hinic3_nic_io.h | 4 +
.../net/ethernet/huawei/hinic3/hinic3_rss.c | 487 ++++++++++-
.../net/ethernet/huawei/hinic3/hinic3_rss.h | 19 +
.../net/ethernet/huawei/hinic3/hinic3_rx.c | 59 +-
.../net/ethernet/huawei/hinic3/hinic3_rx.h | 18 +-
.../net/ethernet/huawei/hinic3/hinic3_tx.c | 71 +-
.../net/ethernet/huawei/hinic3/hinic3_tx.h | 2 +
16 files changed, 1836 insertions(+), 27 deletions(-)
base-commit: 8e7adcf81564a3fe886a6270eea7558f063e5538
--
2.43.0
^ permalink raw reply
* Re: [PATCH bpf-next v2 2/3] bpf: Use kmalloc_nolock() universally in local storage
From: bot+bpf-ci @ 2026-04-11 2:36 UTC (permalink / raw)
To: ameryhung, bpf
Cc: netdev, alexei.starovoitov, andrii, daniel, martin.lau, memxor,
ameryhung, kernel-team, ast, andrii, daniel, martin.lau, eddyz87,
yonghong.song, clm, ihor.solodrai
In-Reply-To: <20260411015419.114016-3-ameryhung@gmail.com>
[-- Attachment #1: Type: text/plain, Size: 2323 bytes --]
> diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
> index cad17ca8552f..bc687b9d25a9 100644
> --- a/kernel/bpf/bpf_local_storage.c
> +++ b/kernel/bpf/bpf_local_storage.c
[ ... ]
> @@ -75,18 +75,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
> if (mem_charge(smap, owner, smap->elem_size))
> return NULL;
>
> - if (smap->use_kmalloc_nolock) {
> - selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
> - __GFP_ZERO, NUMA_NO_NODE);
> - } else {
> - selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
> - gfp_flags | __GFP_NOWARN);
> - }
> + selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
> + __GFP_ZERO, NUMA_NO_NODE);
With this change, socket and inode local storage now always go through
kmalloc_nolock(). kmalloc_nolock_noprof() in mm/slub.c has a hard size
limit:
if (unlikely(size > KMALLOC_MAX_CACHE_SIZE))
return NULL;
KMALLOC_MAX_CACHE_SIZE is 1 << (PAGE_SHIFT + 1), which is 8KB on
x86_64 with 4KB pages. But BPF_LOCAL_STORAGE_MAX_VALUE_SIZE in
bpf_local_storage.h is calculated from KMALLOC_MAX_SIZE (not
KMALLOC_MAX_CACHE_SIZE):
#define BPF_LOCAL_STORAGE_MAX_VALUE_SIZE \
min_t(u32, \
(KMALLOC_MAX_SIZE - MAX_BPF_STACK - \
sizeof(struct bpf_local_storage_elem)), \
(U16_MAX - sizeof(struct bpf_local_storage_elem)))
This allows value sizes up to ~65KB. Before this patch, socket and
inode storage used bpf_map_kzalloc() (backed by regular kmalloc)
which could handle those large sizes. After this patch, any
elem_size above KMALLOC_MAX_CACHE_SIZE will silently fail: the map
creation succeeds via bpf_local_storage_map_alloc_check() but every
element allocation returns NULL.
Should BPF_LOCAL_STORAGE_MAX_VALUE_SIZE be updated to use
KMALLOC_MAX_CACHE_SIZE instead of KMALLOC_MAX_SIZE now that all
storage types go through kmalloc_nolock()?
Slava Imameev raised the same concern for task storage in
https://lore.kernel.org/bpf/20260410014341.47043-1-slava.imameev@crowdstrike.com/
[ ... ]
---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md
CI run summary: https://github.com/kernel-patches/bpf/actions/runs/24272235145
^ permalink raw reply
* [PATCH net] net/sched: taprio: fix use-after-free in advance_sched() on schedule switch
From: Vinicius Costa Gomes @ 2026-04-11 1:57 UTC (permalink / raw)
To: Vladimir Oltean, Jamal Hadi Salim, Jiri Pirko, David S. Miller,
Eric Dumazet, Jakub Kicinski, Paolo Abeni, Simon Horman
Cc: netdev, linux-kernel, Junxi Qian, Vinicius Costa Gomes
In advance_sched(), when should_change_schedules() returns true,
switch_schedules() is called to promote the admin schedule to oper.
switch_schedules() queues the old oper schedule for RCU freeing via
call_rcu(), but 'next' still points into an entry of the old oper
schedule. The subsequent 'next->end_time = end_time' and
rcu_assign_pointer(q->current_entry, next) are use-after-free.
Fix this by selecting 'next' from the new oper schedule immediately
after switch_schedules(), and using its pre-calculated end_time.
setup_first_end_time() sets the first entry's end_time to
base_time + interval when the schedule is installed, so the value
is already correct.
The deleted 'end_time = sched_base_time(admin)' assignment was also
harmful independently: it would overwrite the new first entry's
pre-calculated end_time with just base_time.
Fixes: a3d43c0d56f1 ("taprio: Add support adding an admin schedule")
Reported-by: Junxi Qian <qjx1298677004@gmail.com>
Signed-off-by: Vinicius Costa Gomes <vinicius.gomes@intel.com>
---
net/sched/sch_taprio.c | 9 +++++----
1 file changed, 5 insertions(+), 4 deletions(-)
diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c
index f721c03514f6..0316f2dee06a 100644
--- a/net/sched/sch_taprio.c
+++ b/net/sched/sch_taprio.c
@@ -972,11 +972,12 @@ static enum hrtimer_restart advance_sched(struct hrtimer *timer)
}
if (should_change_schedules(admin, oper, end_time)) {
- /* Set things so the next time this runs, the new
- * schedule runs.
- */
- end_time = sched_base_time(admin);
switch_schedules(q, &admin, &oper);
+ /* After changing schedules, the next entry is the first one
+ * in the new schedule, with a pre-calculated end_time.
+ */
+ next = list_first_entry(&oper->entries, struct sched_entry, list);
+ end_time = next->end_time;
}
next->end_time = end_time;
---
base-commit: 02f72964395911e7a09bb2ea2fe6f79eda4ea2c2
change-id: 20260410-taprio-user-after-free-fix-net-fbd67f66512b
Best regards,
--
Vinicius
^ permalink raw reply related
* [PATCH bpf-next v2 3/3] bpf: Remove gfp_flags plumbing from bpf_local_storage_update()
From: Amery Hung @ 2026-04-11 1:54 UTC (permalink / raw)
To: bpf
Cc: netdev, alexei.starovoitov, andrii, daniel, martin.lau, memxor,
ameryhung, kernel-team
In-Reply-To: <20260411015419.114016-1-ameryhung@gmail.com>
Remove the check that rejects sleepable BPF programs from doing
BPF_ANY/BPF_EXIST updates on local storage. This restriction was added
in commit b00fa38a9c1c ("bpf: Enable non-atomic allocations in local
storage") because kzalloc(GFP_KERNEL) could sleep inside
local_storage->lock. This is no longer a concern: all local storage
allocations now use kmalloc_nolock() which never sleeps.
In addition, since kmalloc_nolock() only accepts __GFP_ACCOUNT,
__GFP_ZERO and __GFP_NO_OBJ_EXT, the gfp_flags parameter plumbing from
bpf_*_storage_get() to bpf_local_storage_update() becomes dead code.
Remove gfp_flags from bpf_selem_alloc(), bpf_local_storage_alloc() and
bpf_local_storage_update(). Drop the hidden 5th argument from
bpf_*_storage_get helpers, and remove the verifier patching that
injected GFP_KERNEL/GFP_ATOMIC into the fifth argument.
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
include/linux/bpf_local_storage.h | 7 +++----
kernel/bpf/bpf_cgrp_storage.c | 9 ++++-----
kernel/bpf/bpf_inode_storage.c | 9 ++++-----
kernel/bpf/bpf_local_storage.c | 16 ++++++----------
kernel/bpf/bpf_task_storage.c | 9 ++++-----
kernel/bpf/verifier.c | 26 --------------------------
net/core/bpf_sk_storage.c | 21 +++++++++------------
7 files changed, 30 insertions(+), 67 deletions(-)
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index dced54e9265f..9e4f5c45c974 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -188,7 +188,7 @@ int bpf_selem_link_map(struct bpf_local_storage_map *smap,
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner, void *value,
- bool swap_uptrs, gfp_t gfp_flags);
+ bool swap_uptrs);
void bpf_selem_free(struct bpf_local_storage_elem *selem,
bool reuse_now);
@@ -196,12 +196,11 @@ void bpf_selem_free(struct bpf_local_storage_elem *selem,
int
bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem,
- gfp_t gfp_flags);
+ struct bpf_local_storage_elem *first_selem);
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags);
+ void *value, u64 map_flags, bool swap_uptrs);
u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map);
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index d93ac2866748..c76e9b0fabba 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -76,7 +76,7 @@ static long bpf_cgrp_storage_update_elem(struct bpf_map *map, void *key,
return PTR_ERR(cgroup);
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, map_flags, false, GFP_ATOMIC);
+ value, map_flags, false);
cgroup_put(cgroup);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -122,9 +122,8 @@ static void cgroup_storage_map_free(struct bpf_map *map)
bpf_local_storage_map_free(map, &cgroup_cache);
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
+ void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -143,7 +142,7 @@ BPF_CALL_5(bpf_cgrp_storage_get, struct bpf_map *, map, struct cgroup *, cgroup,
if (!percpu_ref_is_dying(&cgroup->self.refcnt) &&
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE))
sdata = bpf_local_storage_update(cgroup, (struct bpf_local_storage_map *)map,
- value, BPF_NOEXIST, false, gfp_flags);
+ value, BPF_NOEXIST, false);
out:
return IS_ERR_OR_NULL(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index efc8996a4c0a..0da8d923e39d 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -98,7 +98,7 @@ static long bpf_fd_inode_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(file_inode(fd_file(f)),
(struct bpf_local_storage_map *)map,
- value, map_flags, false, GFP_ATOMIC);
+ value, map_flags, false);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -122,9 +122,8 @@ static long bpf_fd_inode_storage_delete_elem(struct bpf_map *map, void *key)
return inode_storage_delete(file_inode(fd_file(f)), map);
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
+ void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -150,7 +149,7 @@ BPF_CALL_5(bpf_inode_storage_get, struct bpf_map *, map, struct inode *, inode,
if (flags & BPF_LOCAL_STORAGE_GET_F_CREATE) {
sdata = bpf_local_storage_update(
inode, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, false, gfp_flags);
+ BPF_NOEXIST, false);
return IS_ERR(sdata) ? (unsigned long)NULL :
(unsigned long)sdata->data;
}
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index bc687b9d25a9..6fc6a4b672b5 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -68,7 +68,7 @@ static bool selem_linked_to_map(const struct bpf_local_storage_elem *selem)
struct bpf_local_storage_elem *
bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
- void *value, bool swap_uptrs, gfp_t gfp_flags)
+ void *value, bool swap_uptrs)
{
struct bpf_local_storage_elem *selem;
@@ -475,8 +475,7 @@ static int check_flags(const struct bpf_local_storage_data *old_sdata,
int bpf_local_storage_alloc(void *owner,
struct bpf_local_storage_map *smap,
- struct bpf_local_storage_elem *first_selem,
- gfp_t gfp_flags)
+ struct bpf_local_storage_elem *first_selem)
{
struct bpf_local_storage *prev_storage, *storage;
struct bpf_local_storage **owner_storage_ptr;
@@ -546,7 +545,7 @@ int bpf_local_storage_alloc(void *owner,
*/
struct bpf_local_storage_data *
bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
- void *value, u64 map_flags, bool swap_uptrs, gfp_t gfp_flags)
+ void *value, u64 map_flags, bool swap_uptrs)
{
struct bpf_local_storage_data *old_sdata = NULL;
struct bpf_local_storage_elem *alloc_selem, *selem = NULL;
@@ -563,9 +562,6 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
!btf_record_has_field(smap->map.record, BPF_SPIN_LOCK)))
return ERR_PTR(-EINVAL);
- if (gfp_flags == GFP_KERNEL && (map_flags & ~BPF_F_LOCK) != BPF_NOEXIST)
- return ERR_PTR(-EINVAL);
-
local_storage = rcu_dereference_check(*owner_storage(smap, owner),
bpf_rcu_lock_held());
if (!local_storage || hlist_empty(&local_storage->list)) {
@@ -574,11 +570,11 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
if (err)
return ERR_PTR(err);
- selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
+ selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
if (!selem)
return ERR_PTR(-ENOMEM);
- err = bpf_local_storage_alloc(owner, smap, selem, gfp_flags);
+ err = bpf_local_storage_alloc(owner, smap, selem);
if (err) {
bpf_selem_free(selem, true);
mem_uncharge(smap, owner, smap->elem_size);
@@ -608,7 +604,7 @@ bpf_local_storage_update(void *owner, struct bpf_local_storage_map *smap,
/* A lookup has just been done before and concluded a new selem is
* needed. The chance of an unnecessary alloc is unlikely.
*/
- alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs, gfp_flags);
+ alloc_selem = selem = bpf_selem_alloc(smap, owner, value, swap_uptrs);
if (!alloc_selem)
return ERR_PTR(-ENOMEM);
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 55f4f22bb212..4b342be29eac 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -118,7 +118,7 @@ static long bpf_pid_task_storage_update_elem(struct bpf_map *map, void *key,
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value, map_flags,
- true, GFP_ATOMIC);
+ true);
err = PTR_ERR_OR_ZERO(sdata);
out:
@@ -165,9 +165,8 @@ static long bpf_pid_task_storage_delete_elem(struct bpf_map *map, void *key)
return err;
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
- task, void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
+ task, void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -184,7 +183,7 @@ BPF_CALL_5(bpf_task_storage_get, struct bpf_map *, map, struct task_struct *,
(flags & BPF_LOCAL_STORAGE_GET_F_CREATE)) {
sdata = bpf_local_storage_update(
task, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, false, gfp_flags);
+ BPF_NOEXIST, false);
return IS_ERR(sdata) ? (unsigned long)NULL : (unsigned long)sdata->data;
}
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 566311dd4fba..c44d81bfce2e 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -589,14 +589,6 @@ static bool is_may_goto_insn_at(struct bpf_verifier_env *env, int insn_idx)
return bpf_is_may_goto_insn(&env->prog->insnsi[insn_idx]);
}
-static bool is_storage_get_function(enum bpf_func_id func_id)
-{
- return func_id == BPF_FUNC_sk_storage_get ||
- func_id == BPF_FUNC_inode_storage_get ||
- func_id == BPF_FUNC_task_storage_get ||
- func_id == BPF_FUNC_cgrp_storage_get;
-}
-
static bool helper_multiple_ref_obj_use(enum bpf_func_id func_id,
const struct bpf_map *map)
{
@@ -24414,24 +24406,6 @@ static int do_misc_fixups(struct bpf_verifier_env *env)
goto patch_call_imm;
}
- if (is_storage_get_function(insn->imm)) {
- if (env->insn_aux_data[i + delta].non_sleepable)
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_ATOMIC);
- else
- insn_buf[0] = BPF_MOV64_IMM(BPF_REG_5, (__force __s32)GFP_KERNEL);
- insn_buf[1] = *insn;
- cnt = 2;
-
- new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
- if (!new_prog)
- return -ENOMEM;
-
- delta += cnt - 1;
- env->prog = prog = new_prog;
- insn = new_prog->insnsi + i + delta;
- goto patch_call_imm;
- }
-
/* bpf_per_cpu_ptr() and bpf_this_cpu_ptr() */
if (env->insn_aux_data[i + delta].call_with_percpu_alloc_ptr) {
/* patch with 'r1 = *(u64 *)(r1 + 0)' since for percpu data,
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index 9fb22e352beb..14eb7812bda4 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -106,7 +106,7 @@ static long bpf_fd_sk_storage_update_elem(struct bpf_map *map, void *key,
if (sock) {
sdata = bpf_local_storage_update(
sock->sk, (struct bpf_local_storage_map *)map, value,
- map_flags, false, GFP_ATOMIC);
+ map_flags, false);
sockfd_put(sock);
return PTR_ERR_OR_ZERO(sdata);
}
@@ -137,7 +137,7 @@ bpf_sk_storage_clone_elem(struct sock *newsk,
{
struct bpf_local_storage_elem *copy_selem;
- copy_selem = bpf_selem_alloc(smap, newsk, NULL, false, GFP_ATOMIC);
+ copy_selem = bpf_selem_alloc(smap, newsk, NULL, false);
if (!copy_selem)
return NULL;
@@ -202,7 +202,7 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
}
bpf_selem_link_storage_nolock(new_sk_storage, copy_selem);
} else {
- ret = bpf_local_storage_alloc(newsk, smap, copy_selem, GFP_ATOMIC);
+ ret = bpf_local_storage_alloc(newsk, smap, copy_selem);
if (ret) {
bpf_selem_free(copy_selem, true);
atomic_sub(smap->elem_size,
@@ -227,9 +227,8 @@ int bpf_sk_storage_clone(const struct sock *sk, struct sock *newsk)
return ret;
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
+ void *, value, u64, flags)
{
struct bpf_local_storage_data *sdata;
@@ -250,7 +249,7 @@ BPF_CALL_5(bpf_sk_storage_get, struct bpf_map *, map, struct sock *, sk,
refcount_inc_not_zero(&sk->sk_refcnt)) {
sdata = bpf_local_storage_update(
sk, (struct bpf_local_storage_map *)map, value,
- BPF_NOEXIST, false, gfp_flags);
+ BPF_NOEXIST, false);
/* sk must be a fullsock (guaranteed by verifier),
* so sock_gen_put() is unnecessary.
*/
@@ -383,16 +382,14 @@ static bool bpf_sk_storage_tracing_allowed(const struct bpf_prog *prog)
return false;
}
-/* *gfp_flags* is a hidden argument provided by the verifier */
-BPF_CALL_5(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
- void *, value, u64, flags, gfp_t, gfp_flags)
+BPF_CALL_4(bpf_sk_storage_get_tracing, struct bpf_map *, map, struct sock *, sk,
+ void *, value, u64, flags)
{
WARN_ON_ONCE(!bpf_rcu_lock_held());
if (in_hardirq() || in_nmi())
return (unsigned long)NULL;
- return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags,
- gfp_flags);
+ return (unsigned long)____bpf_sk_storage_get(map, sk, value, flags);
}
BPF_CALL_2(bpf_sk_storage_delete_tracing, struct bpf_map *, map,
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v2 2/3] bpf: Use kmalloc_nolock() universally in local storage
From: Amery Hung @ 2026-04-11 1:54 UTC (permalink / raw)
To: bpf
Cc: netdev, alexei.starovoitov, andrii, daniel, martin.lau, memxor,
ameryhung, kernel-team
In-Reply-To: <20260411015419.114016-1-ameryhung@gmail.com>
Switch to kmalloc_nolock() universally in local storage. Socket local
storage didn't move to kmalloc_nolock() when BPF memory allocator was
replaced by it for performance reasons. Now that kfree_rcu() supports
freeing memory allocated by kmalloc_nolock(), we can move the remaining
local storages to use kmalloc_nolock() and cleanup the cluttered free
paths.
Use kfree() instead of kfree_nolock() in bpf_selem_free_trace_rcu() and
bpf_local_storage_free_trace_rcu(). Both callbacks run in process context
where spinning is allowed, so kfree_nolock() is unnecessary.
Benchmark:
./bench -p 1 local-storage-create --storage-type socket \
--batch-size {16,32,64}
The benchmark is a microbenchmark stress-testing how fast local storage
can be created. There is no measurable throughput change for socket local
storage after switching from kzalloc() to kmalloc_nolock().
Socket local storage
batch creation speed diff
--------------- ---- ------------------ ----
Baseline 16 433.9 ± 0.6 k/s
32 434.3 ± 1.4 k/s
64 434.2 ± 0.7 k/s
After 16 439.0 ± 1.9 k/s +1.2%
32 437.3 ± 2.0 k/s +0.7%
64 435.8 ± 2.5k/s +0.4%
Also worth noting that the baseline got a 5% throughput boost when sheaf
replaces percpu partial slab recently [0].
[0] https://lore.kernel.org/bpf/20260123-sheaves-for-all-v4-0-041323d506f7@suse.cz/
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
include/linux/bpf_local_storage.h | 8 +-
kernel/bpf/bpf_cgrp_storage.c | 2 +-
kernel/bpf/bpf_inode_storage.c | 2 +-
kernel/bpf/bpf_local_storage.c | 130 ++++--------------------------
kernel/bpf/bpf_task_storage.c | 2 +-
net/core/bpf_sk_storage.c | 2 +-
6 files changed, 21 insertions(+), 125 deletions(-)
diff --git a/include/linux/bpf_local_storage.h b/include/linux/bpf_local_storage.h
index 8157e8da61d4..dced54e9265f 100644
--- a/include/linux/bpf_local_storage.h
+++ b/include/linux/bpf_local_storage.h
@@ -54,7 +54,6 @@ struct bpf_local_storage_map {
u32 bucket_log;
u16 elem_size;
u16 cache_idx;
- bool use_kmalloc_nolock;
};
struct bpf_local_storage_data {
@@ -86,8 +85,7 @@ struct bpf_local_storage_elem {
*/
};
atomic_t state;
- bool use_kmalloc_nolock;
- /* 3 bytes hole */
+ /* 4 bytes hole */
/* The data is stored in another cacheline to minimize
* the number of cachelines access during a cache hit.
*/
@@ -104,7 +102,6 @@ struct bpf_local_storage {
rqspinlock_t lock; /* Protect adding/removing from the "list" */
u64 mem_charge; /* Copy of mem charged to owner. Protected by "lock" */
refcount_t owner_refcnt;/* Used to pin owner when map_free is uncharging */
- bool use_kmalloc_nolock;
};
/* U16_MAX is much more than enough for sk local storage
@@ -137,8 +134,7 @@ int bpf_local_storage_map_alloc_check(union bpf_attr *attr);
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
- struct bpf_local_storage_cache *cache,
- bool use_kmalloc_nolock);
+ struct bpf_local_storage_cache *cache);
void __bpf_local_storage_insert_cache(struct bpf_local_storage *local_storage,
struct bpf_local_storage_map *smap,
diff --git a/kernel/bpf/bpf_cgrp_storage.c b/kernel/bpf/bpf_cgrp_storage.c
index c2a2ead1f466..d93ac2866748 100644
--- a/kernel/bpf/bpf_cgrp_storage.c
+++ b/kernel/bpf/bpf_cgrp_storage.c
@@ -114,7 +114,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *cgroup_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &cgroup_cache, true);
+ return bpf_local_storage_map_alloc(attr, &cgroup_cache);
}
static void cgroup_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_inode_storage.c b/kernel/bpf/bpf_inode_storage.c
index e86734609f3d..efc8996a4c0a 100644
--- a/kernel/bpf/bpf_inode_storage.c
+++ b/kernel/bpf/bpf_inode_storage.c
@@ -179,7 +179,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key,
static struct bpf_map *inode_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &inode_cache, false);
+ return bpf_local_storage_map_alloc(attr, &inode_cache);
}
static void inode_storage_map_free(struct bpf_map *map)
diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c
index cad17ca8552f..bc687b9d25a9 100644
--- a/kernel/bpf/bpf_local_storage.c
+++ b/kernel/bpf/bpf_local_storage.c
@@ -75,18 +75,12 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
if (mem_charge(smap, owner, smap->elem_size))
return NULL;
- if (smap->use_kmalloc_nolock) {
- selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
- __GFP_ZERO, NUMA_NO_NODE);
- } else {
- selem = bpf_map_kzalloc(&smap->map, smap->elem_size,
- gfp_flags | __GFP_NOWARN);
- }
+ selem = bpf_map_kmalloc_nolock(&smap->map, smap->elem_size,
+ __GFP_ZERO, NUMA_NO_NODE);
if (selem) {
RCU_INIT_POINTER(SDATA(selem)->smap, smap);
atomic_set(&selem->state, 0);
- selem->use_kmalloc_nolock = smap->use_kmalloc_nolock;
if (value) {
/* No need to call check_and_init_map_value as memory is zero init */
@@ -102,8 +96,7 @@ bpf_selem_alloc(struct bpf_local_storage_map *smap, void *owner,
return NULL;
}
-/* rcu tasks trace callback for use_kmalloc_nolock == false */
-static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
+static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage *local_storage;
@@ -115,47 +108,14 @@ static void __bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
kfree(local_storage);
}
-/* Handle use_kmalloc_nolock == false */
-static void __bpf_local_storage_free(struct bpf_local_storage *local_storage,
- bool vanilla_rcu)
-{
- if (vanilla_rcu)
- kfree_rcu(local_storage, rcu);
- else
- call_rcu_tasks_trace(&local_storage->rcu,
- __bpf_local_storage_free_trace_rcu);
-}
-
-static void bpf_local_storage_free_rcu(struct rcu_head *rcu)
-{
- struct bpf_local_storage *local_storage;
-
- local_storage = container_of(rcu, struct bpf_local_storage, rcu);
- kfree_nolock(local_storage);
-}
-
-static void bpf_local_storage_free_trace_rcu(struct rcu_head *rcu)
-{
- /*
- * RCU Tasks Trace grace period implies RCU grace period, do
- * kfree() directly.
- */
- bpf_local_storage_free_rcu(rcu);
-}
-
static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
bool reuse_now)
{
if (!local_storage)
return;
- if (!local_storage->use_kmalloc_nolock) {
- __bpf_local_storage_free(local_storage, reuse_now);
- return;
- }
-
if (reuse_now) {
- call_rcu(&local_storage->rcu, bpf_local_storage_free_rcu);
+ kfree_rcu(local_storage, rcu);
return;
}
@@ -163,42 +123,7 @@ static void bpf_local_storage_free(struct bpf_local_storage *local_storage,
bpf_local_storage_free_trace_rcu);
}
-/* rcu callback for use_kmalloc_nolock == false */
-static void __bpf_selem_free_rcu(struct rcu_head *rcu)
-{
- struct bpf_local_storage_elem *selem;
- struct bpf_local_storage_map *smap;
-
- selem = container_of(rcu, struct bpf_local_storage_elem, rcu);
- /* bpf_selem_unlink_nofail may have already cleared smap and freed fields. */
- smap = rcu_dereference_check(SDATA(selem)->smap, 1);
-
- if (smap)
- bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- kfree(selem);
-}
-
-/* rcu tasks trace callback for use_kmalloc_nolock == false */
-static void __bpf_selem_free_trace_rcu(struct rcu_head *rcu)
-{
- /*
- * RCU Tasks Trace grace period implies RCU grace period, do
- * kfree() directly.
- */
- __bpf_selem_free_rcu(rcu);
-}
-
-/* Handle use_kmalloc_nolock == false */
-static void __bpf_selem_free(struct bpf_local_storage_elem *selem,
- bool vanilla_rcu)
-{
- if (vanilla_rcu)
- call_rcu(&selem->rcu, __bpf_selem_free_rcu);
- else
- call_rcu_tasks_trace(&selem->rcu, __bpf_selem_free_trace_rcu);
-}
-
-static void bpf_selem_free_rcu(struct rcu_head *rcu)
+static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
{
struct bpf_local_storage_elem *selem;
struct bpf_local_storage_map *smap;
@@ -209,37 +134,24 @@ static void bpf_selem_free_rcu(struct rcu_head *rcu)
if (smap)
bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
- kfree_nolock(selem);
-}
-
-static void bpf_selem_free_trace_rcu(struct rcu_head *rcu)
-{
/*
* RCU Tasks Trace grace period implies RCU grace period, do
* kfree() directly.
*/
- bpf_selem_free_rcu(rcu);
+ kfree(selem);
}
void bpf_selem_free(struct bpf_local_storage_elem *selem,
bool reuse_now)
{
- if (!selem->use_kmalloc_nolock) {
- /*
- * No uptr will be unpin even when reuse_now == false since uptr
- * is only supported in task local storage, where
- * smap->use_kmalloc_nolock == true.
- */
- __bpf_selem_free(selem, reuse_now);
- return;
- }
+ struct bpf_local_storage_map *smap;
+
+ smap = rcu_dereference_check(SDATA(selem)->smap, 1);
if (reuse_now) {
- /*
- * While it is okay to call bpf_obj_free_fields() that unpins uptr when
- * reuse_now == true, keep it in bpf_selem_free_rcu() for simplicity.
- */
- call_rcu(&selem->rcu, bpf_selem_free_rcu);
+ if (smap)
+ bpf_obj_free_fields(smap->map.record, SDATA(selem)->data);
+ kfree_rcu(selem, rcu);
return;
}
@@ -576,12 +488,8 @@ int bpf_local_storage_alloc(void *owner,
if (err)
return err;
- if (smap->use_kmalloc_nolock)
- storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
- __GFP_ZERO, NUMA_NO_NODE);
- else
- storage = bpf_map_kzalloc(&smap->map, sizeof(*storage),
- gfp_flags | __GFP_NOWARN);
+ storage = bpf_map_kmalloc_nolock(&smap->map, sizeof(*storage),
+ __GFP_ZERO, NUMA_NO_NODE);
if (!storage) {
err = -ENOMEM;
goto uncharge;
@@ -591,7 +499,6 @@ int bpf_local_storage_alloc(void *owner,
raw_res_spin_lock_init(&storage->lock);
storage->owner = owner;
storage->mem_charge = sizeof(*storage);
- storage->use_kmalloc_nolock = smap->use_kmalloc_nolock;
refcount_set(&storage->owner_refcnt, 1);
bpf_selem_link_storage_nolock(storage, first_selem);
@@ -868,8 +775,7 @@ u64 bpf_local_storage_map_mem_usage(const struct bpf_map *map)
struct bpf_map *
bpf_local_storage_map_alloc(union bpf_attr *attr,
- struct bpf_local_storage_cache *cache,
- bool use_kmalloc_nolock)
+ struct bpf_local_storage_cache *cache)
{
struct bpf_local_storage_map *smap;
unsigned int i;
@@ -901,12 +807,6 @@ bpf_local_storage_map_alloc(union bpf_attr *attr,
smap->elem_size = offsetof(struct bpf_local_storage_elem,
sdata.data[attr->value_size]);
- /* In PREEMPT_RT, kmalloc(GFP_ATOMIC) is still not safe in non
- * preemptible context. Thus, enforce all storages to use
- * kmalloc_nolock() when CONFIG_PREEMPT_RT is enabled.
- */
- smap->use_kmalloc_nolock = IS_ENABLED(CONFIG_PREEMPT_RT) ? true : use_kmalloc_nolock;
-
smap->cache_idx = bpf_local_storage_cache_idx_get(cache);
return &smap->map;
diff --git a/kernel/bpf/bpf_task_storage.c b/kernel/bpf/bpf_task_storage.c
index 605506792b5b..55f4f22bb212 100644
--- a/kernel/bpf/bpf_task_storage.c
+++ b/kernel/bpf/bpf_task_storage.c
@@ -212,7 +212,7 @@ static int notsupp_get_next_key(struct bpf_map *map, void *key, void *next_key)
static struct bpf_map *task_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &task_cache, true);
+ return bpf_local_storage_map_alloc(attr, &task_cache);
}
static void task_storage_map_free(struct bpf_map *map)
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index f8338acebf07..9fb22e352beb 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -68,7 +68,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map)
static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
{
- return bpf_local_storage_map_alloc(attr, &sk_cache, false);
+ return bpf_local_storage_map_alloc(attr, &sk_cache);
}
static int notsupp_get_next_key(struct bpf_map *map, void *key,
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v2 1/3] selftests/bpf: Remove kmalloc tracing from local storage create bench
From: Amery Hung @ 2026-04-11 1:54 UTC (permalink / raw)
To: bpf
Cc: netdev, alexei.starovoitov, andrii, daniel, martin.lau, memxor,
ameryhung, kernel-team
In-Reply-To: <20260411015419.114016-1-ameryhung@gmail.com>
Remove the raw_tp/kmalloc BPF program and its associated reporting from
the local storage create benchmark. The kmalloc count per create is not
a useful metric as different code paths use different allocators (e.g.
kmalloc_nolock vs kzalloc), introducing noise that makes the number
hard to interpret.
Keep total_creates in the summary output as it is useful for normalizing
perf statistics collected alongside the benchmark.
Signed-off-by: Amery Hung <ameryhung@gmail.com>
---
.../bpf/benchs/bench_local_storage_create.c | 21 ++++++-------------
.../bpf/progs/bench_local_storage_create.c | 11 ----------
2 files changed, 6 insertions(+), 26 deletions(-)
diff --git a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
index e2ff8ea1cb79..71e38000ee06 100644
--- a/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
+++ b/tools/testing/selftests/bpf/benchs/bench_local_storage_create.c
@@ -101,11 +101,6 @@ static void setup(void)
}
}
- if (!bpf_program__attach(skel->progs.kmalloc)) {
- fprintf(stderr, "Error attaching bpf program\n");
- exit(1);
- }
-
threads = calloc(env.producer_cnt, sizeof(*threads));
if (!threads) {
@@ -140,7 +135,6 @@ static void setup(void)
static void measure(struct bench_res *res)
{
res->hits = atomic_swap(&skel->bss->create_cnts, 0);
- res->drops = atomic_swap(&skel->bss->kmalloc_cnts, 0);
}
static void *sk_producer(void *input)
@@ -203,28 +197,25 @@ static void *producer(void *input)
static void report_progress(int iter, struct bench_res *res, long delta_ns)
{
- double creates_per_sec, kmallocs_per_create;
+ double creates_per_sec;
creates_per_sec = res->hits / 1000.0 / (delta_ns / 1000000000.0);
- kmallocs_per_create = (double)res->drops / res->hits;
printf("Iter %3d (%7.3lfus): ",
iter, (delta_ns - 1000000000) / 1000.0);
- printf("creates %8.3lfk/s (%7.3lfk/prod), ",
+ printf("creates %8.3lfk/s (%7.3lfk/prod)\n",
creates_per_sec, creates_per_sec / env.producer_cnt);
- printf("%3.2lf kmallocs/create\n", kmallocs_per_create);
}
static void report_final(struct bench_res res[], int res_cnt)
{
double creates_mean = 0.0, creates_stddev = 0.0;
- long total_creates = 0, total_kmallocs = 0;
+ long total_creates = 0;
int i;
for (i = 0; i < res_cnt; i++) {
creates_mean += res[i].hits / 1000.0 / (0.0 + res_cnt);
total_creates += res[i].hits;
- total_kmallocs += res[i].drops;
}
if (res_cnt > 1) {
@@ -234,9 +225,9 @@ static void report_final(struct bench_res res[], int res_cnt)
(res_cnt - 1.0);
creates_stddev = sqrt(creates_stddev);
}
- printf("Summary: creates %8.3lf \u00B1 %5.3lfk/s (%7.3lfk/prod), ",
- creates_mean, creates_stddev, creates_mean / env.producer_cnt);
- printf("%4.2lf kmallocs/create\n", (double)total_kmallocs / total_creates);
+ printf("Summary: creates %8.3lf \u00B1 %5.3lfk/s (%7.3lfk/prod), %ld total\n",
+ creates_mean, creates_stddev, creates_mean / env.producer_cnt,
+ total_creates);
if (create_owner_errs || skel->bss->create_errs)
printf("%s() errors %ld create_errs %ld\n",
storage_type == BPF_MAP_TYPE_SK_STORAGE ?
diff --git a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c
index c8ec0d0368e4..25ca6045fea3 100644
--- a/tools/testing/selftests/bpf/progs/bench_local_storage_create.c
+++ b/tools/testing/selftests/bpf/progs/bench_local_storage_create.c
@@ -8,7 +8,6 @@
long create_errs = 0;
long create_cnts = 0;
-long kmalloc_cnts = 0;
__u32 bench_pid = 0;
struct storage {
@@ -29,16 +28,6 @@ struct {
__type(value, struct storage);
} task_storage_map SEC(".maps");
-SEC("raw_tp/kmalloc")
-int BPF_PROG(kmalloc, unsigned long call_site, const void *ptr,
- size_t bytes_req, size_t bytes_alloc, gfp_t gfp_flags,
- int node)
-{
- __sync_fetch_and_add(&kmalloc_cnts, 1);
-
- return 0;
-}
-
SEC("tp_btf/sched_process_fork")
int BPF_PROG(sched_process_fork, struct task_struct *parent, struct task_struct *child)
{
--
2.52.0
^ permalink raw reply related
* [PATCH bpf-next v2 0/3] Use kmalloc_nolock() universally in BPF local storage
From: Amery Hung @ 2026-04-11 1:54 UTC (permalink / raw)
To: bpf
Cc: netdev, alexei.starovoitov, andrii, daniel, martin.lau, memxor,
ameryhung, kernel-team
Socket local storage did not convert to use kmalloc_nolock() since there
were observable performance degredation due to kfree_nolock() hitting the
slow path and the lack of kfree_rcu()-like batching freeing. Now that
these concern were addressed in slub, convert all remaining local storage
flavors to use kmalloc_nolock().
v1 -> v2:
- Fix build (CI, Alexei)
Amery Hung (3):
selftests/bpf: Remove kmalloc tracing from local storage create bench
bpf: Use kmalloc_nolock() universally in local storage
bpf: Remove gfp_flags plumbing from bpf_local_storage_update()
include/linux/bpf_local_storage.h | 15 +-
kernel/bpf/bpf_cgrp_storage.c | 11 +-
kernel/bpf/bpf_inode_storage.c | 11 +-
kernel/bpf/bpf_local_storage.c | 146 +++---------------
kernel/bpf/bpf_task_storage.c | 11 +-
kernel/bpf/verifier.c | 26 ----
net/core/bpf_sk_storage.c | 23 ++-
.../bpf/benchs/bench_local_storage_create.c | 21 +--
.../bpf/progs/bench_local_storage_create.c | 11 --
9 files changed, 57 insertions(+), 218 deletions(-)
--
2.52.0
^ permalink raw reply
* [PATCH net-next v2 2/2] selftests/bpf: verify syncookie statistics in tcp_custom_syncookie
From: Jiayuan Chen @ 2026-04-11 1:32 UTC (permalink / raw)
To: netdev
Cc: Jiayuan Chen, Eric Dumazet, Neal Cardwell, Kuniyuki Iwashima,
David S. Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
David Ahern, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
Shuah Khan, linux-kernel, bpf, linux-kselftest
In-Reply-To: <20260411013211.225834-1-jiayuan.chen@linux.dev>
Add read_tcpext_snmp() helper to network_helpers which reads a
TcpExt SNMP counter via nstat, and use it in the tcp_custom_syncookie
test to verify that LINUX_MIB_SYNCOOKIESRECV is incremented and
LINUX_MIB_SYNCOOKIESFAILED stays unchanged across a successful
BPF custom syncookie validation.
The delta is captured between start_server() and accept(), which
covers the full SYN/ACK/cookie-check path for one connection.
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
---
tools/testing/selftests/bpf/network_helpers.c | 22 +++++++++++++++++++
tools/testing/selftests/bpf/network_helpers.h | 1 +
.../bpf/prog_tests/tcp_custom_syncookie.c | 20 +++++++++++++++++
3 files changed, 43 insertions(+)
diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c
index b82f572641b7..3388dd5112b6 100644
--- a/tools/testing/selftests/bpf/network_helpers.c
+++ b/tools/testing/selftests/bpf/network_helpers.c
@@ -621,6 +621,28 @@ int get_socket_local_port(int sock_fd)
return -1;
}
+int read_tcpext_snmp(const char *name, unsigned long *val)
+{
+ char cmd[128], buf[128];
+ int ret = 0;
+ FILE *f;
+
+ snprintf(cmd, sizeof(cmd),
+ "nstat -az TcpExt%s | awk '/TcpExt/ {print $2}'", name);
+ f = popen(cmd, "r");
+ if (!f)
+ return -errno;
+
+ if (!fgets(buf, sizeof(buf), f)) {
+ ret = ferror(f) ? -errno : -ENODATA;
+ goto out;
+ }
+ *val = strtoul(buf, NULL, 10);
+out:
+ pclose(f);
+ return ret;
+}
+
int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param)
{
struct ifreq ifr = {0};
diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h
index 79a010c88e11..c53cd781df6e 100644
--- a/tools/testing/selftests/bpf/network_helpers.h
+++ b/tools/testing/selftests/bpf/network_helpers.h
@@ -84,6 +84,7 @@ int make_sockaddr(int family, const char *addr_str, __u16 port,
struct sockaddr_storage *addr, socklen_t *len);
char *ping_command(int family);
int get_socket_local_port(int sock_fd);
+int read_tcpext_snmp(const char *name, unsigned long *val);
int get_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param);
int set_hw_ring_size(char *ifname, struct ethtool_ringparam *ring_param);
diff --git a/tools/testing/selftests/bpf/prog_tests/tcp_custom_syncookie.c b/tools/testing/selftests/bpf/prog_tests/tcp_custom_syncookie.c
index eaf441dc7e79..6adfb4b892f8 100644
--- a/tools/testing/selftests/bpf/prog_tests/tcp_custom_syncookie.c
+++ b/tools/testing/selftests/bpf/prog_tests/tcp_custom_syncookie.c
@@ -91,12 +91,21 @@ static void transfer_message(int sender, int receiver)
static void create_connection(struct test_tcp_custom_syncookie_case *test_case)
{
+ unsigned long recv_before, recv_after;
+ unsigned long failed_before, failed_after;
int server, client, child;
server = start_server(test_case->family, test_case->type, test_case->addr, 0, 0);
if (!ASSERT_NEQ(server, -1, "start_server"))
return;
+ if (!ASSERT_OK(read_tcpext_snmp("SyncookiesRecv", &recv_before),
+ "read SyncookiesRecv before"))
+ goto close_server;
+ if (!ASSERT_OK(read_tcpext_snmp("SyncookiesFailed", &failed_before),
+ "read SyncookiesFailed before"))
+ goto close_server;
+
client = connect_to_fd(server, 0);
if (!ASSERT_NEQ(client, -1, "connect_to_fd"))
goto close_server;
@@ -105,9 +114,20 @@ static void create_connection(struct test_tcp_custom_syncookie_case *test_case)
if (!ASSERT_NEQ(child, -1, "accept"))
goto close_client;
+ if (!ASSERT_OK(read_tcpext_snmp("SyncookiesRecv", &recv_after),
+ "read SyncookiesRecv after"))
+ goto close_child;
+ if (!ASSERT_OK(read_tcpext_snmp("SyncookiesFailed", &failed_after),
+ "read SyncookiesFailed after"))
+ goto close_child;
+
+ ASSERT_EQ(recv_after - recv_before, 1, "SyncookiesRecv delta");
+ ASSERT_EQ(failed_after - failed_before, 0, "SyncookiesFailed delta");
+
transfer_message(client, child);
transfer_message(child, client);
+close_child:
close(child);
close_client:
close(client);
--
2.43.0
^ permalink raw reply related
* [PATCH net-next v2 1/2] net: add missing syncookie statistics for BPF custom syncookies
From: Jiayuan Chen @ 2026-04-11 1:32 UTC (permalink / raw)
To: netdev
Cc: Jiayuan Chen, Eric Dumazet, Neal Cardwell, Kuniyuki Iwashima,
David S. Miller, Jakub Kicinski, Paolo Abeni, Simon Horman,
David Ahern, Andrii Nakryiko, Eduard Zingerman,
Alexei Starovoitov, Daniel Borkmann, Martin KaFai Lau, Song Liu,
Yonghong Song, John Fastabend, KP Singh, Stanislav Fomichev,
Hao Luo, Jiri Olsa, Shuah Khan, linux-kernel, bpf,
linux-kselftest
1. Replace IS_ENABLED(CONFIG_BPF) with CONFIG_BPF_SYSCALL for
cookie_bpf_ok() and cookie_bpf_check(). CONFIG_BPF is selected by
CONFIG_NET unconditionally, so IS_ENABLED(CONFIG_BPF) is always
true and provides no real guard. CONFIG_BPF_SYSCALL is the correct
config for BPF program functionality.
2. Remove the CONFIG_BPF_SYSCALL guard around struct bpf_tcp_req_attrs.
This struct is referenced by bpf_sk_assign_tcp_reqsk() in
net/core/filter.c which is compiled unconditionally, so wrapping
the definition in a config guard could cause build failures when
CONFIG_BPF_SYSCALL=n.
3. Fix mismatched declaration of cookie_bpf_check() between the
CONFIG_BPF_SYSCALL and stub paths: the real definition takes
'struct net *net' but the declaration in the header did not.
Add the net parameter to the declaration and all call sites.
4. Add missing LINUX_MIB_SYNCOOKIESRECV and LINUX_MIB_SYNCOOKIESFAILED
statistics in cookie_bpf_check(), so that BPF custom syncookie
validation is accounted for in SNMP counters just like the
non-BPF path.
Compile-tested with CONFIG_BPF_SYSCALL=y and CONFIG_BPF_SYSCALL
not set.
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
---
No functional bug here — CONFIG_BPF is always enabled under
CONFIG_NET, so the existing code compiles and works correctly.
This is a cleanup and improvement, no backport needed.
Added a selftest, suggested by Kuniyuki Iwashima.
---
include/net/tcp.h | 7 +++----
net/ipv4/syncookies.c | 10 +++++++---
net/ipv6/syncookies.c | 2 +-
3 files changed, 11 insertions(+), 8 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index 6156d1d068e1..570a8836c2ba 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -598,7 +598,6 @@ struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops,
struct tcp_options_received *tcp_opt,
int mss, u32 tsoff);
-#if IS_ENABLED(CONFIG_BPF)
struct bpf_tcp_req_attrs {
u32 rcv_tsval;
u32 rcv_tsecr;
@@ -612,7 +611,6 @@ struct bpf_tcp_req_attrs {
u8 usec_ts_ok;
u8 reserved[3];
};
-#endif
#ifdef CONFIG_SYN_COOKIES
@@ -715,13 +713,14 @@ static inline bool cookie_ecn_ok(const struct net *net, const struct dst_entry *
dst_feature(dst, RTAX_FEATURE_ECN);
}
-#if IS_ENABLED(CONFIG_BPF)
+#ifdef CONFIG_BPF_SYSCALL
static inline bool cookie_bpf_ok(struct sk_buff *skb)
{
return skb->sk;
}
-struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb);
+struct request_sock *cookie_bpf_check(struct net *net, struct sock *sk,
+ struct sk_buff *skb);
#else
static inline bool cookie_bpf_ok(struct sk_buff *skb)
{
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index f1474598d2c8..d685631438cb 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -295,8 +295,9 @@ static int cookie_tcp_reqsk_init(struct sock *sk, struct sk_buff *skb,
return 0;
}
-#if IS_ENABLED(CONFIG_BPF)
-struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb)
+#ifdef CONFIG_BPF_SYSCALL
+struct request_sock *cookie_bpf_check(struct net *net, struct sock *sk,
+ struct sk_buff *skb)
{
struct request_sock *req = inet_reqsk(skb->sk);
@@ -306,6 +307,9 @@ struct request_sock *cookie_bpf_check(struct sock *sk, struct sk_buff *skb)
if (cookie_tcp_reqsk_init(sk, skb, req)) {
reqsk_free(req);
req = NULL;
+ __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESFAILED);
+ } else {
+ __NET_INC_STATS(net, LINUX_MIB_SYNCOOKIESRECV);
}
return req;
@@ -419,7 +423,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
goto out;
if (cookie_bpf_ok(skb)) {
- req = cookie_bpf_check(sk, skb);
+ req = cookie_bpf_check(net, sk, skb);
} else {
req = cookie_tcp_check(net, sk, skb);
if (IS_ERR(req))
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index 4f6f0d751d6c..111d7a41d957 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -190,7 +190,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
goto out;
if (cookie_bpf_ok(skb)) {
- req = cookie_bpf_check(sk, skb);
+ req = cookie_bpf_check(net, sk, skb);
} else {
req = cookie_tcp_check(net, sk, skb);
if (IS_ERR(req))
--
2.43.0
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox