Netdev List

Netdev List
 help / color / mirror / Atom feed

* [PATCH net-next 14/15] batman-adv: tp_meter: adjust name of receiver lock
From: Simon Wunderlich @ 2026-06-30 14:06 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, b.a.t.m.a.n, Sven Eckelmann, Simon Wunderlich
In-Reply-To: <20260630140623.88431-1-sw@simonwunderlich.de>

From: Sven Eckelmann <sven@narfation.org>

The lock used to protect the receiver from reading/writing in parallel to
ack sequence number relevant data was still called unacked_lock. But it is
no longer only about the unacked_list. Use a broader term to reflect this.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/tp_meter.c | 20 ++++++++++----------
 net/batman-adv/types.h    |  4 ++--
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index f18ce360839d3..ffd3171d4b992 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1252,13 +1252,13 @@ static void batadv_tp_receiver_release(struct kref *ref)
 	/* lock should not be needed because this object is now out of any
 	 * context!
 	 */
-	spin_lock_bh(&tp_vars->unacked_lock);
+	spin_lock_bh(&tp_vars->ack_seqno_lock);
 	list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
 		list_del(&un->list);
 		kfree(un);
 		tp_vars->unacked_count--;
 	}
-	spin_unlock_bh(&tp_vars->unacked_lock);
+	spin_unlock_bh(&tp_vars->ack_seqno_lock);
 
 	kfree_rcu(tp_vars, common.rcu);
 }
@@ -1316,13 +1316,13 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t)
 	if (batadv_tp_list_detach(&tp_vars->common))
 		batadv_tp_receiver_put(tp_vars);
 
-	spin_lock_bh(&tp_vars->unacked_lock);
+	spin_lock_bh(&tp_vars->ack_seqno_lock);
 	list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
 		list_del(&un->list);
 		kfree(un);
 		tp_vars->unacked_count--;
 	}
-	spin_unlock_bh(&tp_vars->unacked_lock);
+	spin_unlock_bh(&tp_vars->ack_seqno_lock);
 
 	/* drop reference of timer */
 	if (WARN_ON(atomic_xchg(&tp_vars->receiving, 0) != 1))
@@ -1415,7 +1415,7 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst,
  */
 static bool batadv_tp_handle_out_of_order(struct batadv_tp_receiver *tp_vars,
 					  u32 seqno, u32 payload_len)
-	__must_hold(&tp_vars->unacked_lock)
+	__must_hold(&tp_vars->ack_seqno_lock)
 {
 	struct batadv_tp_unacked *un, *new;
 	struct batadv_tp_unacked *safe;
@@ -1532,7 +1532,7 @@ static bool batadv_tp_handle_out_of_order(struct batadv_tp_receiver *tp_vars,
  * @tp_vars: the private data of the current TP meter session
  */
 static void batadv_tp_ack_unordered(struct batadv_tp_receiver *tp_vars)
-	__must_hold(&tp_vars->unacked_lock)
+	__must_hold(&tp_vars->ack_seqno_lock)
 {
 	struct batadv_tp_unacked *un, *safe;
 	u32 to_ack;
@@ -1602,7 +1602,7 @@ batadv_tp_init_recv(struct batadv_priv *bat_priv,
 	tp_vars->common.bat_priv = bat_priv;
 	kref_init(&tp_vars->common.refcount);
 
-	spin_lock_init(&tp_vars->unacked_lock);
+	spin_lock_init(&tp_vars->ack_seqno_lock);
 	INIT_LIST_HEAD(&tp_vars->unacked_list);
 	tp_vars->unacked_count = 0;
 
@@ -1664,7 +1664,7 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv,
 		WRITE_ONCE(tp_vars->last_recv_time, jiffies);
 	}
 
-	spin_lock_bh(&tp_vars->unacked_lock);
+	spin_lock_bh(&tp_vars->ack_seqno_lock);
 
 	/* if the packet is a duplicate, it may be the case that an ACK has been
 	 * lost. Resend the ACK
@@ -1680,7 +1680,7 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv,
 		 * not been enqueued correctly
 		 */
 		if (!batadv_tp_handle_out_of_order(tp_vars, seqno, payload_len)) {
-			spin_unlock_bh(&tp_vars->unacked_lock);
+			spin_unlock_bh(&tp_vars->ack_seqno_lock);
 			goto out;
 		}
 
@@ -1696,7 +1696,7 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv,
 
 send_ack:
 	to_ack = tp_vars->last_recv;
-	spin_unlock_bh(&tp_vars->unacked_lock);
+	spin_unlock_bh(&tp_vars->ack_seqno_lock);
 
 	/* send the ACK. If the received packet was out of order, the ACK that
 	 * is going to be sent is a duplicate (the sender will count them and
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index c194d8069774c..cd12755d21f35 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1474,8 +1474,8 @@ struct batadv_tp_receiver {
 	/** @unacked_list: list of unacked packets (meta-info only) */
 	struct list_head unacked_list;
 
-	/** @unacked_lock: protect unacked_list + &batadv_tp_receiver.last_recv */
-	spinlock_t unacked_lock;
+	/** @ack_seqno_lock: protect unacked_list + &batadv_tp_receiver.last_recv */
+	spinlock_t ack_seqno_lock;
 
 	/** @unacked_count: number of unacked entries */
 	size_t unacked_count;
-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 13/15] batman-adv: tp_meter: keep unacked list for receivers
From: Simon Wunderlich @ 2026-06-30 14:06 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, b.a.t.m.a.n, Sven Eckelmann, Simon Wunderlich
In-Reply-To: <20260630140623.88431-1-sw@simonwunderlich.de>

From: Sven Eckelmann <sven@narfation.org>

There is no need to share the unacked list between sender and receivers.
Only receivers will ever write to and read from it. The initialization in
batadv_tp_start() was therefore never needed. After its removal, it is
enough to just store it in struct batadv_tp_receiver.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/tp_meter.c | 110 +++++++++++++++++++++-----------------
 net/batman-adv/types.h    |  20 +++----
 2 files changed, 71 insertions(+), 59 deletions(-)

diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index 5cc719c81ea0b..f18ce360839d3 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -358,28 +358,16 @@ batadv_tp_list_find_sender_session(struct batadv_priv *bat_priv, const u8 *dst,
 }
 
 /**
- * batadv_tp_vars_common_release() - release batadv_tp_vars_common from lists
+ * batadv_tp_sender_release() - release batadv_tp_sender
  *  and queue for free after rcu grace period
- * @ref: kref pointer of the batadv_tp_vars_common
+ * @ref: kref pointer of the batadv_tp_sender
  */
-static void batadv_tp_vars_common_release(struct kref *ref)
+static void batadv_tp_sender_release(struct kref *ref)
 {
-	struct batadv_tp_vars_common *tp_vars;
-	struct batadv_tp_unacked *un, *safe;
-
-	tp_vars = container_of(ref, struct batadv_tp_vars_common, refcount);
-
-	/* lock should not be needed because this object is now out of any
-	 * context!
-	 */
-	spin_lock_bh(&tp_vars->unacked_lock);
-	list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
-		list_del(&un->list);
-		kfree(un);
-	}
-	spin_unlock_bh(&tp_vars->unacked_lock);
+	struct batadv_tp_sender *tp_vars;
 
-	kfree_rcu(tp_vars, rcu);
+	tp_vars = container_of(ref, struct batadv_tp_sender, common.refcount);
+	kfree_rcu(tp_vars, common.rcu);
 }
 
 /**
@@ -392,7 +380,7 @@ static void batadv_tp_sender_put(struct batadv_tp_sender *tp_vars)
 	if (!tp_vars)
 		return;
 
-	kref_put(&tp_vars->common.refcount, batadv_tp_vars_common_release);
+	kref_put(&tp_vars->common.refcount, batadv_tp_sender_release);
 }
 
 /**
@@ -1145,9 +1133,6 @@ void batadv_tp_start(struct batadv_priv *bat_priv, const u8 *dst,
 	init_waitqueue_head(&tp_vars->more_bytes);
 	init_completion(&tp_vars->finished);
 
-	spin_lock_init(&tp_vars->common.unacked_lock);
-	INIT_LIST_HEAD(&tp_vars->common.unacked_list);
-
 	spin_lock_init(&tp_vars->cc_lock);
 
 	tp_vars->prerandom_offset = 0;
@@ -1251,6 +1236,33 @@ batadv_tp_list_find_receiver_session(struct batadv_priv *bat_priv, const u8 *dst
 	return tp_vars;
 }
 
+/**
+ * batadv_tp_receiver_release() - release batadv_tp_receiver
+ *  and queue for free after rcu grace period
+ * @ref: kref pointer of the batadv_tp_receiver
+ */
+static void batadv_tp_receiver_release(struct kref *ref)
+{
+	struct batadv_tp_receiver *tp_vars;
+	struct batadv_tp_unacked *safe;
+	struct batadv_tp_unacked *un;
+
+	tp_vars = container_of(ref, struct batadv_tp_receiver, common.refcount);
+
+	/* lock should not be needed because this object is now out of any
+	 * context!
+	 */
+	spin_lock_bh(&tp_vars->unacked_lock);
+	list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
+		list_del(&un->list);
+		kfree(un);
+		tp_vars->unacked_count--;
+	}
+	spin_unlock_bh(&tp_vars->unacked_lock);
+
+	kfree_rcu(tp_vars, common.rcu);
+}
+
 /**
  * batadv_tp_receiver_put() - decrement the batadv_tp_receiver
  *  refcounter and possibly release it
@@ -1261,7 +1273,7 @@ static void batadv_tp_receiver_put(struct batadv_tp_receiver *tp_vars)
 	if (!tp_vars)
 		return;
 
-	kref_put(&tp_vars->common.refcount, batadv_tp_vars_common_release);
+	kref_put(&tp_vars->common.refcount, batadv_tp_receiver_release);
 }
 
 /**
@@ -1304,13 +1316,13 @@ static void batadv_tp_receiver_shutdown(struct timer_list *t)
 	if (batadv_tp_list_detach(&tp_vars->common))
 		batadv_tp_receiver_put(tp_vars);
 
-	spin_lock_bh(&tp_vars->common.unacked_lock);
-	list_for_each_entry_safe(un, safe, &tp_vars->common.unacked_list, list) {
+	spin_lock_bh(&tp_vars->unacked_lock);
+	list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
 		list_del(&un->list);
 		kfree(un);
-		tp_vars->common.unacked_count--;
+		tp_vars->unacked_count--;
 	}
-	spin_unlock_bh(&tp_vars->common.unacked_lock);
+	spin_unlock_bh(&tp_vars->unacked_lock);
 
 	/* drop reference of timer */
 	if (WARN_ON(atomic_xchg(&tp_vars->receiving, 0) != 1))
@@ -1403,7 +1415,7 @@ static int batadv_tp_send_ack(struct batadv_priv *bat_priv, const u8 *dst,
  */
 static bool batadv_tp_handle_out_of_order(struct batadv_tp_receiver *tp_vars,
 					  u32 seqno, u32 payload_len)
-	__must_hold(&tp_vars->common.unacked_lock)
+	__must_hold(&tp_vars->unacked_lock)
 {
 	struct batadv_tp_unacked *un, *new;
 	struct batadv_tp_unacked *safe;
@@ -1417,9 +1429,9 @@ static bool batadv_tp_handle_out_of_order(struct batadv_tp_receiver *tp_vars,
 	new->len = payload_len;
 
 	/* if the list is empty immediately attach this new object */
-	if (list_empty(&tp_vars->common.unacked_list)) {
-		list_add(&new->list, &tp_vars->common.unacked_list);
-		tp_vars->common.unacked_count++;
+	if (list_empty(&tp_vars->unacked_list)) {
+		list_add(&new->list, &tp_vars->unacked_list);
+		tp_vars->unacked_count++;
 		return true;
 	}
 
@@ -1430,7 +1442,7 @@ static bool batadv_tp_handle_out_of_order(struct batadv_tp_receiver *tp_vars,
 	 * the last received packet (the one being processed now) has a bigger
 	 * seqno than all the others already stored.
 	 */
-	list_for_each_entry_reverse(un, &tp_vars->common.unacked_list, list) {
+	list_for_each_entry_reverse(un, &tp_vars->unacked_list, list) {
 		/* look for the right position - an un which is smaller */
 		if (batadv_seq_before(new->seqno, un->seqno))
 			continue;
@@ -1476,19 +1488,19 @@ static bool batadv_tp_handle_out_of_order(struct batadv_tp_receiver *tp_vars,
 		 */
 		list_add(&new->list, &un->list);
 		added = true;
-		tp_vars->common.unacked_count++;
+		tp_vars->unacked_count++;
 		break;
 	}
 
 	/* received packet with smallest seqno out of order; add it to front */
 	if (!added) {
-		list_add(&new->list, &tp_vars->common.unacked_list);
-		tp_vars->common.unacked_count++;
+		list_add(&new->list, &tp_vars->unacked_list);
+		tp_vars->unacked_count++;
 	}
 
 	/* check if new filled the gap to the next list entries */
 	un = new;
-	list_for_each_entry_safe_continue(un, safe, &tp_vars->common.unacked_list, list) {
+	list_for_each_entry_safe_continue(un, safe, &tp_vars->unacked_list, list) {
 		if (batadv_seq_before(new->seqno + new->len, un->seqno))
 			break;
 
@@ -1499,16 +1511,16 @@ static bool batadv_tp_handle_out_of_order(struct batadv_tp_receiver *tp_vars,
 
 		list_del(&un->list);
 		kfree(un);
-		tp_vars->common.unacked_count--;
+		tp_vars->unacked_count--;
 	}
 
 	/* remove the last (biggest) unacked seqno when list is too large */
-	if (tp_vars->common.unacked_count > BATADV_TP_MAX_UNACKED) {
-		un = list_last_entry(&tp_vars->common.unacked_list,
+	if (tp_vars->unacked_count > BATADV_TP_MAX_UNACKED) {
+		un = list_last_entry(&tp_vars->unacked_list,
 				     struct batadv_tp_unacked, list);
 		list_del(&un->list);
 		kfree(un);
-		tp_vars->common.unacked_count--;
+		tp_vars->unacked_count--;
 	}
 
 	return true;
@@ -1520,7 +1532,7 @@ static bool batadv_tp_handle_out_of_order(struct batadv_tp_receiver *tp_vars,
  * @tp_vars: the private data of the current TP meter session
  */
 static void batadv_tp_ack_unordered(struct batadv_tp_receiver *tp_vars)
-	__must_hold(&tp_vars->common.unacked_lock)
+	__must_hold(&tp_vars->unacked_lock)
 {
 	struct batadv_tp_unacked *un, *safe;
 	u32 to_ack;
@@ -1528,7 +1540,7 @@ static void batadv_tp_ack_unordered(struct batadv_tp_receiver *tp_vars)
 	/* go through the unacked packet list and possibly ACK them as
 	 * well
 	 */
-	list_for_each_entry_safe(un, safe, &tp_vars->common.unacked_list, list) {
+	list_for_each_entry_safe(un, safe, &tp_vars->unacked_list, list) {
 		/* the list is ordered, therefore it is possible to stop as soon
 		 * there is a gap between the last acked seqno and the seqno of
 		 * the packet under inspection
@@ -1543,7 +1555,7 @@ static void batadv_tp_ack_unordered(struct batadv_tp_receiver *tp_vars)
 
 		list_del(&un->list);
 		kfree(un);
-		tp_vars->common.unacked_count--;
+		tp_vars->unacked_count--;
 	}
 }
 
@@ -1590,9 +1602,9 @@ batadv_tp_init_recv(struct batadv_priv *bat_priv,
 	tp_vars->common.bat_priv = bat_priv;
 	kref_init(&tp_vars->common.refcount);
 
-	spin_lock_init(&tp_vars->common.unacked_lock);
-	INIT_LIST_HEAD(&tp_vars->common.unacked_list);
-	tp_vars->common.unacked_count = 0;
+	spin_lock_init(&tp_vars->unacked_lock);
+	INIT_LIST_HEAD(&tp_vars->unacked_list);
+	tp_vars->unacked_count = 0;
 
 	kref_get(&tp_vars->common.refcount);
 	timer_setup(&tp_vars->common.timer, batadv_tp_receiver_shutdown, 0);
@@ -1652,7 +1664,7 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv,
 		WRITE_ONCE(tp_vars->last_recv_time, jiffies);
 	}
 
-	spin_lock_bh(&tp_vars->common.unacked_lock);
+	spin_lock_bh(&tp_vars->unacked_lock);
 
 	/* if the packet is a duplicate, it may be the case that an ACK has been
 	 * lost. Resend the ACK
@@ -1668,7 +1680,7 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv,
 		 * not been enqueued correctly
 		 */
 		if (!batadv_tp_handle_out_of_order(tp_vars, seqno, payload_len)) {
-			spin_unlock_bh(&tp_vars->common.unacked_lock);
+			spin_unlock_bh(&tp_vars->unacked_lock);
 			goto out;
 		}
 
@@ -1684,7 +1696,7 @@ static void batadv_tp_recv_msg(struct batadv_priv *bat_priv,
 
 send_ack:
 	to_ack = tp_vars->last_recv;
-	spin_unlock_bh(&tp_vars->common.unacked_lock);
+	spin_unlock_bh(&tp_vars->unacked_lock);
 
 	/* send the ACK. If the received packet was out of order, the ACK that
 	 * is going to be sent is a duplicate (the sender will count them and
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index c2ab00d8ef160..c194d8069774c 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1334,7 +1334,7 @@ struct batadv_tp_unacked {
 	/** @len: length of the packet */
 	u32 len;
 
-	/** @list: list node for &batadv_tp_vars_common.unacked_list */
+	/** @list: list node for &batadv_tp_receiver.unacked_list */
 	struct list_head list;
 };
 
@@ -1357,15 +1357,6 @@ struct batadv_tp_vars_common {
 	/** @session: TP session identifier */
 	u8 session[2];
 
-	/** @unacked_list: list of unacked packets (meta-info only) */
-	struct list_head unacked_list;
-
-	/** @unacked_lock: protect unacked_list + &batadv_tp_receiver.last_recv */
-	spinlock_t unacked_lock;
-
-	/** @unacked_count: number of unacked entries */
-	size_t unacked_count;
-
 	/** @refcount: number of context where the object is used */
 	struct kref refcount;
 
@@ -1479,6 +1470,15 @@ struct batadv_tp_receiver {
 
 	/** @last_recv_time: time (jiffies) a msg was received */
 	unsigned long last_recv_time;
+
+	/** @unacked_list: list of unacked packets (meta-info only) */
+	struct list_head unacked_list;
+
+	/** @unacked_lock: protect unacked_list + &batadv_tp_receiver.last_recv */
+	spinlock_t unacked_lock;
+
+	/** @unacked_count: number of unacked entries */
+	size_t unacked_count;
 };
 
 /**
-- 
2.47.3


^ permalink raw reply related

* [PATCH net-next 15/15] batman-adv: tp_meter: delay allocation of unacked entry
From: Simon Wunderlich @ 2026-06-30 14:06 UTC (permalink / raw)
  To: netdev
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, b.a.t.m.a.n, Sven Eckelmann, Simon Wunderlich
In-Reply-To: <20260630140623.88431-1-sw@simonwunderlich.de>

From: Sven Eckelmann <sven@narfation.org>

When batadv_tp_handle_out_of_order() searches the already existing list of
unacked packets, it can often find an entry to merge with. In this case, it
would be a waste of time and resources to allocate a batadv_tp_unacked
which is then immediately freed again.

Instead, search through the list first and record the position at which a
new entry would have to be inserted. A new entry is only allocated and
stored when no mergeable entry could be found.

Signed-off-by: Sven Eckelmann <sven@narfation.org>
Signed-off-by: Simon Wunderlich <sw@simonwunderlich.de>
---
 net/batman-adv/tp_meter.c | 88 ++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 47 deletions(-)

diff --git a/net/batman-adv/tp_meter.c b/net/batman-adv/tp_meter.c
index ffd3171d4b992..00467aa79de9d 100644
--- a/net/batman-adv/tp_meter.c
+++ b/net/batman-adv/tp_meter.c
@@ -1417,26 +1417,15 @@ static bool batadv_tp_handle_out_of_order(struct batadv_tp_receiver *tp_vars,
 					  u32 seqno, u32 payload_len)
 	__must_hold(&tp_vars->ack_seqno_lock)
 {
-	struct batadv_tp_unacked *un, *new;
+	struct list_head *pos = &tp_vars->unacked_list;
+	struct batadv_tp_unacked *new = NULL;
+	u32 end_seqno = seqno + payload_len;
 	struct batadv_tp_unacked *safe;
-	bool added = false;
-
-	new = kmalloc_obj(*new, GFP_ATOMIC);
-	if (unlikely(!new))
-		return false;
-
-	new->seqno = seqno;
-	new->len = payload_len;
-
-	/* if the list is empty immediately attach this new object */
-	if (list_empty(&tp_vars->unacked_list)) {
-		list_add(&new->list, &tp_vars->unacked_list);
-		tp_vars->unacked_count++;
-		return true;
-	}
+	struct batadv_tp_unacked *un;
 
-	/* otherwise loop over the list and either drop the packet because this
-	 * is a duplicate or store it at the right position.
+	/* loop over the list to find either an existing entry which the new
+	 * seqno range can be merged with or the position at which a new entry
+	 * has to be inserted.
 	 *
 	 * The iteration is done in the reverse way because it is likely that
 	 * the last received packet (the one being processed now) has a bigger
@@ -1444,7 +1433,7 @@ static bool batadv_tp_handle_out_of_order(struct batadv_tp_receiver *tp_vars,
 	 */
 	list_for_each_entry_reverse(un, &tp_vars->unacked_list, list) {
 		/* look for the right position - an un which is smaller */
-		if (batadv_seq_before(new->seqno, un->seqno))
+		if (batadv_seq_before(seqno, un->seqno))
 			continue;
 
 		/* smaller/equal seqno was found but they might be directly
@@ -1452,62 +1441,67 @@ static bool batadv_tp_handle_out_of_order(struct batadv_tp_receiver *tp_vars,
 		 *
 		 * It is already known that:
 		 *
-		 *	un->seqno <= new->seqno
+		 *	un->seqno <= seqno
 		 *
 		 * When establishing that:
 		 *
-		 *	new->seqno <= un->seqno + un->len
+		 *	seqno <= un->seqno + un->len
 		 *
 		 * Then it is not necessary to add a new entry because the
 		 * smaller/equal seqno of un might already contain the new
 		 * received packet or we only add new data directly after
 		 * the end of un. The latter can be identified using:
 		 *
-		 *	un->seqno + un->len <= new->seqno + new->len
+		 *	un->seqno + un->len <= end_seqno
 		 */
-		if (!batadv_seq_before(un->seqno + un->len, new->seqno)) {
+		if (!batadv_seq_before(un->seqno + un->len, seqno)) {
 			/* new data directly after un? */
-			if (!batadv_seq_before(new->seqno + new->len,
-					       un->seqno + un->len))
-				un->len = new->seqno + new->len - un->seqno;
+			if (!batadv_seq_before(end_seqno, un->seqno + un->len))
+				un->len = end_seqno - un->seqno;
 
-			/* un now represents both old un + new */
-			kfree(new);
-			added = true;
-
-			/* un has to be used to check if the gap to the next
-			 * seqno range was closed
+			/* un now represents both old un + new range and has to
+			 * be used to check if the gap to the next seqno range
+			 * was closed
 			 */
 			new = un;
-			break;
+		} else {
+			/* as soon as an entry having a smaller seqno is found,
+			 * the new one is attached _after_ it. In this way the
+			 * list is kept in ascending order
+			 */
+			pos = &un->list;
 		}
 
-		/* as soon as an entry having a smaller seqno is found, the new
-		 * one is attached _after_ it. In this way the list is kept in
-		 * ascending order
-		 */
-		list_add(&new->list, &un->list);
-		added = true;
-		tp_vars->unacked_count++;
 		break;
 	}
 
-	/* received packet with smallest seqno out of order; add it to front */
-	if (!added) {
-		list_add(&new->list, &tp_vars->unacked_list);
+	/* no entry to merge with was found; insert a new one after the entry
+	 * with the next smaller seqno (or at the front of the list when the
+	 * new seqno is the smallest or the list is empty)
+	 */
+	if (!new) {
+		new = kmalloc_obj(*new, GFP_ATOMIC);
+		if (unlikely(!new))
+			return false;
+
+		new->seqno = seqno;
+		new->len = payload_len;
+
+		list_add(&new->list, pos);
 		tp_vars->unacked_count++;
 	}
 
 	/* check if new filled the gap to the next list entries */
 	un = new;
 	list_for_each_entry_safe_continue(un, safe, &tp_vars->unacked_list, list) {
-		if (batadv_seq_before(new->seqno + new->len, un->seqno))
+		if (batadv_seq_before(end_seqno, un->seqno))
 			break;
 
 		/* next entry is overlapping or adjacent - combine both */
-		if (batadv_seq_before(new->seqno + new->len,
-				      un->seqno + un->len))
-			new->len = un->seqno + un->len - new->seqno;
+		if (batadv_seq_before(end_seqno, un->seqno + un->len)) {
+			end_seqno = un->seqno + un->len;
+			new->len = end_seqno - new->seqno;
+		}
 
 		list_del(&un->list);
 		kfree(un);
-- 
2.47.3


^ permalink raw reply related

* Re: [PATCH net v5 0/2] xfrm: fix async crypto (-EINPROGRESS) handling in validate_xmit_xfrm()
From: Steffen Klassert @ 2026-06-30 14:01 UTC (permalink / raw)
  To: Petr Wozniak; +Cc: netdev, sd, herbert, kuba, horms, pabeni, edumazet, davem
In-Reply-To: <20260621100327.40203-1-petr.wozniak@gmail.com>

On Sun, Jun 21, 2026 at 12:03:25PM +0200, Petr Wozniak wrote:
> This series fixes how the async crypto path (-EINPROGRESS from ->xmit())
> is handled in validate_xmit_xfrm() and its callers.
> 
> Patch 1 (previously sent on its own, v1-v4) makes validate_xmit_xfrm()
> return ERR_PTR(-EINPROGRESS) instead of NULL when a packet is stolen by
> async crypto, so __dev_queue_xmit() can tell it apart from a real drop
> and stop reporting -ENOMEM on noqueue/bridge interfaces.  v5 also covers
> the GSO segment loop, as Sabrina pointed out.
> 
> Patch 2 fixes a use-after-free found while looking at that GSO loop:
> validate_xmit_xfrm() unlinks async-stolen segments but never updates the
> list head ->prev, which validate_xmit_skb_list() later dereferences.
> 
> Changes in v5:
>  - 1/2: also propagate ERR_PTR(-EINPROGRESS) from the GSO segment loop
>    (the 2nd ->xmit() call); v4 only handled the single-skb path.  Restore
>    the blank line in validate_xmit_skb_list().  Add the missing
>    maintainers to Cc. (Sabrina Dubroca)
>  - 2/2: new patch -- fix the stale skb->prev use-after-free (also flagged
>    by Sashiko)
> 
> Changes in v4:
>  - Drop bool stolen tracking and the ERR_PTR return in
>    validate_xmit_skb_list(); use IS_ERR_OR_NULL() so stolen skbs are
>    silently skipped (Sabrina Dubroca)
>  - Drop ERR_PTR(-EINPROGRESS) handling in __dev_direct_xmit() (Sabrina Dubroca)
>  - Move validate_xmit_skb() return-value comment above the function
>    (Sabrina Dubroca)
> 
> Changes in v3:
>  - validate_xmit_skb_list(): set stolen=true only for -EINPROGRESS
>    (Sabrina Dubroca)
> 
> Changes in v2:
>  - Reset rc to NET_XMIT_SUCCESS only when PTR_ERR(skb) == -EINPROGRESS
>    (Sabrina Dubroca)
> 
> Petr Wozniak (2):
>   xfrm: propagate -EINPROGRESS from validate_xmit_xfrm()
>   xfrm: fix stale skb->prev after async crypto steals a GSO segment
> 
>  net/core/dev.c         | 10 ++++++++--
>  net/xfrm/xfrm_device.c | 12 ++++++++++--
>  2 files changed, 18 insertions(+), 4 deletions(-)

Series applied, thanks a lot!

^ permalink raw reply

* Re: [PATCH net v3 1/1] net/sched: sch_teql: Introduce slaves_lock to avoid race condition and UAF
From: Paolo Abeni @ 2026-06-30 14:10 UTC (permalink / raw)
  To: Jamal Hadi Salim
  Cc: netdev, davem, edumazet, kuba, horms, victor, jiri, security,
	zdi-disclosures, stable
In-Reply-To: <CAM0EoMn-6Ayjd3mxsiifDXwN1zdefx9eiRk_wWRpsuEh22LziA@mail.gmail.com>

On 6/30/26 1:49 PM, Jamal Hadi Salim wrote:
> On Tue, Jun 30, 2026 at 7:15 AM Paolo Abeni <pabeni@redhat.com> wrote:
>> On 6/28/26 1:12 PM, Jamal Hadi Salim wrote:
>>> The teql master->slaves singly linked list is not protected against
>>> multiple writes. It can be mod'ed concurrently from teql_master_xmit(),
>>> teql_dequeue(), teql_init() and teql_destroy() without holding any list
>>> lock or RCU protection.
>>>
>>> zdi-disclosures@trendmicro.com has demonstrated that the qdisc is freed
>>> after an RCU grace period, but teql_master_xmit() running on another
>>> CPU can still hold a stale pointer into the list, resulting in a
>>> slab-use-after-free:
>>>
>>> BUG: KASAN: slab-use-after-free in teql_master_xmit+0xf0f/0x16b0
>>> Read of size 8 at addr ffff888013fb0440 by task poc/332
>>> Freed 512-byte region [ffff888013fb0400, ffff888013fb0600) (kmalloc-512)
>>>
>>> The fix?
>>> Add a per-master slaves_lock spinlock that serializes all mutations of
>>> master->slaves and the NEXT_SLAVE() links in teql_destroy() and
>>> teql_qdisc_init(). teql_master_xmit() also takes the same slaves_lock
>>> around those updates.
>>> Annotate master->slaves and the per-slave ->next pointer with __rcu and
>>> use the appropriate RCU accessors everywhere they are touched:
>>> rcu_assign_pointer() on the writer side (under slaves_lock),
>>> rcu_dereference_protected() for the writer-side loads (also under
>>> slaves_lock), rcu_dereference_bh() for the loads in teql_master_xmit() and
>>> rtnl_dereference() for the loads in teql_master_open()/teql_master_mtu(),
>>> which run under RTNL.
>>> Pair this with rcu_read_lock_bh()/rcu_read_unlock_bh() around the list
>>> traversal in teql_master_xmit(), so that readers either observe a fully
>>> linked list or are deferred until the in-flight mutation completes. The two
>>> early-return paths in teql_master_xmit() are updated to release the RCU-bh
>>> read-side critical section before returning, since leaving it held would
>>> disable BH on that CPU for good.
>>>
>>> Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
>>> Reported-by: zdi-disclosures@trendmicro.com
>>> Tested-by: Victor Nogueira <victor@mojatatu.com>
>>> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
>>
>> Looks good, thanks!
>>
>> Please note that sashiko/gemini found a pre-existing issue which may
>> require a follow-up/separate fix:
>>
>> https://sashiko.dev/#/patchset/20260628111229.669751-1-jhs%40mojatatu.com
>>
>> (the 2nd one in the above link, IDK how to generate a direct link to a
>> specific comment)
> 
> I just sent v4 which covered that but i will send a followup instead
> if you already applied.

The PW bot went on vacation and no 'patch applied' notification is
reaching the ML; v3 is already applied.

> BTW: What is the ruling on when Sashiko finds a pre-existing issue?
> Should we address that as a separate follow-up patch? It is unclear
> what the policy is.

The general guidance is that pre-existing issues should be addressed
separately.

> This teql patch was one of the hardest to deal with in terms of
> reproducibility and the fact sashiko kept coming up with pre-existing
> issues - including the one Simon and I were discussing. Note: None of
> the pre-existing issues affected reproducibility at all although i am
> sure one of the AI-kiddies reading the sashiko reports will find a way
> to create a poc (this is why i entertain fixing them when they look
> simple enough)
Not an ideal situation both ways (which is increasingly the case).

Incrementally addressing pre-existing issues can lead to a huge/endless
number of iterations when touching some unfortunate area (4 is _not_ a
big number ;), delaying the actual fix indefinitely.

/P



^ permalink raw reply

* [PATCH v4] bpf: Fix smp_processor_id() call trace for preemptible kernels
From: Edward Adam Davis @ 2026-06-30 14:11 UTC (permalink / raw)
  To: sashiko-bot
  Cc: eadavis, jiayuan.chen, sashiko-reviews, andrii, ast, bpf, daniel,
	eddyz87, emil, jolsa, linux-kernel, martin.lau, memxor, netdev,
	song, syzkaller-bugs, yonghong.song
In-Reply-To: <20260630132226.C44601F000E9@smtp.kernel.org>

bpf_mem_cache_free_rcu() may be called in preemptible context, which
triggers the warning message below:

BUG: using smp_processor_id() in preemptible [00000000] code: syz.0.17/5820
caller is bpf_mem_cache_free_rcu+0x48/0xc0 kernel/bpf/memalloc.c:954
Call Trace:
 check_preemption_disabled+0xd3/0xe0 lib/smp_processor_id.c:47
 bpf_mem_cache_free_rcu+0x48/0xc0 kernel/bpf/memalloc.c:954
 rhtab_delete_elem+0x185a/0x1b30 kernel/bpf/hashtab.c:2969
 __rhtab_map_lookup_and_delete_batch+0x935/0xcb0 kernel/bpf/hashtab.c:3349
 bpf_map_do_batch+0x445/0x630 kernel/bpf/syscall.c:-1
 __sys_bpf+0x906/0xd90 kernel/bpf/syscall.c:-1

this_cpu_ptr() access needs to be guarded against migration.
Wrapping this batch operation in bpf_disable_instrumentation() risks
blinding BPF tracing globally on the CPU if preemption occurs.

bpf_disable_instrumentation() increments the per-CPU bpf_prog_active counter.
Because migrate_disable() and rcu_read_lock() do not disable preemption under
CONFIG_PREEMPT_RCU, the task can be preempted during this potentially long
loop.

If preempted, bpf_prog_active would remain elevated on that CPU, which could
cause subsequent tasks scheduled on the same CPU to silently drop BPF tracing
events (kprobes, tracepoints, perf).

Therefore, we fix this by disabling preemption rather than prohibiting
migration.

Fixes: 5af6807bdb10 ("bpf: Introduce bpf_mem_free_rcu() similar to kfree_rcu().")
Reported-by: syzbot+fd7e415d891073b83e1f@syzkaller.appspotmail.com
Closes: https://syzkaller.appspot.com/bug?extid=fd7e415d891073b83e1f
Signed-off-by: Edward Adam Davis <eadavis@qq.com>
---
v1 -> v2: using guard against preemption
v2 -> v3: replace get/put_cpu() to bpf_disable/enable_instrumentation()
v3 -> v4: disable preempt to make this_cpu_ptr() work

 kernel/bpf/hashtab.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 9f394e1aa2e8..7b98c2eea685 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -3345,8 +3345,10 @@ static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map,
 	}
 
 	if (do_delete) {
+		get_cpu();
 		for (i = 0; i < total; i++)
 			rhtab_delete_elem(rhtab, del_elems[i], NULL, 0);
+		put_cpu();
 	}
 
 	rcu_read_unlock();
-- 
2.43.0


^ permalink raw reply related

* [PATCH net] net: usb: lan78xx: disable VLAN filter in promiscuous mode
From: Enrico Pozzobon via B4 Relay @ 2026-06-30 14:15 UTC (permalink / raw)
  To: Thangaraj Samynathan, Rengarajan Sundararajan, UNGLinuxDriver,
	Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Woojung.Huh
  Cc: netdev, linux-usb, linux-kernel, Enrico Pozzobon

From: Enrico Pozzobon <enrico.pozzobon@dissecto.com>

The hardware VLAN filter (RFE_CTL_VLAN_FILTER_) drops VLAN-tagged frames
whose VID has not been registered via lan78xx_vlan_rx_add_vid(). It is
left enabled in promiscuous mode, so packet capture (e.g. tcpdump or
Wireshark) does not see tagged frames for unregistered VIDs.

Clear the filter while the interface is promiscuous and restore it from
NETIF_F_HW_VLAN_CTAG_FILTER otherwise. Enforce the same condition in
lan78xx_set_features() so netdev_update_features() cannot re-enable the
filter while promiscuous.

Fixes: 55d7de9de6c3 ("Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver")
Signed-off-by: Enrico Pozzobon <enrico.pozzobon@dissecto.com>
---
Currently, on microchip lan7801, enabling promiscuous mode does not
result in VLAN tagged packets being captured. This patch fixes this,
forcing the RFE_CTL_VLAN_FILTER_ flag to be off when promiscuous mode is
enabled.
---
 drivers/net/usb/lan78xx.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
index c4cebacabcb5..a1a53ef85cb9 100644
--- a/drivers/net/usb/lan78xx.c
+++ b/drivers/net/usb/lan78xx.c
@@ -1525,7 +1525,14 @@ static void lan78xx_set_multicast(struct net_device *netdev)
 	if (dev->net->flags & IFF_PROMISC) {
 		netif_dbg(dev, drv, dev->net, "promiscuous mode enabled");
 		pdata->rfe_ctl |= RFE_CTL_MCAST_EN_ | RFE_CTL_UCAST_EN_;
+		/* bypass VLAN filter so all tagged frames are captured */
+		pdata->rfe_ctl &= ~RFE_CTL_VLAN_FILTER_;
 	} else {
+		if (dev->net->features & NETIF_F_HW_VLAN_CTAG_FILTER)
+			pdata->rfe_ctl |= RFE_CTL_VLAN_FILTER_;
+		else
+			pdata->rfe_ctl &= ~RFE_CTL_VLAN_FILTER_;
+
 		if (dev->net->flags & IFF_ALLMULTI) {
 			netif_dbg(dev, drv, dev->net,
 				  "receive all multicast enabled");
@@ -3074,7 +3081,9 @@ static int lan78xx_set_features(struct net_device *netdev,
 	else
 		pdata->rfe_ctl &= ~RFE_CTL_VLAN_STRIP_;
 
-	if (features & NETIF_F_HW_VLAN_CTAG_FILTER)
+	/* keep VLAN filter off while promiscuous */
+	if ((features & NETIF_F_HW_VLAN_CTAG_FILTER) &&
+	    !(netdev->flags & IFF_PROMISC))
 		pdata->rfe_ctl |= RFE_CTL_VLAN_FILTER_;
 	else
 		pdata->rfe_ctl &= ~RFE_CTL_VLAN_FILTER_;

---
base-commit: dc59e4fea9d83f03bad6bddf3fa2e52491777482
change-id: 20260623-lan78xx-vlan-promisc-83af8a48a7ec

Best regards,
--  
Enrico Pozzobon <enrico.pozzobon@dissecto.com>



^ permalink raw reply related

* Re: [PATCH net-next v3 5/5] selftest: Add tests for useful handling of LSM denials on SCM_RIGHTS
From: Jakub Kicinski @ 2026-06-30 14:17 UTC (permalink / raw)
  To: Jori Koolstra
  Cc: Christian Brauner, Aleksa Sarai, Kuniyuki Iwashima,
	David S . Miller, Eric Dumazet, Paolo Abeni, Simon Horman, netdev,
	linux-fsdevel, linux-kernel
In-Reply-To: <20260629194327.2270798-6-jkoolstra@xs4all.nl>

On Mon, 29 Jun 2026 21:43:27 +0200 Jori Koolstra wrote:
> The test uses the following Smack labels:
> 
>    "Sender"   - label for the sending process
>    "Receiver" - label for the receiving process
>    "SecretX"   - labels for the files being passed

Not sure this test belongs in net/.
99.9% of people running this test do not use Smack.
At the very least you need to use XFAIL instead of SKIP;
we use SKIP for problems with the env which are fixable,
like a command missing.

^ permalink raw reply

* Re: [PATCH bpf-next v5 1/3] bpf: Add BPF_FIB_LOOKUP_VLAN flag to bpf_fib_lookup() helper
From: David Ahern @ 2026-06-30 14:18 UTC (permalink / raw)
  To: Toke Høiland-Jørgensen, Avinash Duduskar, ast, daniel,
	andrii
  Cc: eddyz87, memxor, martin.lau, song, yonghong.song, jolsa, emil,
	john.fastabend, sdf, davem, edumazet, kuba, pabeni, horms, shuah,
	hawk, yatsenko, leon.hwang, kpsingh, a.s.protopopov, ameryhung,
	rongtao, eyal.birger, bpf, netdev, linux-kernel, linux-kselftest
In-Reply-To: <87echobb5h.fsf@toke.dk>

On 6/30/26 4:00 AM, Toke Høiland-Jørgensen wrote:
>> It does not make sense to require a flag to get lookup output. vlan
>> proto of 0 is not valid, so it is a clear indication that the vlan
>> output parameters were not set during the lookup.
> 
> Okay, so we could just unconditionally set the VLAN fields, but if we
> start rewriting the ifindex that would be a change of the existing
> behaviour that could break existing applications, no?

Consistently dealing with upper devices is one of the reasons I never
sent patches for vlan support.

xdp support is at the driver layer for real (physical) devices. The fib
lookup is going to return the vlan device index - a virtual device.
Support for xdp should not be propagated to virtual devices; it goes
against the intent of xdp. Any trip down this path will have to decide
how to handle vlan-in-vlan use cases. Where is the line drawn for fast
networking?

> 
> Specifically, if an XDP application has a table of the interfaces it
> forwards between, today they'd get a VLAN interface ifindex, which would
> not be in that table, and the application would return XDP_PASS. Whereas
> if we change the ifindex and populate the VLAN tag, suddenly the
> interface would be in the table, but because the application doesn't
> read the returned VLAN tag, it will end up sending packets out without
> tagging them, leading to broken forwarding.

I have not followed developments over the past few years. Does XDP have
support for vlan acceleration in the Tx path now? You really want that
to deal with vlans rather than replicating s/w processing in ebpf.

> 
> So if we don't want the flag, we'd need some other mechanism to resolve
> the parent ifindex, AFAICT? Maybe a xdp_get_parent_ifindex() kfunc, say?
> That could also be made generic for other stacked interface types, I
> suppose.
> 
> WDYT?

dealing with stacked devices is hard :-)

What if the return is a bond device or a vlan on a bond device?


^ permalink raw reply

* Re: [PATCH] net: airoha: fix MIB stats collection to be lossless
From: Lorenzo Bianconi @ 2026-06-30 14:21 UTC (permalink / raw)
  To: Aniket Negi
  Cc: Andrew Lunn, David S . Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Christian Marangi, Simon Horman, linux-arm-kernel,
	linux-mediatek, netdev, linux-kernel
In-Reply-To: <20260630111834.233643-1-aniket.negi03@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 15019 bytes --]

> The airoha_dev_get_hw_stats() function had two correctness issues in the
> way it collects hardware MIB counters.
> 
> Bug 1: Read-clear race causes silent packet loss in statistics
> 
> airoha_update_hw_stats() read all MIB registers and then cleared them
> via REG_FE_GDM_MIB_CLEAR. There is a time window between the last
> register read and the hardware clear. Any packet that the hardware
> counts during this window is lost: the register is incremented, then
> cleared, without the increment ever being read by software. Under
> sustained traffic this causes a permanent and growing undercount in all
> reported statistics.
> 
> This is particularly misleading for tx_ok_pkts and tx_ok_bytes, which
> routers and traffic monitors use to detect packet forwarding loss
> between two points in a hardware-accelerated path (e.g., between two
> netdevs in the QDMA/PPE fast-path). An inaccurate count makes it
> impossible to reliably attribute drops in the forwarding pipeline
> without capturing traffic at both ends independently.
> 
> Bug 2: 32-bit counter overflow causes stat corruption
> 
> Several MIB registers are only 32 bits wide: tx_drops, tx_broadcast,
> tx_multicast, rx_drops, rx_broadcast, rx_multicast, rx_errors,
> rx_crc_error, rx_over_errors, rx_fragment, rx_jabber, and the runt and
> long buckets of the tx_len[]/rx_len[].
> 
> The original code relied on MIB_CLEAR to keep register values small
> enough that a simple '+= val' per cycle did not lose data across a
> wrap. Once clearing is removed (to fix Bug 1), raw '+= val' silently
> corrupts the accumulated software counter on overflow.
> 
> Fix both issues together:
> 
> - 64-bit H+L register pairs (tx_ok_pkts, tx_ok_bytes, tx_len[1..5],
>   rx_ok_pkts, rx_ok_bytes, rx_len[1..5]): read directly from hardware
>   without clearing. Hardware accumulates the full running total; a
>   single direct assignment per poll is correct and lossless.
> 
> - 32-bit registers (tx_drops, tx_broadcast, tx_multicast, rx_drops,
>   rx_broadcast, rx_multicast, rx_errors, rx_crc_error, rx_over_errors,
>   rx_fragment, rx_jabber, and the runt/long buckets in tx_len[0]/[6]
>   and rx_len[0]/[6]): track the previous hardware value in a new
>   hw_prev_stats sub-struct inside airoha_hw_stats and accumulate
>   (u32)(curr - prev) into the 64-bit software counter. Unsigned
>   subtraction handles wrap-around transparently:
>   prev=0xFFFFFF00, curr=0x00000010 -> delta=(u32)(0x10-0xFFFFFF00)=0x110
> 
> Remove the REG_FE_GDM_MIB_CLEAR write from airoha_update_hw_stats()
> entirely. Because the driver no longer clears hardware counters, the
> read-clear race window is eliminated.
> 
> The hw_prev_stats fields are zero-initialised by the existing
> devm_kzalloc() call in airoha_alloc_gdm_device().
> 
> Fixes: 8f4695fb67b2 ("net: airoha: better handle MIBs for GDM ports with multiple devs attached")
> Signed-off-by: Aniket Negi <aniket.negi03@gmail.com>
> ---
>  drivers/net/ethernet/airoha/airoha_eth.c | 132 +++++++++++------------
>  drivers/net/ethernet/airoha/airoha_eth.h |  22 ++++
>  2 files changed, 86 insertions(+), 68 deletions(-)
> 
> diff --git a/drivers/net/ethernet/airoha/airoha_eth.c b/drivers/net/ethernet/airoha/airoha_eth.c
> index 1caf6766f2c0..7ae4e294478e 100644
> --- a/drivers/net/ethernet/airoha/airoha_eth.c
> +++ b/drivers/net/ethernet/airoha/airoha_eth.c
> @@ -1696,133 +1696,133 @@ static void airoha_dev_get_hw_stats(struct airoha_gdm_dev *dev)
>  
>  	u64_stats_update_begin(&dev->stats.syncp);
>  
> -	/* TX */
> +	/* TX - 64-bit H+L registers: hw accumulates the total, read directly. */
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_OK_PKT_CNT_H(port->id));
> -	dev->stats.tx_ok_pkts += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_TX_OK_PKT_CNT_L(port->id));
> -	dev->stats.tx_ok_pkts += val;
> +	dev->stats.tx_ok_pkts = (u64)val << 32;

I guess it is more readable to store REG_FE_GDM_TX_OK_PKT_CNT_L() read in val
here. Something like:

	val = airoha_fe_rr(eth, REG_FE_GDM_TX_OK_PKT_CNT_L(port->id));
	dev->stats.tx_ok_pkts += val;

This applies to the occurrences below as well.

> +	dev->stats.tx_ok_pkts += airoha_fe_rr(eth, REG_FE_GDM_TX_OK_PKT_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_OK_BYTE_CNT_H(port->id));
> -	dev->stats.tx_ok_bytes += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_TX_OK_BYTE_CNT_L(port->id));
> -	dev->stats.tx_ok_bytes += val;
> +	dev->stats.tx_ok_bytes = (u64)val << 32;
> +	dev->stats.tx_ok_bytes += airoha_fe_rr(eth, REG_FE_GDM_TX_OK_BYTE_CNT_L(port->id));
>  
> +	/* TX - 32-bit registers: accumulate delta to handle wrap-around. */
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_DROP_CNT(port->id));
> -	dev->stats.tx_drops += val;
> +	dev->stats.tx_drops += (u32)(val - dev->stats.hw_prev_stats.tx_drops);
> +	dev->stats.hw_prev_stats.tx_drops = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_BC_CNT(port->id));
> -	dev->stats.tx_broadcast += val;
> +	dev->stats.tx_broadcast += (u32)(val - dev->stats.hw_prev_stats.tx_broadcast);
> +	dev->stats.hw_prev_stats.tx_broadcast = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_MC_CNT(port->id));
> -	dev->stats.tx_multicast += val;
> +	dev->stats.tx_multicast += (u32)(val - dev->stats.hw_prev_stats.tx_multicast);
> +	dev->stats.hw_prev_stats.tx_multicast = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_RUNT_CNT(port->id));
> -	dev->stats.tx_len[i] += val;
> +	dev->stats.tx_len[i] += (u32)(val - dev->stats.hw_prev_stats.tx_len[i]);
> +	dev->stats.hw_prev_stats.tx_len[i] = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_E64_CNT_H(port->id));
> -	dev->stats.tx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_E64_CNT_L(port->id));
> -	dev->stats.tx_len[i++] += val;
> +	dev->stats.tx_len[i] += (u64)val << 32;

Since we no longer reset the MIB counters, this is wrong: you can't use "+=" here.

> +	dev->stats.tx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_E64_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L64_CNT_H(port->id));
> -	dev->stats.tx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L64_CNT_L(port->id));
> -	dev->stats.tx_len[i++] += val;
> +	dev->stats.tx_len[i] = (u64)val << 32;
> +	dev->stats.tx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L64_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L127_CNT_H(port->id));
> -	dev->stats.tx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L127_CNT_L(port->id));
> -	dev->stats.tx_len[i++] += val;
> +	dev->stats.tx_len[i] = (u64)val << 32;
> +	dev->stats.tx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L127_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L255_CNT_H(port->id));
> -	dev->stats.tx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L255_CNT_L(port->id));
> -	dev->stats.tx_len[i++] += val;
> +	dev->stats.tx_len[i] = (u64)val << 32;
> +	dev->stats.tx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L255_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L511_CNT_H(port->id));
> -	dev->stats.tx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L511_CNT_L(port->id));
> -	dev->stats.tx_len[i++] += val;
> +	dev->stats.tx_len[i] = (u64)val << 32;
> +	dev->stats.tx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L511_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L1023_CNT_H(port->id));
> -	dev->stats.tx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L1023_CNT_L(port->id));
> -	dev->stats.tx_len[i++] += val;
> +	dev->stats.tx_len[i] = (u64)val << 32;
> +	dev->stats.tx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_L1023_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_TX_ETH_LONG_CNT(port->id));
> -	dev->stats.tx_len[i++] += val;
> +	dev->stats.tx_len[i] += (u32)(val - dev->stats.hw_prev_stats.tx_len[i]);
> +	dev->stats.hw_prev_stats.tx_len[i++] = val;
>  
>  	/* RX */
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_OK_PKT_CNT_H(port->id));
> -	dev->stats.rx_ok_pkts += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_RX_OK_PKT_CNT_L(port->id));
> -	dev->stats.rx_ok_pkts += val;
> +	dev->stats.rx_ok_pkts = (u64)val << 32;
> +	dev->stats.rx_ok_pkts += airoha_fe_rr(eth, REG_FE_GDM_RX_OK_PKT_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_OK_BYTE_CNT_H(port->id));
> -	dev->stats.rx_ok_bytes += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_RX_OK_BYTE_CNT_L(port->id));
> -	dev->stats.rx_ok_bytes += val;
> +	dev->stats.rx_ok_bytes = (u64)val << 32;
> +	dev->stats.rx_ok_bytes += airoha_fe_rr(eth, REG_FE_GDM_RX_OK_BYTE_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_DROP_CNT(port->id));
> -	dev->stats.rx_drops += val;
> +	dev->stats.rx_drops += (u32)(val - dev->stats.hw_prev_stats.rx_drops);
> +	dev->stats.hw_prev_stats.rx_drops = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_BC_CNT(port->id));
> -	dev->stats.rx_broadcast += val;
> +	dev->stats.rx_broadcast += (u32)(val - dev->stats.hw_prev_stats.rx_broadcast);
> +	dev->stats.hw_prev_stats.rx_broadcast = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_MC_CNT(port->id));
> -	dev->stats.rx_multicast += val;
> +	dev->stats.rx_multicast += (u32)(val - dev->stats.hw_prev_stats.rx_multicast);
> +	dev->stats.hw_prev_stats.rx_multicast = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ERROR_DROP_CNT(port->id));
> -	dev->stats.rx_errors += val;
> +	dev->stats.rx_errors += (u32)(val - dev->stats.hw_prev_stats.rx_errors);
> +	dev->stats.hw_prev_stats.rx_errors = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_CRC_ERR_CNT(port->id));
> -	dev->stats.rx_crc_error += val;
> +	dev->stats.rx_crc_error += (u32)(val - dev->stats.hw_prev_stats.rx_crc_error);
> +	dev->stats.hw_prev_stats.rx_crc_error = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_OVERFLOW_DROP_CNT(port->id));
> -	dev->stats.rx_over_errors += val;
> +	dev->stats.rx_over_errors += (u32)(val - dev->stats.hw_prev_stats.rx_over_errors);
> +	dev->stats.hw_prev_stats.rx_over_errors = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_FRAG_CNT(port->id));
> -	dev->stats.rx_fragment += val;
> +	dev->stats.rx_fragment += (u32)(val - dev->stats.hw_prev_stats.rx_fragment);
> +	dev->stats.hw_prev_stats.rx_fragment = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_JABBER_CNT(port->id));
> -	dev->stats.rx_jabber += val;
> +	dev->stats.rx_jabber += (u32)(val - dev->stats.hw_prev_stats.rx_jabber);
> +	dev->stats.hw_prev_stats.rx_jabber = val;
>  
>  	i = 0;
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_RUNT_CNT(port->id));
> -	dev->stats.rx_len[i] += val;
> +	dev->stats.rx_len[i] += (u32)(val - dev->stats.hw_prev_stats.rx_len[i]);
> +	dev->stats.hw_prev_stats.rx_len[i] = val;
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_E64_CNT_H(port->id));
> -	dev->stats.rx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_E64_CNT_L(port->id));
> -	dev->stats.rx_len[i++] += val;
> +	dev->stats.rx_len[i] += (u64)val << 32;

same here.

> +	dev->stats.rx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_E64_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L64_CNT_H(port->id));
> -	dev->stats.rx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L64_CNT_L(port->id));
> -	dev->stats.rx_len[i++] += val;
> +	dev->stats.rx_len[i] = (u64)val << 32;
> +	dev->stats.rx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L64_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L127_CNT_H(port->id));
> -	dev->stats.rx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L127_CNT_L(port->id));
> -	dev->stats.rx_len[i++] += val;
> +	dev->stats.rx_len[i] = (u64)val << 32;
> +	dev->stats.rx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L127_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L255_CNT_H(port->id));
> -	dev->stats.rx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L255_CNT_L(port->id));
> -	dev->stats.rx_len[i++] += val;
> +	dev->stats.rx_len[i] = (u64)val << 32;
> +	dev->stats.rx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L255_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L511_CNT_H(port->id));
> -	dev->stats.rx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L511_CNT_L(port->id));
> -	dev->stats.rx_len[i++] += val;
> +	dev->stats.rx_len[i] = (u64)val << 32;
> +	dev->stats.rx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L511_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L1023_CNT_H(port->id));
> -	dev->stats.rx_len[i] += ((u64)val << 32);
> -	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L1023_CNT_L(port->id));
> -	dev->stats.rx_len[i++] += val;
> +	dev->stats.rx_len[i] = (u64)val << 32;
> +	dev->stats.rx_len[i++] += airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_L1023_CNT_L(port->id));
>  
>  	val = airoha_fe_rr(eth, REG_FE_GDM_RX_ETH_LONG_CNT(port->id));
> -	dev->stats.rx_len[i++] += val;
> +	dev->stats.rx_len[i] += (u32)(val - dev->stats.hw_prev_stats.rx_len[i]);
> +	dev->stats.hw_prev_stats.rx_len[i++] = val;
>  
>  	u64_stats_update_end(&dev->stats.syncp);
>  }
> @@ -1839,10 +1839,6 @@ static void airoha_update_hw_stats(struct airoha_gdm_dev *dev)
>  			airoha_dev_get_hw_stats(port->devs[i]);
>  	}
>  
> -	/* Reset MIB counters */
> -	airoha_fe_set(dev->eth, REG_FE_GDM_MIB_CLEAR(port->id),
> -		      FE_GDM_MIB_RX_CLEAR_MASK | FE_GDM_MIB_TX_CLEAR_MASK);
> -
>  	spin_unlock(&port->stats_lock);
>  }
>  
> diff --git a/drivers/net/ethernet/airoha/airoha_eth.h b/drivers/net/ethernet/airoha/airoha_eth.h
> index 2765244d937c..af12ad6eac17 100644
> --- a/drivers/net/ethernet/airoha/airoha_eth.h
> +++ b/drivers/net/ethernet/airoha/airoha_eth.h
> @@ -244,6 +244,28 @@ struct airoha_hw_stats {
>  	u64 rx_fragment;
>  	u64 rx_jabber;
>  	u64 rx_len[7];
> +
> +	struct {
> +	/* Previous HW register values for 32-bit counter delta tracking.
> +	 * Storing the last seen value and accumulating (u32)(curr - prev)
> +	 * in 64-bit software counter & handles wrap-around transparently
> +	 * via unsigned arithmetic. These fields are never reported to
> +	 * userspace.
> +	 */

can you please align the comment here?

> +		u32 tx_drops;
> +		u32 tx_broadcast;
> +		u32 tx_multicast;
> +		u32 tx_len[7];
> +		u32 rx_drops;
> +		u32 rx_broadcast;
> +		u32 rx_multicast;
> +		u32 rx_errors;
> +		u32 rx_crc_error;
> +		u32 rx_over_errors;
> +		u32 rx_fragment;
> +		u32 rx_jabber;
> +		u32 rx_len[7];
> +	} hw_prev_stats;

Maybe something like "prev_val32" ?

Regards,
Lorenzo

>  };
>  
>  enum {
> -- 
> 2.43.0
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH net-next v4 09/13] dpaa2-switch: add support for LAG offload
From: Ioana Ciornei @ 2026-06-30 14:23 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-10-ioana.ciornei@nxp.com>

On Mon, Jun 29, 2026 at 02:23:05PM +0300, Ioana Ciornei wrote:
> This patch adds the bulk of the changes needed in order to support
> offloading of an upper bond device.
> 
> First of all, handling of the NETDEV_CHANGEUPPER and
> NETDEV_PRECHANGEUPPER events is extended so that the driver is capable
> to handle joining or leaving an upper bond device.
> All the restrictions around the LAG offload support are added in the
> newly added dpaa2_switch_pre_lag_join() function.
> 
> The same events are extended to also detect if one of our upper bond
> devices changes its own upper device. In this case, on each lower device
> that is DPAA2 the corresponding dpaa2_switch_port_[pre]changeupper()
> function will be called. This will start the process of joining the same
> FDB as the one used by the bridge device.
> 
> Setting the 'offload_fwd_mark' field on the skbs is also extended to be
> setup not only when the port is under a bridge but also under a bond
> device that is offloaded.
> 
> Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
> ---
> Changes in v4:
> - Add a defensive check in dpaa2_switch_port_bond_leave() for a NULL
> port_priv->lag
> - Extend the dpaa2_switch_prevent_bridging_with_8021q_upper() function
> so that we prevent a bond device with VLAN uppers joinging a bridge.
> The restriction is related to VLAN management in terms of the FDB which
> can change upon a topology change. VLAN uppers can only be added once
> the bridge topology is setup.
> - Remove all FDB management from the bond join/leave paths. Decided to
> reconfigure the FDB only on bridge join/leave since the FDB determines
> the forwarding domain and when a bond is not bridged, from a
> configuration standpoint, the individual lowers can be viewed as
> standalone.
> - Moved here the update to the dpaa2_switch_port_to_bridge_port()
> function so that the LAG state is taken into account.
> - Add a new per LAG field - primary - which is used to keep track of the
> primary port of a LAG group instead of determining each time we need to
> use it.
> - Set 'skb->offload_fwd_mark' only when the port is under a bridge.
> 
> Changes in v3:
> - Fix logic in prechangeupper callback in order to not call
> dpaa2_switch_prechangeupper_sanity_checks() on !info->linking
> - Fixed up the logic in the dpaa2_switch_port_bond_join()'s error path
> so that the FDBs are cleaned-up properly and we do not end-up with FDB's
> leaked, meaning that they could have been marked as in-use but actually
> no port was using it.
> - Mark the port_priv->lag field as __rcu and use the proper accesors for
> it. This will eventually become useful in a later patch when the lag
> field will be accessed concurrently from the NAPI context and the
> join/leave paths
> 
> Changes in v2:
> - Extend dpaa2_switch_prechangeupper_sanity_checks() with
> netdev_walk_all_lower_dev() so that checks are done on all lower devices
> of a bridge, even for the lowers of a bridged bond.
> - Manage better the default VLAN on bond join
> - Clean-up the error path in dpaa2_switch_port_bond_join()
> - Call dpaa2_switch_port_bridge_leave() in case a port is leaving a bond
> which is also a bridged port
> - Update dpaa2_switch_port_bond_leave() so that in case of any failure
> the driver tries to cleanup the LAG offload configuration.
> - Call switchdev_bridge_port_unoffload() in a switch port is leaving a
> bridge bond device.
> ---
>  .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 473 +++++++++++++++++-
>  .../ethernet/freescale/dpaa2/dpaa2-switch.h   |  15 +-
>  2 files changed, 476 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
> index 3472f5d5b08a..949a7241a00f 100644
> --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
> +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
> @@ -51,6 +51,17 @@ dpaa2_switch_filter_block_get_unused(struct ethsw_core *ethsw)
>  	return NULL;
>  }
>  
> +static struct dpaa2_switch_lag *
> +dpaa2_switch_lag_get_unused(struct ethsw_core *ethsw)
> +{
> +	int i;
> +
> +	for (i = 0; i < ethsw->sw_attr.num_ifs; i++)
> +		if (!ethsw->lags[i].in_use)
> +			return &ethsw->lags[i];
> +	return NULL;
> +}
> +
>  static bool dpaa2_switch_fdb_in_use_by_others(struct ethsw_core *ethsw,
>  					      struct dpaa2_switch_fdb *fdb,
>  					      struct ethsw_port_priv *except)
> @@ -2042,9 +2053,15 @@ static int dpaa2_switch_port_attr_set_event(struct net_device *netdev,
>  static struct net_device *
>  dpaa2_switch_port_to_bridge_port(struct ethsw_port_priv *port_priv)
>  {
> +	struct dpaa2_switch_lag *lag;
> +
>  	if (!port_priv->fdb->bridge_dev)
>  		return NULL;
>  
> +	lag = rtnl_dereference(port_priv->lag);
> +	if (lag)
> +		return lag->bond_dev;
> +
>  	return port_priv->netdev;
>  }
>  
> @@ -2193,30 +2210,53 @@ static int dpaa2_switch_port_bridge_leave(struct net_device *netdev)
>  					  false);
>  }
>  
> +static int
> +dpaa2_switch_have_vlan_upper(struct net_device *upper_dev,
> +			     __always_unused struct netdev_nested_priv *priv)
> +{
> +	return is_vlan_dev(upper_dev);
> +}
> +
>  static int dpaa2_switch_prevent_bridging_with_8021q_upper(struct net_device *netdev)
>  {
> -	struct net_device *upper_dev;
> -	struct list_head *iter;
> +	struct netdev_nested_priv priv = {};
>  
>  	/* RCU read lock not necessary because we have write-side protection
> -	 * (rtnl_mutex), however a non-rcu iterator does not exist.
> +	 * (rtnl_mutex), however a non-rcu iterator does not exist. Walk the
> +	 * entire upper chain so that a VLAN device stacked on a intermediate
> +	 * bond is caught too.
>  	 */
> -	netdev_for_each_upper_dev_rcu(netdev, upper_dev, iter)
> -		if (is_vlan_dev(upper_dev))
> -			return -EOPNOTSUPP;
> +	if (netdev_walk_all_upper_dev_rcu(netdev, dpaa2_switch_have_vlan_upper,
> +					  &priv))
> +		return -EOPNOTSUPP;
>  
>  	return 0;
>  }
>  
> +static int dpaa2_switch_check_dpsw_instance(struct net_device *dev,
> +					    struct netdev_nested_priv *priv)
> +{
> +	struct ethsw_port_priv *port_priv = (struct ethsw_port_priv *)priv->data;
> +	struct ethsw_port_priv *other_priv = netdev_priv(dev);
> +
> +	if (!dpaa2_switch_port_dev_check(dev))
> +		return 0;
> +
> +	if (other_priv->ethsw_data == port_priv->ethsw_data)
> +		return 0;
> +
> +	return 1;
> +}
> +
>  static int
>  dpaa2_switch_prechangeupper_sanity_checks(struct net_device *netdev,
>  					  struct net_device *upper_dev,
>  					  struct netlink_ext_ack *extack)
>  {
>  	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
> -	struct ethsw_port_priv *other_port_priv;
> -	struct net_device *other_dev;
> -	struct list_head *iter;
> +	struct netdev_nested_priv data = {
> +		.data = (void *)port_priv,
> +	};
>  	int err;
>  
>  	if (!br_vlan_enabled(upper_dev)) {
> @@ -2231,6 +2271,70 @@ dpaa2_switch_prechangeupper_sanity_checks(struct net_device *netdev,
>  		return err;
>  	}
>  
> +	err = netdev_walk_all_lower_dev(upper_dev,
> +					dpaa2_switch_check_dpsw_instance,
> +					&data);
> +	if (err) {
> +		NL_SET_ERR_MSG_MOD(extack,
> +				   "Interface from a different DPSW is in the bridge already");
> +		return -EINVAL;
> +	}
> +
> +	return 0;
> +}
> +
> +static int dpaa2_switch_pre_lag_join(struct net_device *netdev,
> +				     struct net_device *upper_dev,
> +				     struct netdev_lag_upper_info *info,
> +				     struct netlink_ext_ack *extack)
> +{
> +	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
> +	struct ethsw_core *ethsw = port_priv->ethsw_data;
> +	struct ethsw_port_priv *other_port_priv;
> +	struct dpaa2_switch_lag *lag = NULL;
> +	struct dpsw_lag_cfg cfg = {0};
> +	struct net_device *other_dev;
> +	int i, num_ifs = 0, err;
> +	struct list_head *iter;
> +
> +	if (!(ethsw->features & ETHSW_FEATURE_LAG_OFFLOAD)) {
> +		NL_SET_ERR_MSG_MOD(extack,
> +				   "LAG offload is supported only for DPSW >= v8.13");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (info->tx_type != NETDEV_LAG_TX_TYPE_HASH) {
> +		NL_SET_ERR_MSG_MOD(extack,
> +				   "Can only offload LAG using hash TX type");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (info->hash_type != NETDEV_LAG_HASH_L23) {
> +		NL_SET_ERR_MSG_MOD(extack, "Can only offload L2+L3 Tx hash");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	if (!dpaa2_switch_port_has_mac(port_priv)) {
> +		NL_SET_ERR_MSG_MOD(extack,
> +				   "Only switch interfaces connected to MACs can be under a LAG");
> +		return -EINVAL;
> +	}
> +
> +	if (vlan_uses_dev(upper_dev)) {
> +		NL_SET_ERR_MSG_MOD(extack,
> +				   "Cannot join a LAG upper that has a VLAN");
> +		return -EOPNOTSUPP;
> +	}
> +
> +	for (i = 0; i < ethsw->sw_attr.num_ifs; i++) {
> +		if (!ethsw->lags[i].in_use)
> +			continue;
> +		if (ethsw->lags[i].bond_dev != upper_dev)
> +			continue;
> +		lag = &ethsw->lags[i];
> +		break;
> +	}
> +
>  	netdev_for_each_lower_dev(upper_dev, other_dev, iter) {
>  		if (!dpaa2_switch_port_dev_check(other_dev))
>  			continue;
> @@ -2238,11 +2342,229 @@ dpaa2_switch_prechangeupper_sanity_checks(struct net_device *netdev,
>  		other_port_priv = netdev_priv(other_dev);
>  		if (other_port_priv->ethsw_data != port_priv->ethsw_data) {
>  			NL_SET_ERR_MSG_MOD(extack,
> -					   "Interface from a different DPSW is in the bridge already");
> +					   "Interface from a different DPSW is in the bond already");
> +			return -EINVAL;
> +		}
> +
> +		cfg.if_id[num_ifs++] = other_port_priv->idx;
> +
> +		if (num_ifs >= DPSW_MAX_LAG_IFS) {
> +			NL_SET_ERR_MSG_MOD(extack,
> +					   "Cannot add more than 8 DPAA2 switch ports under the same bond");
>  			return -EINVAL;
>  		}
>  	}
>  
> +	if (lag) {
> +		cfg.group_id = lag->id;
> +		cfg.if_id[num_ifs++] = port_priv->idx;
> +		cfg.num_ifs = num_ifs;
> +		cfg.phase = DPSW_LAG_SET_PHASE_CHECK;
> +
> +		err = dpsw_lag_set(ethsw->mc_io, 0, ethsw->dpsw_handle, &cfg);
> +		if (err) {
> +			NL_SET_ERR_MSG_MOD(extack,
> +					   "Cannot offload LAG configuration");
> +			return -EOPNOTSUPP;
> +		}
> +	}
> +
> +	return 0;
> +}
> +
> +static void dpaa2_switch_port_set_lag_group(struct ethsw_port_priv *port_priv,
> +					    struct net_device *bond_dev)
> +{
> +	struct ethsw_core *ethsw = port_priv->ethsw_data;
> +	struct ethsw_port_priv *other_port_priv = NULL;
> +	struct dpaa2_switch_lag *lag = NULL;
> +	struct dpaa2_switch_lag *other_lag;
> +	struct net_device *other_dev;
> +	struct list_head *iter;
> +
> +	netdev_for_each_lower_dev(bond_dev, other_dev, iter) {
> +		if (!dpaa2_switch_port_dev_check(other_dev))
> +			continue;
> +
> +		other_port_priv = netdev_priv(other_dev);
> +		other_lag = rtnl_dereference(other_port_priv->lag);
> +		if (!other_lag)
> +			continue;
> +
> +		if (other_lag->bond_dev == bond_dev) {
> +			rcu_assign_pointer(port_priv->lag, other_lag);
> +			return;
> +		}
> +	}
> +
> +	/* This is the first interface to be added under a bond device. Find an
> +	 * unused LAG group. No need to check for NULL since there are the same
> +	 * amount of DPSW ports as LAG groups, meaning that each port can have
> +	 * its own LAG group.
> +	 */
> +	lag = dpaa2_switch_lag_get_unused(ethsw);
> +	lag->in_use = true;
> +	lag->bond_dev = bond_dev;
> +	lag->primary = port_priv;
> +	rcu_assign_pointer(port_priv->lag, lag);
> +}
> +
> +static bool dpaa2_switch_port_in_lag(struct ethsw_port_priv *port_priv,
> +				     struct net_device *bond_dev)
> +{
> +	struct dpaa2_switch_lag *lag;
> +
> +	if (!port_priv)
> +		return false;
> +
> +	lag = rtnl_dereference(port_priv->lag);
> +	return lag && lag->bond_dev == bond_dev;
> +}
> +
> +static int dpaa2_switch_set_lag_cfg(struct net_device *bond_dev, u8 lag_id,
> +				    struct ethsw_core *ethsw)
> +{
> +	struct dpaa2_switch_lag *lag = &ethsw->lags[lag_id - 1];
> +	struct ethsw_port_priv *primary, *new_primary = NULL;
> +	struct ethsw_port_priv *port_priv = NULL;
> +	struct dpsw_lag_cfg cfg = {0};
> +	u8 num_ifs = 0;
> +	int err, i;
> +
> +	cfg.group_id = lag_id;
> +
> +	/* Determine the primary port. The caller clears ->lag on the port that
> +	 * is leaving, so a NULL ->lag on the current primary means it is the
> +	 * one leaving: elect the first remaining member as the new primary.
> +	 * Otherwise keep the current primary.
> +	 */
> +	if (rtnl_dereference(lag->primary->lag)) {
> +		primary = lag->primary;
> +	} else {
> +		primary = NULL;
> +		for (i = 0; i < ethsw->sw_attr.num_ifs; i++) {
> +			if (dpaa2_switch_port_in_lag(ethsw->ports[i], bond_dev)) {
> +				new_primary = ethsw->ports[i];
> +				primary = new_primary;
> +				break;
> +			}
> +		}
> +	}
> +
> +	/* Build the interface list, always placing the primary first */
> +	if (primary)
> +		cfg.if_id[num_ifs++] = primary->idx;
> +
> +	for (i = 0; i < ethsw->sw_attr.num_ifs; i++) {
> +		port_priv = ethsw->ports[i];
> +		if (port_priv == primary)
> +			continue;
> +		if (!dpaa2_switch_port_in_lag(port_priv, bond_dev))
> +			continue;
> +
> +		cfg.if_id[num_ifs++] = port_priv->idx;
> +	}
> +	cfg.num_ifs = num_ifs;
> +
> +	/* No more interfaces under this LAG group, mark it as not in use. Wait
> +	 * for a grace period so that any readers of the lag structure finished.
> +	 */
> +	if (!num_ifs) {
> +		synchronize_net();
> +
> +		lag->bond_dev = NULL;
> +		lag->primary = NULL;
> +		lag->in_use = false;
> +	}
> +
> +	err = dpsw_lag_set(ethsw->mc_io, 0, ethsw->dpsw_handle, &cfg);
> +	if (err)
> +		return err;
> +
> +	if (new_primary) {
> +		synchronize_net();
> +		lag->primary = new_primary;
> +	}
> +
> +	return 0;
> +}
> +
> +static int dpaa2_switch_port_bond_join(struct net_device *netdev,
> +				       struct net_device *bond_dev,
> +				       struct netdev_lag_upper_info *info,
> +				       struct netlink_ext_ack *extack)
> +{
> +	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
> +	struct ethsw_core *ethsw = port_priv->ethsw_data;
> +	struct net_device *bridge_dev;
> +	struct dpaa2_switch_lag *lag;
> +	int err = 0;
> +	u8 lag_id;
> +
> +	/* Setup the port_priv->lag pointer for this switch port */
> +	dpaa2_switch_port_set_lag_group(port_priv, bond_dev);
> +
> +	/* Create the LAG configuration and apply it in MC */
> +	lag = rtnl_dereference(port_priv->lag);
> +	lag_id = lag->id;
> +	err = dpaa2_switch_set_lag_cfg(bond_dev, lag_id, ethsw);
> +	if (err)
> +		goto err_lag_cfg;
> +
> +	/* If the bond device is a switch port, join the bridge as well */
> +	bridge_dev = netdev_master_upper_dev_get(bond_dev);
> +	if (!bridge_dev || !netif_is_bridge_master(bridge_dev))
> +		return 0;
> +
> +	err = dpaa2_switch_port_bridge_join(netdev, bridge_dev, extack);
> +	if (err)
> +		goto err_lag_cfg;
> +
> +	return err;
> +
> +err_lag_cfg:
> +	rcu_assign_pointer(port_priv->lag, NULL);
> +	dpaa2_switch_set_lag_cfg(bond_dev, lag_id, ethsw);
> +
> +	return err;
> +}
> +
> +static int dpaa2_switch_port_bond_leave(struct net_device *netdev,
> +					struct net_device *bond_dev)
> +{
> +	struct net_device *bridge_dev = netdev_master_upper_dev_get(bond_dev);
> +	struct ethsw_port_priv *port_priv = netdev_priv(netdev);
> +	struct dpaa2_switch_lag *lag = rtnl_dereference(port_priv->lag);
> +	struct ethsw_core *ethsw = port_priv->ethsw_data;
> +	struct net_device *brpdev;
> +	bool learn_ena;
> +	int err;
> +
> +	if (!lag)
> +		return 0;
> +
> +	/* Recreate the LAG configuration for the LAG group that we left. */
> +	rcu_assign_pointer(port_priv->lag, NULL);
> +	dpaa2_switch_set_lag_cfg(bond_dev, lag->id, ethsw);
> +
> +	if (bridge_dev && netif_is_bridge_master(bridge_dev)) {
> +		/* Make sure that the new primary inherits the learning state */
> +		if (lag->primary) {
> +			brpdev = dpaa2_switch_port_to_bridge_port(lag->primary);
> +			learn_ena = br_port_flag_is_set(brpdev, BR_LEARNING);
> +			err = dpaa2_switch_port_set_learning(lag->primary,
> +							     learn_ena);
> +			if (err)
> +				return err;
> +			lag->primary->learn_ena = learn_ena;
> +		}
> +
> +		/* In case the bond is a bridge port, leave the upper bridge as
> +		 * well.
> +		 */
> +		return dpaa2_switch_port_bridge_leave(netdev);
> +	}
> +
>  	return 0;
>  }
>  
> @@ -2250,8 +2572,8 @@ static int dpaa2_switch_port_prechangeupper(struct net_device *netdev,
>  					    struct netdev_notifier_changeupper_info *info)
>  {
>  	struct ethsw_port_priv *port_priv;
> +	struct net_device *upper_dev, *br;
>  	struct netlink_ext_ack *extack;
> -	struct net_device *upper_dev;
>  	int err;
>  
>  	if (!dpaa2_switch_port_dev_check(netdev))
> @@ -2268,6 +2590,24 @@ static int dpaa2_switch_port_prechangeupper(struct net_device *netdev,
>  
>  		if (!info->linking)
>  			dpaa2_switch_port_pre_bridge_leave(netdev);
> +	} else if (netif_is_lag_master(upper_dev)) {
> +		if (!info->linking) {
> +			if (netif_is_bridge_port(upper_dev))
> +				dpaa2_switch_port_pre_bridge_leave(netdev);
> +			return 0;
> +		}
> +

sashiko-nipa notes:


	When a single DPAA2 port leaves a bond that itself is a bridge port,
	dpaa2_switch_port_pre_bridge_leave(netdev) is called unconditionally,
	regardless of whether other DPAA2 ports still remain in the same bond.

	Inside dpaa2_switch_port_pre_bridge_leave(), the bridge port being
	unoffloaded is computed by dpaa2_switch_port_to_bridge_port(), which
	now returns the bond:

		lag = rtnl_dereference(port_priv->lag);
		if (lag)
			return lag->bond_dev;
		return port_priv->netdev;

	So switchdev_bridge_port_unoffload(bond_dev, NULL, NULL, NULL) is
	issued for every leaving member. Since the matching join path also
	calls switchdev_bridge_port_offload(bond_dev, netdev, NULL, ...) per
	member with the same brport_dev and a NULL ctx, the bridge layer has
	no per-port handle either.

	When the first of several bonded DPAA2 ports leaves, this dispatches
	SWITCHDEV_BRPORT_UNOFFLOADED for the bond while the remaining members
	still rely on the bond being offloaded.

	Should the unoffload only happen when the last DPAA2 port leaves the
	bond, similar to how lan966x tracks per-port bridge offload state?

No, switchdev_bridge_port_offload() and
switchdev_bridge_port_unoffload() can be called multiple times for the
same bridge port, see nbp_switchdev_add():

	/* Tolerate drivers that call switchdev_bridge_port_offload()
	 * more than once for the same bridge port, such as when the
	 * bridge port is an offloaded bonding/team interface.
	 */ 
	p->offload_count++;

The ctx parameter being NULL in this patch does not have any effect on
the offload_count shown above. A proper ctx parameter is provided in the
next patch when we add support for FDBs on bond devices.

Ioana

^ permalink raw reply

* Re: [PATCH net-next 0/8] drivers/net: replace __get_free_pages() with kmalloc()
From: Jakub Kicinski @ 2026-06-30 14:23 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft)
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
	Brian Norris, Edward Cree, Francesco Dolcini, Manish Chopra,
	Przemek Kitszel, Sudarsana Kalluru, Tony Nguyen, b43-dev,
	intel-wired-lan, libertas-dev, linux-kernel, linux-mm,
	linux-net-drivers, linux-wireless, netdev
In-Reply-To: <20260630-b4-drivers-net-v1-0-672162a91f37@kernel.org>

On Tue, 30 Jun 2026 13:59:19 +0300 Mike Rapoport (Microsoft) wrote:
>  drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c    |  6 +--
>  drivers/net/ethernet/intel/ice/ice_gnss.c         |  5 +-
>  drivers/net/ethernet/sfc/mcdi.c                   |  7 +--
>  drivers/net/ethernet/sfc/siena/mcdi.c             |  7 +--
>  drivers/net/wireless/broadcom/b43/debugfs.c       | 12 ++---
>  drivers/net/wireless/broadcom/b43legacy/debugfs.c | 11 ++--
>  drivers/net/wireless/marvell/libertas/debugfs.c   | 39 ++++++--------
>  drivers/net/wireless/marvell/mwifiex/debugfs.c    | 62 ++++++++++-------------
>  drivers/net/wireless/ti/wlcore/main.c             | 14 +++--

You gotta split this, wireless and ethernet go via separate trees.
BTW cocci also suggests folding in a memset, IDK if it's worth it.

drivers/net/wireless/broadcom/b43legacy/debugfs.c:217:8-15: WARNING: kzalloc should be used for buf, instead of kmalloc/memset
-- 
pw-bot: cr

^ permalink raw reply

* Re: [PATCH net-next v4 10/13] dpaa2-switch: offload FDBs added on an upper bond device
From: Ioana Ciornei @ 2026-06-30 14:30 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-11-ioana.ciornei@nxp.com>

On Mon, Jun 29, 2026 at 02:23:06PM +0300, Ioana Ciornei wrote:
> This patch adds support for offloading FDB entries added on upper bond
> devices.
> 
> First of all, the call to switchdev_bridge_port_offload() is updated so
> that the notifier blocks needed for FDB events replay are available to
> the bridge core.
> 
> Using switchdev_handle_*() helpers is also necessary because each FDB
> event needs to be fanned out to any DPAA2 switch lower device. This
> triggers another change in the return type used by the
> dpaa2_switch_port_fdb_event() - from notifier types to regular errno
> types.
> 
> Handling of the SWITCHDEV_FDB_ADD_TO_DEVICE/SWITCHDEV_FDB_DEL_TO_DEVICE
> events is updated so that the newly added dpaa2_switch_lag_fdb_add() /
> dpaa2_switch_lag_fdb_del() functions are called anytime a port is under
> a bond device. This will allow us to manage refcounting on FDB entries
> which are added on the upper bond devices.
> 
> The DPAA2 switch uses shared-VLAN learning which means that the vid
> parameter is not used when adding an FDB entry to HW. The current
> behavior when dealing with FDB entries with the same MAC address but
> different VLANs is to add the entry to HW every time while removal will
> get done on the first 'bridge fdb del' command issued by the user.
> 
> The same behavior is kept also for FDBs added on bond devices by keeping
> the refcount on the {vid, addr} pair while the HW operation disregards
> entirely the vid parameter.
> 
> Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
> ---
> Changes in v4:
> - Migrate FDBs in case the primary interface of a LAG changes.
> - Use lag->primary instead of determining each time the primary
> interface of a LAG device
> 
> Changes in v3:
> - Update dpaa2_switch_foreign_dev_check() so that we check if there is
> any port in the same switch as dev which offloads foreign_dev in case
> this is a bridge port.
> - Add mutex_destroy on the per LAG fdb_lock
> - Make sure that all FDB events were processed on the workqueue on the
> .remove() path.
> - Delete the refcounted entry in dpaa2_switch_lag_fdb_del() as soon as
> possible, even if the HW deletion would fail
> - Access the port_priv->lag field only through the proper rcu accessors.
> 
> Changes in v2:
> - Update dpaa2_switch_foreign_dev_check() so that we check if between
> the switch port and the foreign net_device is an offloaded path. Before
> this change we also checked if the foreign_dev was offloaded or not by
> the switch port.
> - Update the switchdev_bridge_port_unoffload() by passing it the proper
> context and the notifier blocks.
> - Add dev_hold() and dev_put() calls for orig_dev
> ---
>  .../ethernet/freescale/dpaa2/dpaa2-switch.c   | 227 ++++++++++++++++--
>  .../ethernet/freescale/dpaa2/dpaa2-switch.h   |  24 ++
>  2 files changed, 225 insertions(+), 26 deletions(-)
> 
> diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
> index 949a7241a00f..307b3b7a1bfb 100644
> --- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
> +++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
> @@ -25,6 +25,9 @@
>  
>  #define DEFAULT_VLAN_ID			1
>  
> +static struct notifier_block dpaa2_switch_port_switchdev_nb;
> +static struct notifier_block dpaa2_switch_port_switchdev_blocking_nb;
> +
>  static u16 dpaa2_switch_port_get_fdb_id(struct ethsw_port_priv *port_priv)
>  {
>  	return port_priv->fdb->fdb_id;
> @@ -585,6 +588,81 @@ static int dpaa2_switch_port_fdb_del(struct ethsw_port_priv *port_priv,
>  		return dpaa2_switch_port_fdb_del_mc(port_priv, addr);
>  }
>  
> +static struct dpaa2_mac_addr *
> +dpaa2_switch_mac_addr_find(struct list_head *addr_list,
> +			   const unsigned char *addr, u16 vid)
> +{
> +	struct dpaa2_mac_addr *a;
> +
> +	list_for_each_entry(a, addr_list, list)
> +		if (ether_addr_equal(a->addr, addr) && a->vid == vid)
> +			return a;
> +
> +	return NULL;
> +}
> +
> +static int dpaa2_switch_lag_fdb_add(struct dpaa2_switch_lag *lag,
> +				    const unsigned char *addr, u16 vid)
> +{
> +	struct ethsw_port_priv *port_priv = lag->primary;
> +	struct dpaa2_mac_addr *a;
> +	int err = 0;
> +
> +	mutex_lock(&lag->fdb_lock);
> +
> +	a = dpaa2_switch_mac_addr_find(&lag->fdbs, addr, vid);
> +	if (a) {
> +		refcount_inc(&a->refcount);
> +		goto out;
> +	}
> +
> +	a = kzalloc(sizeof(*a), GFP_KERNEL);
> +	if (!a) {
> +		err = -ENOMEM;
> +		goto out;
> +	}
> +
> +	err = dpaa2_switch_port_fdb_add(port_priv, addr);
> +	if (err) {
> +		kfree(a);
> +		goto out;
> +	}
> +
> +	ether_addr_copy(a->addr, addr);
> +	a->vid = vid;
> +	refcount_set(&a->refcount, 1);
> +	list_add_tail(&a->list, &lag->fdbs);
> +
> +out:
> +	mutex_unlock(&lag->fdb_lock);
> +
> +	return err;
> +}
> +
> +static void dpaa2_switch_lag_fdb_del(struct dpaa2_switch_lag *lag,
> +				     const unsigned char *addr, u16 vid)
> +{
> +	struct ethsw_port_priv *port_priv = lag->primary;
> +	struct dpaa2_mac_addr *a;
> +
> +	mutex_lock(&lag->fdb_lock);
> +
> +	a = dpaa2_switch_mac_addr_find(&lag->fdbs, addr, vid);
> +	if (!a)
> +		goto out;
> +
> +	if (!refcount_dec_and_test(&a->refcount))
> +		goto out;
> +
> +	list_del(&a->list);
> +	kfree(a);
> +
> +	dpaa2_switch_port_fdb_del(port_priv, addr);
> +
> +out:
> +	mutex_unlock(&lag->fdb_lock);
> +}
> +
>  static void dpaa2_switch_port_get_stats(struct net_device *netdev,
>  					struct rtnl_link_stats64 *stats)
>  {
> @@ -1533,6 +1611,33 @@ bool dpaa2_switch_port_dev_check(const struct net_device *netdev)
>  	return netdev->netdev_ops == &dpaa2_switch_port_ops;
>  }
>  
> +static bool dpaa2_switch_foreign_dev_check(const struct net_device *dev,
> +					   const struct net_device *foreign_dev)
> +{
> +	struct ethsw_port_priv *port_priv = netdev_priv(dev);
> +	struct ethsw_core *ethsw = port_priv->ethsw_data;
> +	struct ethsw_port_priv *other_port;
> +	int i;
> +
> +	if (netif_is_bridge_master(foreign_dev))
> +		if (port_priv->fdb->bridge_dev == foreign_dev)
> +			return false;
> +
> +	if (netif_is_bridge_port(foreign_dev)) {
> +		for (i = 0; i < ethsw->sw_attr.num_ifs; i++) {
> +			other_port = ethsw->ports[i];
> +
> +			if (!other_port)
> +				continue;
> +			if (dpaa2_switch_port_offloads_bridge_port(other_port,
> +								   foreign_dev))
> +				return false;
> +		}
> +	}
> +
> +	return true;
> +}
> +
>  static int dpaa2_switch_port_connect_mac(struct ethsw_port_priv *port_priv)
>  {
>  	struct fsl_mc_device *dpsw_port_dev, *dpmac_dev;
> @@ -2100,8 +2205,10 @@ static int dpaa2_switch_port_bridge_join(struct net_device *netdev,
>  		goto err_egress_flood;
>  
>  	brport_dev = dpaa2_switch_port_to_bridge_port(port_priv);
> -	err = switchdev_bridge_port_offload(brport_dev, netdev, NULL,
> -					    NULL, NULL, false, extack);
> +	err = switchdev_bridge_port_offload(brport_dev, netdev, port_priv,
> +					    &dpaa2_switch_port_switchdev_nb,
> +					    &dpaa2_switch_port_switchdev_blocking_nb,
> +					    false, extack);
>  	if (err)
>  		goto err_switchdev_offload;
>  
> @@ -2143,7 +2250,9 @@ static void dpaa2_switch_port_pre_bridge_leave(struct net_device *netdev)
>  	if (!brport_dev)
>  		return;
>  
> -	switchdev_bridge_port_unoffload(brport_dev, NULL, NULL, NULL);
> +	switchdev_bridge_port_unoffload(brport_dev, port_priv,
> +					&dpaa2_switch_port_switchdev_nb,
> +					&dpaa2_switch_port_switchdev_blocking_nb);
>  
>  	/* Make sure that any FDB add/del operations are completed before the
>  	 * bridge layout changes
> @@ -2425,9 +2534,10 @@ static int dpaa2_switch_set_lag_cfg(struct net_device *bond_dev, u8 lag_id,
>  				    struct ethsw_core *ethsw)
>  {
>  	struct dpaa2_switch_lag *lag = &ethsw->lags[lag_id - 1];
> -	struct ethsw_port_priv *primary, *new_primary = NULL;
> -	struct ethsw_port_priv *port_priv = NULL;
> +	struct ethsw_port_priv *primary, *port_priv;
> +	struct ethsw_port_priv *new_primary = NULL;
>  	struct dpsw_lag_cfg cfg = {0};
> +	struct dpaa2_mac_addr *a;
>  	u8 num_ifs = 0;
>  	int err, i;
>  
> @@ -2454,7 +2564,6 @@ static int dpaa2_switch_set_lag_cfg(struct net_device *bond_dev, u8 lag_id,
>  	/* Build the interface list, always placing the primary first */
>  	if (primary)
>  		cfg.if_id[num_ifs++] = primary->idx;
> -
>  	for (i = 0; i < ethsw->sw_attr.num_ifs; i++) {
>  		port_priv = ethsw->ports[i];
>  		if (port_priv == primary)
> @@ -2477,11 +2586,32 @@ static int dpaa2_switch_set_lag_cfg(struct net_device *bond_dev, u8 lag_id,
>  		lag->in_use = false;
>  	}
>  
> +	/* When the primary changes, migrate the FDB entries from the old
> +	 * primary to the new one: remove them before reconfiguring the LAG in
> +	 * hardware and re-add them on the new primary afterwards. We do not
> +	 * touch any refcounting since the intention is to change the HW entry,
> +	 * not the parallel software tracking.
> +	 */
> +	if (new_primary) {
> +		mutex_lock(&lag->fdb_lock);
> +		list_for_each_entry(a, &lag->fdbs, list)
> +			dpaa2_switch_port_fdb_del(lag->primary, a->addr);
> +		mutex_unlock(&lag->fdb_lock);
> +	}
> +
>  	err = dpsw_lag_set(ethsw->mc_io, 0, ethsw->dpsw_handle, &cfg);
>  	if (err)
>  		return err;
>  

sashiko-nipa notes:

	[High, Medium] When the last port leaves the bond, the block
	above sets

	  if (!num_ifs) {
	  	synchronize_net();

	  	lag->bond_dev = NULL;
	  	lag->primary = NULL;
	  	lag->in_use = false;
	  }

	Can a queued workqueue item still race with this teardown?
	Looking at dpaa2_switch_event_work():

	  rcu_read_lock();
	  lag = rcu_dereference(port_priv->lag);
	  rcu_read_unlock();

	  switch (switchdev_work->event) {
	  case SWITCHDEV_FDB_ADD_TO_DEVICE:
	  	if (lag)
	  		err = dpaa2_switch_lag_fdb_add(lag, fdb_info->addr, ...);

	The RCU read section ends before the lag is used, so the
	synchronize_net() in set_lag_cfg returns immediately without waiting
	for the work. dpaa2_switch_lag_fdb_add() then reads lag->primary while
	holding only fdb_lock, which the writer does not take. If lag->primary
	has been set to NULL by the writer, port_priv = lag->primary; ... in
	dpaa2_switch_lag_fdb_add() will dereference NULL through
	dpaa2_switch_port_fdb_add() -> dpaa2_switch_port_fdb_add_uc() reading
	port_priv->idx and port_priv->ethsw_data.

	The in-file comment claims the lag pointer staying alive is enough, but
	lag->primary is a separately mutable field with no shared lock between
	this reader and the writer. Should lag->primary itself be protected by
	fdb_lock (or by the rtnl/RCU pattern actually waited on), or should
	the bond-leave path flush_workqueue() before clearing primary?

	A related window exists during the primary migration below: between
	unlock of fdb_lock after the add-loop and the lag->primary = new_primary
	store, a concurrent work item can still observe the OLD primary value
	and install entries on it while the HW LAG is being reconfigured. Is
	that intentional?

Not correct. As stated in the commit message for 2/13, any concurrency
between on-going work items and changeupper events is resolved by
flushing the workqueue from the prechangeupper event.

Ioana

^ permalink raw reply

* Re: [PATCH net-next v3 5/5] selftest: Add tests for useful handling of LSM denials on SCM_RIGHTS
From: Jori Koolstra @ 2026-06-30 14:35 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Christian Brauner, Aleksa Sarai, Kuniyuki Iwashima,
	David S . Miller, Eric Dumazet, Paolo Abeni, Simon Horman, netdev,
	linux-fsdevel, linux-kernel
In-Reply-To: <20260630071701.6b583d1e@kernel.org>

> Op 30-06-2026 16:17 CEST schreef Jakub Kicinski <kuba@kernel.org>:
> 
>  
> On Mon, 29 Jun 2026 21:43:27 +0200 Jori Koolstra wrote:
> > The test uses the following Smack labels:
> > 
> >    "Sender"   - label for the sending process
> >    "Receiver" - label for the receiving process
> >    "SecretX"   - labels for the files being passed
> 
> Not sure this test belongs in net/
> 99.9% of people running this test do not use Smack.
> At the very least you need to use XFAIL instead of SKIP
> we use skip for problems with the env which are fixable,
> like a command missing.

Ah, right, because you can only use one of these LSMs at a time?
I mean one of AppArmor, SELinux, Smack, TOMOYO.

I just need some LSM to trigger the reject of security_file_receive()
and Smack was the easiest to get going. The series is totally agnostic
to the used LSM. I am fine with moving the tests elsewhere or porting
them to SELinux if that is really necessary. We could also drop them
altogether.

What do you propose?

Thanks,
Jori.

^ permalink raw reply

* Re: [PATCH iproute2-next v2 2/2] devlink: support u64-array values in devlink param show/set
From: David Ahern @ 2026-06-30 14:36 UTC (permalink / raw)
  To: Ratheesh Kannoth, stephen, kuba, linux-kernel, netdev
  Cc: andrew+netdev, edumazet, pabeni, jiri
In-Reply-To: <20260630015012.3728870-3-rkannoth@marvell.com>

On 6/29/26 7:50 PM, Ratheesh Kannoth wrote:
> diff --git a/devlink/devlink.c b/devlink/devlink.c
> index 9372e92f..3c29601d 100644
> --- a/devlink/devlink.c
> +++ b/devlink/devlink.c
> @@ -3496,13 +3496,115 @@ static const struct param_val_conv param_val_conv[] = {
>  };
>  
>  #define PARAM_VAL_CONV_LEN ARRAY_SIZE(param_val_conv)
> +#define DEVLINK_PARAM_MAX_ARRAY_SIZE 32

Why 32? Is that based on current code? How does the kernel side handle
the number of parameters? What happens if the kernel sends more than 32
parameters - from a user's perspective, not this code and processing the
output?

> +
> +struct devlink_param_u64_array {
> +	uint64_t size;
> +	uint64_t val[DEVLINK_PARAM_MAX_ARRAY_SIZE];
> +};
> +
> +static int param_value_nested_u64_attr_cb(const struct nlattr *attr, void *data)
> +{
> +	struct devlink_param_u64_array *arr = data;
> +	unsigned int len;
> +
> +	if (mnl_attr_get_type(attr) != DEVLINK_ATTR_PARAM_VALUE_DATA)
> +		return MNL_CB_OK;
> +
> +	if (arr->size >= DEVLINK_PARAM_MAX_ARRAY_SIZE)
> +		return MNL_CB_ERROR;
> +
> +	len = mnl_attr_get_payload_len(attr);
> +	if (len == sizeof(uint32_t))
> +		arr->val[arr->size++] = mnl_attr_get_u32(attr);
> +	else if (len == sizeof(uint64_t))
> +		arr->val[arr->size++] = mnl_attr_get_u64(attr);
> +	else
> +		return MNL_CB_ERROR;
> +
> +	return MNL_CB_OK;
> +}
> +
> +static int param_value_u64_array_fill(struct nlattr *nl,
> +				      struct devlink_param_u64_array *arr)
> +{
> +	int err;
> +
> +	arr->size = 0;
> +	err = mnl_attr_parse_nested(nl, param_value_nested_u64_attr_cb, arr);
> +	if (err != MNL_CB_OK)
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
> +static bool param_value_u64_array_equal(const struct devlink_param_u64_array *a,
> +					const struct devlink_param_u64_array *b)
> +{
> +	uint64_t i;
> +
> +	if (a->size != b->size)
> +		return false;
> +
> +	for (i = 0; i < a->size; i++) {
> +		if (a->val[i] != b->val[i])
> +			return false;
> +	}
> +
> +	return true;
> +}
> +
> +static int param_value_u64_array_put_from_str(struct nlmsghdr *nlh,
> +					      const char *param_value,
> +					      const struct devlink_param_u64_array *cur)
> +{
> +	struct devlink_param_u64_array new_arr = {};
> +	char *copy, *token, *saveptr = NULL;
> +	char delim[] = " ,";
> +	uint64_t val;
> +	int err;
> +
> +	copy = strdup(param_value);
> +	if (!copy)
> +		return -ENOMEM;
> +
> +	token = strtok_r(copy, delim, &saveptr);
> +	while (token) {
> +		if (new_arr.size >= DEVLINK_PARAM_MAX_ARRAY_SIZE) {
> +			free(copy);
> +			pr_err("Too many array elements (max %d)\n",
> +			       DEVLINK_PARAM_MAX_ARRAY_SIZE);
> +			return -EINVAL;
> +		}
> +		err = get_u64((__u64 *)&val, token, 10);
> +		if (err) {
> +			free(copy);
> +			pr_err("Value \"%s\" is not a number or not within range\n",
> +			       token);
> +			return err;
> +		}
> +		new_arr.val[new_arr.size++] = val;
> +		token = strtok_r(NULL, delim, &saveptr);
> +	}
> +	free(copy);
> +
> +	if (cur && param_value_u64_array_equal(&new_arr, cur))
> +		return 1;
> +
> +	for (uint64_t i = 0; i < new_arr.size; i++)

put the declaration at the top of the function with the rest of them.
global comment; fix all of them.

> +		mnl_attr_put_u64(nlh, DEVLINK_ATTR_PARAM_VALUE_DATA, new_arr.val[i]);

Why can't this put be done in the loop above as the string is processed?

> +
> +	return 0;
> +}
>  
>  static int pr_out_param_value_print(const char *nla_name, int nla_type,
>  				     struct nlattr *val_attr, bool conv_exists,
> -				     const char *label, bool flag_as_u8)
> +				     const char *label, bool flag_as_u8, struct nlattr *nl)
>  {
> +	struct devlink_param_u64_array u64_arr = { };
>  	const char *vstr;
> -	int err;
> +	char buffer[1024];
> +	int err, cnt = 0;
>  
>  	print_string(PRINT_FP, NULL, " %s ", label);
>  


^ permalink raw reply

* Re: [PATCH net 0/3] Fix broken TC_ACT_REDIRECT from qdiscs
From: Sebastian Andrzej Siewior @ 2026-06-30 14:37 UTC (permalink / raw)
  To: Daniel Borkmann; +Cc: kuba, pabeni, jhs, andrii, memxor, bpf, netdev
In-Reply-To: <20260630123331.186840-1-daniel@iogearbox.net>

On 2026-06-30 14:33:28 [+0200], Daniel Borkmann wrote:
> This is an alternative fix to [0] in order to not uglify
> __dev_queue_xmit() with sprinkled ifdefs given this can be
> simplified and isolated through a simple test into the BPF
> redirect helper itself.
> 
> I've also added a proper BPF selftest, so there is no need
> to check-in a binary BPF object into selftests given we do
> have BPF infra for all of this.

1/3 makes sense. Assuming we wouldn't have this per-task memory
assignment, wouldn't then the state from one redirect leak into another?

For what it's worth:
Reviewed-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>

Sebastian

^ permalink raw reply

* Re: [PATCH 2/3] arm64: dts: socfpga: agilex5: Add SoCDK TSN Config2 board
From: Nazle Asmade, Muhammad Nazim Amirul @ 2026-06-30 14:39 UTC (permalink / raw)
  To: Andrew Lunn
  Cc: dinguyen@kernel.org, maxime.chevallier@bootlin.com,
	rmk+kernel@armlinux.org.uk, krzk+dt@kernel.org,
	conor+dt@kernel.org, robh@kernel.org, davem@davemloft.net,
	edumazet@google.com, kuba@kernel.org, pabeni@redhat.com,
	andrew+netdev@lunn.ch, devicetree@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org
In-Reply-To: <edf84080-a5e0-478c-9977-af2376cb71c5@lunn.ch>

On 30/6/2026 9:58 pm, Andrew Lunn wrote:
>> + * gmac1 is the TSN port. The MAC operates in GMII mode internally
>> + * while the PHY-side interface is RGMII, so mac-mode and phy-mode differ.
>> + */
>> +&gmac1 {
>> +	status = "okay";
>> +	phy-mode = "rgmii"; /* TX/RX clock delays provided by Agilex5 I/O hardware */
> Could you provide more details about this. I want to understand the
> big picture.
> 
> Normally we talk about the PCB providing the delays. This sounds like
> it is the FPGA? So i need convincing this is correct.
Hi Andrew,

Thanks for your quick review and yes, it is the FPGA — specifically a 
soft IP block in the FPGA fabric that implements the RGMII clock delays 
and is configured before Linux boots via the FPGA bitstream. The driver 
must not add additional delays on top.

BR,
Nazim

^ permalink raw reply

* Re: [PATCH net-next 0/8] drivers/net: replace __get_free_pages() with kmalloc()
From: Mike Rapoport @ 2026-06-30 14:40 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Paolo Abeni,
	Brian Norris, Edward Cree, Francesco Dolcini, Manish Chopra,
	Przemek Kitszel, Sudarsana Kalluru, Tony Nguyen, b43-dev,
	intel-wired-lan, libertas-dev, linux-kernel, linux-mm,
	linux-net-drivers, linux-wireless, netdev
In-Reply-To: <20260630072344.159b5d99@kernel.org>

On Tue, Jun 30, 2026 at 07:23:44AM -0700, Jakub Kicinski wrote:
> On Tue, 30 Jun 2026 13:59:19 +0300 Mike Rapoport (Microsoft) wrote:
> >  drivers/net/ethernet/broadcom/bnx2x/bnx2x_sp.c    |  6 +--
> >  drivers/net/ethernet/intel/ice/ice_gnss.c         |  5 +-
> >  drivers/net/ethernet/sfc/mcdi.c                   |  7 +--
> >  drivers/net/ethernet/sfc/siena/mcdi.c             |  7 +--
> >  drivers/net/wireless/broadcom/b43/debugfs.c       | 12 ++---
> >  drivers/net/wireless/broadcom/b43legacy/debugfs.c | 11 ++--
> >  drivers/net/wireless/marvell/libertas/debugfs.c   | 39 ++++++--------
> >  drivers/net/wireless/marvell/mwifiex/debugfs.c    | 62 ++++++++++-------------
> >  drivers/net/wireless/ti/wlcore/main.c             | 14 +++--
> 
> You gotta split this, wireless and ethernet go via separate trees.

Sure.

> BTW cocci also suggests folding in a memset, IDK if it's worth it.

Same churn, fewer lines :)
 
> drivers/net/wireless/broadcom/b43legacy/debugfs.c:217:8-15: WARNING: kzalloc should be used for buf, instead of kmalloc/memset
> -- 
> pw-bot: cr

-- 
Sincerely yours,
Mike.

^ permalink raw reply

* Re: [PATCH net-next v4 10/13] dpaa2-switch: offload FDBs added on an upper bond device
From: Ioana Ciornei @ 2026-06-30 14:41 UTC (permalink / raw)
  To: andrew+netdev, davem, edumazet, kuba, pabeni, netdev; +Cc: linux-kernel
In-Reply-To: <20260629112309.154328-11-ioana.ciornei@nxp.com>

On Mon, Jun 29, 2026 at 02:23:06PM +0300, Ioana Ciornei wrote:
> This patch adds support for offloading FDB entries added on upper bond
> devices.
> 
> First of all, the call to switchdev_bridge_port_offload() is updated so
> that the notifier blocks needed for FDB events replay are available to
> the bridge core.
> 
> Using switchdev_handle_*() helpers is also necessary because each FDB
> event needs to be fanned out to any DPAA2 switch lower device. This
> triggers another change in the return type used by the
> dpaa2_switch_port_fdb_event() - from notifier types to regular errno
> types.
> 
> Handling of the SWITCHDEV_FDB_ADD_TO_DEVICE/SWITCHDEV_FDB_DEL_TO_DEVICE
> events is updated so that the newly added dpaa2_switch_lag_fdb_add() /
> dpaa2_switch_lag_fdb_del() functions are called anytime a port is under
> a bond device. This will allow us to manage refcounting on FDB entries
> which are added on the upper bond devices.
> 
> The DPAA2 switch uses shared-VLAN learning which means that the vid
> parameter is not used when adding an FDB entry to HW. The current
> behavior when dealing with FDB entries with the same MAC address but
> different VLANs is to add the entry to HW every time while removal will
> get done on the first 'bridge fdb del' command issued by the user.
> 
> The same behavior is kept also for FDBs added on bond devices by keeping
> the refcount on the {vid, addr} pair while the HW operation disregards
> entirely the vid parameter.
> 
> Signed-off-by: Ioana Ciornei <ioana.ciornei@nxp.com>
> ---
> Changes in v4:
> - Migrate FDBs in case the primary interface of a LAG changes.
> - Use lag->primary instead of determining each time the primary
> interface of a LAG device
> 
> Changes in v3:
> - Update dpaa2_switch_foreign_dev_check() so that we check if there is
> any port in the same switch as dev which offloads foreign_dev in case
> this is a bridge port.
> - Add mutex_destroy on the per LAG fdb_lock
> - Make sure that all FDB events were processed on the workqueue on the
> .remove() path.
> - Delete the refcounted entry in dpaa2_switch_lag_fdb_del() as soon as
> possible, even if the HW deletion would fail
> - Access the port_priv->lag field only through the proper rcu accessors.
> 
> Changes in v2:
> - Update dpaa2_switch_foreign_dev_check() so that we check if between
> the switch port and the foreign net_device is an offloaded path. Before
> this change we also checked if the foreign_dev was offloaded or not by
> the switch port.
> - Update the switchdev_bridge_port_unoffload() by passing it the proper
> context and the notifier blocks.
> - Add dev_hold() and dev_put() calls for orig_dev
> ---

(...)

> @@ -2454,7 +2564,6 @@ static int dpaa2_switch_set_lag_cfg(struct net_device *bond_dev, u8 lag_id,
>  	/* Build the interface list, always placing the primary first */
>  	if (primary)
>  		cfg.if_id[num_ifs++] = primary->idx;
> -
>  	for (i = 0; i < ethsw->sw_attr.num_ifs; i++) {
>  		port_priv = ethsw->ports[i];
>  		if (port_priv == primary)
> @@ -2477,11 +2586,32 @@ static int dpaa2_switch_set_lag_cfg(struct net_device *bond_dev, u8 lag_id,
>  		lag->in_use = false;
>  	}

sashiko.dev notes:

	Does this leak the dynamically allocated items in the lag->fdbs
	list?  When the last interface leaves a LAG, lag->in_use is set
	to false here, but the elements in lag->fdbs are not freed.

The mechanism initiated by nbp_switchdev_unsync_objs() will remove all
the lag->fdbs automatically without the need to add extra code which
frees manually all entries.

>  
> +	/* When the primary changes, migrate the FDB entries from the old
> +	 * primary to the new one: remove them before reconfiguring the LAG in
> +	 * hardware and re-add them on the new primary afterwards. We do not
> +	 * touch any refcounting since the intention is to change the HW entry,
> +	 * not the parallel software tracking.
> +	 */
> +	if (new_primary) {
> +		mutex_lock(&lag->fdb_lock);
> +		list_for_each_entry(a, &lag->fdbs, list)
> +			dpaa2_switch_port_fdb_del(lag->primary, a->addr);
> +		mutex_unlock(&lag->fdb_lock);
> +	}

sashiko.dev notes:

	Does dropping the fdb_lock here open a race window for leaking
	FDB entries?  While the lock is dropped, a concurrent
	asynchronous work item could execute dpaa2_switch_lag_fdb_add(),
	acquire fdb_lock, add a new FDB entry to the old lag->primary in
	hardware, and append it to the software lag->fdbs list.

	When this thread re-acquires the lock below, it will loop over
	lag->fdbs and add all entries to the new primary. The new entry
	would then be programmed on both the old and new primary ports.
	Later, a delete operation would only remove it from the new
	primary, permanently leaking the hardware entry on the old port.

The LAG primary changes only on bridge leave which will not run
concurrently with any workqueue events since there is a
flush_workqueue() in the pre bridge leave event.

>  static void dpaa2_switch_event_work(struct work_struct *work)
>  {
>  	struct ethsw_switchdev_event_work *switchdev_work =
>  		container_of(work, struct ethsw_switchdev_event_work, work);
> +	struct net_device *orig_dev = switchdev_work->orig_dev;
>  	struct net_device *dev = switchdev_work->dev;
> +	struct ethsw_port_priv *port_priv = netdev_priv(dev);
>  	struct switchdev_notifier_fdb_info *fdb_info;
> +	struct dpaa2_switch_lag *lag;
>  	int err;
>  
>  	fdb_info = &switchdev_work->fdb_info;
>  
> +	/* The lag structures are freed only from dpaa2_switch_remove(), which
> +	 * first flushes this workqueue, so the pointer stays valid for the
> +	 * lifetime of the work item. Only the dereference needs the RCU
> +	 * read-side lock; the FDB helpers below can sleep and must run outside
> +	 * of it.
> +	 */
> +	rcu_read_lock();
> +	lag = rcu_dereference(port_priv->lag);
> +	rcu_read_unlock();

sashiko.dev notes:

	Does this sequence leave lag unprotected against concurrent
	teardown?

	By dropping the RCU read lock here, a concurrent call to
	dpaa2_switch_set_lag_cfg() could set lag->primary to NULL and
	call synchronize_net(). Since the RCU read lock is dropped,
	synchronize_net() would not wait for this work item to finish.

	If the work item then resumes, dpaa2_switch_lag_fdb_add() reads
	lag->primary locklessly:

	dpaa2_switch_lag_fdb_add() {
		...
		struct ethsw_port_priv *port_priv = lag->primary;
		err = dpaa2_switch_port_fdb_add(port_priv, addr);

	This would pass a NULL port_priv to dpaa2_switch_port_fdb_add(),
	potentially causing a NULL pointer dereference.

Again, the flush_workqueue() avoids this kind of concurrency.

Ioana

^ permalink raw reply

* Re: [PATCH v4] bpf: Fix smp_processor_id() call trace for preemptible kernels
From: bot+bpf-ci @ 2026-06-30 14:46 UTC (permalink / raw)
  To: eadavis, sashiko-bot
  Cc: eadavis, jiayuan.chen, sashiko-reviews, andrii, ast, bpf, daniel,
	eddyz87, emil, jolsa, linux-kernel, martin.lau, memxor, netdev,
	song, syzkaller-bugs, yonghong.song, ast, andrii, daniel,
	martin.lau, eddyz87, yonghong.song, clm, ihor.solodrai
In-Reply-To: <tencent_5E4F8F8DADA5DE0DAD924ADD835DD7991707@qq.com>

[-- Attachment #1: Type: text/plain, Size: 3946 bytes --]

> bpf: Fix smp_processor_id() call trace for preemptible kernels
>
> bpf_mem_cache_free_rcu() may be called in preemptible context, which
> will trigger the warning message below:
>
> BUG: using smp_processor_id() in preemptible [00000000] code: syz.0.17/5820
> caller is bpf_mem_cache_free_rcu+0x48/0xc0 kernel/bpf/memalloc.c:954
> Call Trace:
>  check_preemption_disabled+0xd3/0xe0 lib/smp_processor_id.c:47
>  bpf_mem_cache_free_rcu+0x48/0xc0 kernel/bpf/memalloc.c:954
>  rhtab_delete_elem+0x185a/0x1b30 kernel/bpf/hashtab.c:2969
>  __rhtab_map_lookup_and_delete_batch+0x935/0xcb0 kernel/bpf/hashtab.c:3349
>  bpf_map_do_batch+0x445/0x630 kernel/bpf/syscall.c:-1
>  __sys_bpf+0x906/0xd90 kernel/bpf/syscall.c:-1
>
> this_cpu_ptr() access needs to be guarded against migration.

The commit message says "guarded against migration" but the fix actually
disables preemption using get_cpu()/put_cpu(). Is this description accurate?

get_cpu() does preempt_disable() + __smp_processor_id(), not just
migrate_disable(). The later sentence "disabling preemption rather than
prohibiting migration" correctly describes the fix, but this earlier line
understates the requirement.

> Wrapping this batch operation in bpf_disable_instrumentation() risks
> blinding BPF tracing globally on the CPU if preemption occurs.
>
> bpf_disable_instrumentation() increments the per-CPU bpf_prog_active counter.
> Because migrate_disable() and rcu_read_lock() do not disable preemption under
> CONFIG_PREEMPT_RCU, the task can be preempted during this potentially long
> loop.
>
> If preempted, bpf_prog_active would remain elevated on that CPU, which could
> cause subsequent tasks scheduled on the same CPU to silently drop BPF tracing
> events (kprobes, tracepoints, perf).

The reasoning about bpf_disable_instrumentation() is sound, but could the
commit message clarify that rhtab_delete_elem() already calls
bpf_disable_instrumentation() internally?

Looking at rhtab_delete_elem():

kernel/bpf/hashtab.c:rhtab_delete_elem() {
    ...
    bpf_disable_instrumentation();
    ret = rhashtable_remove_fast(&rhtab->table, &elem->htab_node,
                                  rhtab->rhtable_params);
    bpf_enable_instrumentation();
    ...
}

This means the same "blinding BPF tracing" window already exists in the
per-element path. The argument is against a longer-held instrumentation
disable across the entire loop, not a categorical concern.

> Therefore, we fix this by disabling preemption rather than prohibiting
> migration.
>
> Fixes: 5af6807bdb10 ("bpf: Introduce bpf_mem_free_rcu() similar to kfree_rcu().")

Should the Fixes tag point to a different commit? The bug occurs when
__rhtab_map_lookup_and_delete_batch() calls rhtab_delete_elem() which calls
bpf_mem_cache_free_rcu() in preemptible context.

Commit 5af6807bdb10 introduced bpf_mem_free_rcu() in 2023 but did not modify
kernel/bpf/hashtab.c or introduce the rhtab code. The call chain was
introduced by:

  Fixes: 6905f8601298 ("bpf: Allow special fields in resizable hashtab")

which added the rhtab implementation including __rhtab_map_lookup_and_delete_batch()
and the call to bpf_mem_cache_free_rcu() in rhtab_delete_elem().

> Reported-by: syzbot+fd7e415d891073b83e1f@syzkaller.appspotmail.com
> Closes: https://syzkaller.appspot.com/bug?extid=fd7e415d891073b83e1f
> Signed-off-by: Edward Adam Davis <eadavis@qq.com>

[ ... ]

> diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
> --- a/kernel/bpf/hashtab.c
> +++ b/kernel/bpf/hashtab.c
> @@ -3345,8 +3345,10 @@
>  	}
>
>  	if (do_delete) {
> +		get_cpu();
>  		for (i = 0; i < total; i++)
>  			rhtab_delete_elem(rhtab, del_elems[i], NULL, 0);
> +		put_cpu();
>  	}
>
>  	rcu_read_unlock();

---
AI reviewed your patch. Please fix the bug or email reply why it's not a bug.
See: https://github.com/kernel-patches/vmtest/blob/master/ci/claude/README.md

CI run summary: https://github.com/kernel-patches/bpf/actions/runs/28451219226

^ permalink raw reply

* [PATCH net] net:ethernet: mtk_wed: setup WDMA_RING_TX(0) for non-DBDC MT7986
From: Zhi-Jun You @ 2026-06-30 14:48 UTC (permalink / raw)
  To: lorenzo; +Cc: nbd, netdev, linux-mediatek, Zhi-Jun You

WDMA_RING_TX(0) is required to set MTK_WED_WDMA_RING_TX for WED RX
but on a non-DBDC MT7986 it is never set up because idx is 1.

Setting MTK_WED_WDMA_RING_TX with WDMA_RING_TX(1) is not feasible because
WED still tries to send through WDMA_RING_TX(0). This is verified with
register dump.

Fix this by calling mtk_wed_wdma_tx_ring_setup if WDMA_RING_TX(0) is not
set up and guard it with mtk_wed_is_v2.

Fixes: 4c5de09eb0d0 ("net: ethernet: mtk_wed: add configure wed wo support")
Signed-off-by: Zhi-Jun You <hujy652@gmail.com>
---
 drivers/net/ethernet/mediatek/mtk_wed.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/mediatek/mtk_wed.c b/drivers/net/ethernet/mediatek/mtk_wed.c
index 10d9beaae372..2420557f6c96 100644
--- a/drivers/net/ethernet/mediatek/mtk_wed.c
+++ b/drivers/net/ethernet/mediatek/mtk_wed.c
@@ -2334,6 +2334,10 @@ mtk_wed_start(struct mtk_wed_device *dev, u32 irq_mask)
 		if (!dev->rx_wdma[i].desc)
 			mtk_wed_wdma_rx_ring_setup(dev, i, 16, false);
 
+	if (mtk_wed_is_v2(dev->hw))
+		if (!dev->tx_wdma[0].desc)
+			mtk_wed_wdma_tx_ring_setup(dev, 0, MTK_WED_WDMA_RING_SIZE, false);
+
 	if (dev->wlan.hw_rro) {
 		for (i = 0; i < MTK_WED_RX_PAGE_QUEUES; i++) {
 			u32 addr = MTK_WED_RRO_MSDU_PG_CTRL0(i) +
-- 
2.47.3


^ permalink raw reply related

* Re: [ANNOUNCEMENT] LPC 2026: System Monitoring and Observability Microconference
From: Jason Xing @ 2026-06-30 14:49 UTC (permalink / raw)
  To: Breno Leitao
  Cc: linux-acpi, linux-hwmon, netdev, linux-kernel, linux-arm-kernel,
	kernel-team, linux-mm, iipeace5, gavinguo, linux, amscanne, sj,
	gpiccoli, Daniel Gomez, mfo, platform-driver-x86, acpica-devel
In-Reply-To: <aj5KJEIsTl6IK0nX@gmail.com>

On Fri, Jun 26, 2026 at 5:56 PM Breno Leitao <leitao@debian.org> wrote:
>
> We are pleased to announce the Call for Proposals (CFP) for another
> edition of the System Monitoring and Observability Microconference, this
> time at the 2026 Linux Plumbers Conference (LPC), taking place in
> Prague, Czechia, from Oct 5-7, 2026.
>
>   https://lpc.events/event/20/sessions/262/
>
> This microconference provides a valuable forum for key engineering areas
> such as:
>
>    - Kernel Health and Runtime Monitoring
>    - Hardware Integration and Error Detection
>    - Correlation of Issues (crashes, stalls, bugs)
>    - Virtualization Stack Monitoring
>    - Memory Management Monitoring and Observability
>    - Anomaly Detection Algorithms for System Behavior
>    - Automated Analysis, Remediation and post-mortem analyses
>
> The purpose of each talk is to share challenges and discuss potential
> improvements. Sessions will last 20 to 30 minutes and aim to encourage
> brainstorming and open dialogue about ongoing issues rather than
> delivering immediate solutions.
>
> The conference acts as both a knowledge-sharing platform and a strategic
> venue for guiding the future of kernel technologies to better meet the
> demands of large-scale infrastructure.
>
> We invite you to submit your proposals here:
>         https://lpc.events/event/20/abstracts/
>
> Please select track "Linux System Monitoring and Observability MC"

A kind reminder: at the bottom of the page [1], please click the
'Submit new abstract' button so that you will be able to quickly
submit a new proposal. (Admittedly, the entry is a bit easy to miss,
especially for newcomers.)

If you have any questions/suggestions/problems, please do not hesitate
to contact Breno or me.

Again, proposals are greatly appreciated :)

[1]: https://lpc.events/event/20/abstracts/

Thanks,
Jason

^ permalink raw reply

* [PATCH v1 00/18] ibmveth: Add multi-queue RX Support
From: Mingming Cao @ 2026-06-30 14:53 UTC (permalink / raw)
  To: netdev
  Cc: horms, bjking1, haren, ricklind, mmc, kuba, edumazet, pabeni,
	linuxppc-dev, maddy, mpe

Power11 PHYP firmware adds Virtual Ethernet multi-queue (MQ) RX for
the ibmveth device: multiple logical-LAN RX queues, per-queue buffer
posting, and completion delivery. Guest Linux did not use that
platform support; ibmveth still registered one RX queue even when
PHYP was MQ-capable.

This series adds the ibmveth MQ client. When PHYP advertises the
capability through H_ILLAN_ATTRIBUTES, the driver registers
multiple RX queues, receives on per-queue NAPI, and exposes queue
count through ethtool. Older firmware without the bit is unchanged.

ibmveth today registers one logical LAN, one set of buffer pools, and
one NAPI context. PHYP MQ mode gives each RX queue its own handle:
buffers are posted with H_ADD_LOGICAL_LAN_BUFFERS_QUEUE, subordinate
queues register through H_REG_LOGICAL_LAN_QUEUE, and traffic can
land on any active queue. Queue selection is firmware-defined; v1
does not program RSS or hash tables. The driver needs per-queue
pools, IRQs, and poll state to match.

Queue-aware hcalls are selected only when probe sets multi_queue
from H_ILLAN_ATTRIBUTES; legacy firmware keeps the original hcall
path unchanged through the entire series.

This splits the work so review follows the actual bring-up sequence:

 1. Hypercall definitions and MQ data structures (patches 1-3)
 2. Refactor open/close into helpers - RX, per-queue pools,
    IRQ, TX, PHYP (4-10)
 3. Turn on the MQ datapath at probe/open (11)
 4. Per-queue RX/TX stats, get_stats64, and sysfs pool readout
    (12-14)
 5. Runtime RX queue resize via ethtool -L (15-17)
 6. Runtime stability fixes from LPAR testing (18)

- Helper patches (4-10) reshape ibmveth_open()/close() into
queue-aware helpers. Runtime behaviour is unchanged through that
block: num_rx_queues stays 1 and multi_queue is false until patch 11.

- Patch 11 is the switch: probe sets multi_queue from firmware, raises
num_rx_queues, registers subordinates, and replenishes every active
queue.

- Patch 18 fixes poll hangs after aggressive ethtool -L cycling,
NAPI/close deadlocks on ip link down, and preserves probe-time
pool->active across close/open so RX works after link down/up.

Design notes
* Per-queue buffer pools (rx_buff_pool[queue][pool]) - PHYP ties
 posted buffers to a queue handle; a shared pool set does not work.
 Patch 5 also disables the 64 KiB pool at standard MTU to save
 per-queue memory in MQ.
* Legacy mode keeps queue 0 on h_register_logical_lan(); MQ uses
 handles for all queues (subordinates via H_REG_LOGICAL_LAN_QUEUE).
 Close uses H_FREE_LOGICAL_LAN for the whole adapter.
* ethtool -L resizes incrementally while the netdev stays up so
 surviving queues keep PHYP handles, pools, and IRQ state. A
 close/open cycle would drop traffic and force full LAN
 re-registration for every queue.

Tested on ppc64le PowerVM LPAR with MQ-capable firmware:
* MQ path: ethtool -L under iperf3 load, link down/up during traffic
* Legacy firmware (no MQ bit): full open/close/stress on the
 refactored helper path to confirm single-queue behaviour is
 unchanged
* ethtool -L resize while all RX queues are receiving traffic, not
 only a single-flow iperf session
* ip link down/up and ping after reopen (patch 18)

Future work
* IRQ affinity hints for subordinate queue IRQs returned by PHYP
* Summed global no_buffer drop counter across all RX queues in MQ mode

Comments and suggestions on patch split, design, and testing are
welcome.

Mingming Cao <mmc@linux.ibm.com>

Mingming Cao (18):
  ibmveth: Add MQ RX hypercall wrappers and call definitions
  ibmveth: Prepare adapter data structures for MQ RX
  ibmveth: Add MQ-ready RX statistics structures
  ibmveth: Refactor RX resource allocation for MQ RX bring-up
  ibmveth: Refactor buffer pool management for per-queue MQ RX
  ibmveth: Refactor RX interrupt control for MQ RX queues
  ibmveth: Refactor TX resource allocation in open/close paths
  ibmveth: Add RX queue register/deregister helpers for MQ
  ibmveth: Refactor open/close into MQ-ready resource pipeline
  ibmveth: Add queue-aware RX buffer submit helper for MQ
  ibmveth: Enable multi-queue RX receive path
  ibmveth: Add per-queue RX statistics collection and reporting
  ibmveth: Add per-queue TX statistics reporting
  ibmveth: Expose per-queue buffer pool details via sysfs
  ibmveth: Add helpers for incremental MQ RX queue resize
  ibmveth: Implement incremental MQ RX queue resize
  ibmveth: Wire ethtool set_channels to MQ RX queue resize
  ibmveth: Fix MQ RX poll and shutdown hangs after queue resize

 arch/powerpc/include/asm/hvcall.h  |    6 +-
 drivers/net/ethernet/ibm/ibmveth.c | 2451 +++++++++++++++++++++++-----
 drivers/net/ethernet/ibm/ibmveth.h |  226 ++-
 3 files changed, 2284 insertions(+), 399 deletions(-)

-- 
2.39.3 (Apple Git-146)

^ permalink raw reply

* [PATCH v1 01/18] ibmveth: Add MQ RX hypercall wrappers and call definitions
From: Mingming Cao @ 2026-06-30 14:53 UTC (permalink / raw)
  To: netdev
  Cc: horms, bjking1, haren, ricklind, mmc, kuba, edumazet, pabeni,
	linuxppc-dev, maddy, mpe, Dave Marquardt
In-Reply-To: <cover.1782758799.git.mmc@linux.ibm.com>

Single-queue ibmveth only needs h_register_logical_lan() plus legacy
buffer add/free calls. MQ RX uses per-queue handles, so the driver must
also be able to register/deregister subordinate queues and post/free
buffers against a specific queue handle.

Add the PHYP call IDs for:

  H_REG_LOGICAL_LAN_QUEUE
  H_ADD_LOGICAL_LAN_BUFFERS_QUEUE
  H_FREE_LOGICAL_LAN_BUFFER_QUEUE
  H_FREE_LOGICAL_LAN_QUEUE

and add ibmveth.h wrapper helpers (h_reg_logical_lan_queue(),
h_add_logical_lan_buffers_queue(), h_free_logical_lan_buffer_queue(),
h_free_logical_lan_queue()) with argument ordering and return semantics
matching the existing ibmveth hcall wrappers.

This patch is intentionally plumbing only: no runtime behavior change
yet. Legacy firmware keeps H_REGISTER_LOGICAL_LAN and the existing
buffer hcalls. The new wrappers are used only when a later commit sets
multi_queue from H_ILLAN_ATTRIBUTES.

Signed-off-by: Mingming Cao <mmc@linux.ibm.com>
Reviewed-by: Dave Marquardt <davemarq@linux.ibm.com>
---
 arch/powerpc/include/asm/hvcall.h  |   6 +-
 drivers/net/ethernet/ibm/ibmveth.c |   3 +-
 drivers/net/ethernet/ibm/ibmveth.h | 158 +++++++++++++++++++++++++++++
 3 files changed, 165 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h
index dff90a7d7f70..bf2f1b0356c4 100644
--- a/arch/powerpc/include/asm/hvcall.h
+++ b/arch/powerpc/include/asm/hvcall.h
@@ -362,7 +362,11 @@
 #define H_GUEST_DELETE		0x488
 #define H_PKS_WRAP_OBJECT	0x490
 #define H_PKS_UNWRAP_OBJECT	0x494
-#define MAX_HCALL_OPCODE	H_PKS_UNWRAP_OBJECT
+#define H_REG_LOGICAL_LAN_QUEUE 0x49C
+#define H_ADD_LOGICAL_LAN_BUFFERS_QUEUE 0x4A0
+#define H_FREE_LOGICAL_LAN_BUFFER_QUEUE 0x4A4
+#define H_FREE_LOGICAL_LAN_QUEUE 0x4A8
+#define MAX_HCALL_OPCODE	H_FREE_LOGICAL_LAN_QUEUE
 
 /* Scope args for H_SCM_UNBIND_ALL */
 #define H_UNBIND_SCOPE_ALL (0x1)
diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index 73e051d26b9d..af287eeafc0c 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -584,7 +584,8 @@ static int ibmveth_allocate_tx_ltb(struct ibmveth_adapter *adapter, int idx)
 }
 
 static int ibmveth_register_logical_lan(struct ibmveth_adapter *adapter,
-        union ibmveth_buf_desc rxq_desc, u64 mac_address)
+				   union ibmveth_buf_desc rxq_desc,
+				   u64 mac_address)
 {
 	int rc, try_again = 1;
 
diff --git a/drivers/net/ethernet/ibm/ibmveth.h b/drivers/net/ethernet/ibm/ibmveth.h
index d87713668ed3..45cfb0d054e3 100644
--- a/drivers/net/ethernet/ibm/ibmveth.h
+++ b/drivers/net/ethernet/ibm/ibmveth.h
@@ -66,6 +66,164 @@ static inline long h_add_logical_lan_buffers(unsigned long unit_address,
 			    desc5, desc6, desc7, desc8);
 }
 
+/**
+ * h_reg_logical_lan_queue - Register a subordinate receive queue
+ * @unit_address: Device unit address
+ * @buffer_list: DMA address of 4KB page for tracking registered buffers
+ * @rec_queue: Buffer descriptor of receive queue
+ *
+ * Registers a subordinate receive queue with the hypervisor.
+ *
+ * Return:
+ *   H_SUCCESS (0) on success
+ *   H_PARAMETER if parameters are invalid
+ *
+ * On success, hypervisor returns:
+ *   R3: H_SUCCESS
+ *   R4: Queue handle
+ *   R5: IRQ number for this queue
+ */
+static inline long h_reg_logical_lan_queue(unsigned long unit_address,
+					   unsigned long buffer_list,
+					   unsigned long rec_queue,
+					   unsigned long *queue_handle,
+					   unsigned long *irq)
+{
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+	long rc;
+
+	rc = plpar_hcall9(H_REG_LOGICAL_LAN_QUEUE,
+			  retbuf, unit_address,
+			  buffer_list, rec_queue);
+
+	if (rc == H_SUCCESS) {
+		if (queue_handle)
+			*queue_handle = retbuf[0];
+		if (irq)
+			*irq = retbuf[1];
+	}
+
+	return rc;
+}
+
+/**
+ * h_add_logical_lan_buffers_queue - Add buffers to subordinate queue
+ * @unit_address: Device unit address
+ * @queue_handle: Queue handle from h_reg_logical_lan_queue()
+ * @buffersznum: Buffer size (upper 32 bits) | count (lower 32 bits)
+ * @ioba12: Buffer addresses 1 and 2 packed (addr1 | addr2 << 32)
+ * @ioba34: Buffer addresses 3 and 4 packed
+ * @ioba56: Buffer addresses 5 and 6 packed
+ * @ioba78: Buffer addresses 7 and 8 packed
+ * @ioba910: Buffer addresses 9 and 10 packed
+ * @ioba1112: Buffer addresses 11 and 12 packed
+ *
+ * Return:
+ *   H_SUCCESS - All buffers added successfully
+ *   H_PARAMETER - Invalid parameters
+ *   H_HARDWARE - Hardware error
+ */
+static inline long h_add_logical_lan_buffers_queue(unsigned long unit_address,
+						   unsigned long queue_handle,
+						   unsigned long buffersznum,
+						   unsigned long ioba12,
+						   unsigned long ioba34,
+						   unsigned long ioba56,
+						   unsigned long ioba78,
+						   unsigned long ioba910,
+						   unsigned long ioba1112)
+{
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+	return plpar_hcall9(H_ADD_LOGICAL_LAN_BUFFERS_QUEUE,
+			    retbuf, unit_address,
+			    queue_handle, buffersznum,
+			    ioba12, ioba34, ioba56,
+			    ioba78, ioba910, ioba1112);
+}
+
+/**
+ * h_free_logical_lan_buffer_queue - Free buffer from subordinate queue
+ * @unit_address: Device unit address
+ * @buf_size: Size of buffer to remove from pool
+ * @queue_handle: Queue handle from h_reg_logical_lan_queue()
+ *
+ * Removes a buffer of specified size from the subordinate queue's buffer pool.
+ *
+ * Return:
+ *   H_SUCCESS - Buffer removed successfully
+ *   H_PARAMETER - Invalid parameters
+ *   H_HARDWARE - Hardware error
+ *   H_NOT_FOUND - Buffer pool does not exist
+ */
+static inline long h_free_logical_lan_buffer_queue(unsigned long unit_address,
+						   unsigned long buf_size,
+						   unsigned long queue_handle)
+{
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+	return plpar_hcall9(H_FREE_LOGICAL_LAN_BUFFER_QUEUE,
+			    retbuf, unit_address, buf_size, queue_handle);
+}
+
+/**
+ * h_free_logical_lan_queue - Deregister subordinate receive queue
+ * @unit_address: Device unit address
+ * @queue_handle: Queue handle from h_reg_logical_lan_queue()
+ *
+ * Deregisters and frees all structures associated with the subordinate queue.
+ *
+ * Return:
+ *   H_SUCCESS - Queue freed successfully
+ *   H_PARAMETER - Invalid parameters
+ *   H_HARDWARE - Hardware error
+ *   H_STATE - VIOA not in valid state
+ *   H_BUSY / H_LONG_BUSY_* - Resource busy, retry
+ */
+static inline long h_free_logical_lan_queue(unsigned long unit_address,
+					    unsigned long queue_handle)
+{
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+
+	return plpar_hcall9(H_FREE_LOGICAL_LAN_QUEUE,
+			    retbuf, unit_address, queue_handle);
+}
+
+/**
+ * h_register_logical_lan_with_handle - Register primary queue and get handle
+ * @unit_address: Device unit address
+ * @buffer_list: DMA address of buffer list
+ * @rec_queue: Buffer descriptor of receive queue
+ * @filter_list: DMA address of filter list
+ * @mac_address: MAC address
+ * @queue_handle: Output parameter for queue handle
+ *
+ * Registers the primary receive queue (queue 0) with the hypervisor and
+ * returns the queue handle. This is needed in multi-queue mode to use
+ * h_add_logical_lan_buffers_queue() for all queues including queue 0.
+ *
+ * Return: H_SUCCESS (0) on success, error code otherwise
+ */
+static inline long h_register_logical_lan_with_handle(unsigned long unit_address,
+						      unsigned long buffer_list,
+						      unsigned long rec_queue,
+						      unsigned long filter_list,
+						      unsigned long mac_address,
+						      u64 *queue_handle)
+{
+	unsigned long retbuf[PLPAR_HCALL9_BUFSIZE];
+	long rc;
+
+	rc = plpar_hcall9(H_REGISTER_LOGICAL_LAN, retbuf,
+			  unit_address, buffer_list, rec_queue,
+			  filter_list, mac_address);
+
+	if (rc == H_SUCCESS && queue_handle)
+		*queue_handle = retbuf[0];
+
+	return rc;
+}
+
 /* FW allows us to send 6 descriptors but we only use one so mark
  * the other 5 as unused (0)
  */
-- 
2.39.3 (Apple Git-146)


^ permalink raw reply related

* [PATCH v1 02/18] ibmveth: Prepare adapter data structures for MQ RX
From: Mingming Cao @ 2026-06-30 14:53 UTC (permalink / raw)
  To: netdev
  Cc: horms, bjking1, haren, ricklind, mmc, kuba, edumazet, pabeni,
	linuxppc-dev, maddy, mpe, Dave Marquardt
In-Reply-To: <cover.1782758799.git.mmc@linux.ibm.com>

MQ RX needs per-queue state for NAPI, queue handles/IRQs, RX rings,
buffer-list DMA mappings, and buffer pools. The current driver stores
most of this as single instances tied to queue 0.

Convert those fields to queue-indexed layouts sized by
IBMVETH_MAX_RX_QUEUES:

  rx_queue[]
  napi[]
  queue_handle[] / queue_irq[]
  buffer_list_addr[] / buffer_list_dma[]
  rx_buff_pool[queue][pool]

and add num_rx_queues to track how many RX queues are active.

This patch keeps behavior unchanged by mechanically switching existing
references to index 0 — e.g. rx_queue[0], rx_buff_pool[0][pool], and
napi[0]. open/poll/close still drive a single RX queue only.

The goal is to make later helper and datapath patches queue-aware
without mixing structural churn and behavior changes in one commit.

Signed-off-by: Mingming Cao <mmc@linux.ibm.com>
Reviewed-by: Dave Marquardt <davemarq@linux.ibm.com>
---
 drivers/net/ethernet/ibm/ibmveth.c | 195 +++++++++++++++--------------
 drivers/net/ethernet/ibm/ibmveth.h |  16 ++-
 2 files changed, 112 insertions(+), 99 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmveth.c b/drivers/net/ethernet/ibm/ibmveth.c
index af287eeafc0c..4f9dbee7477d 100644
--- a/drivers/net/ethernet/ibm/ibmveth.c
+++ b/drivers/net/ethernet/ibm/ibmveth.c
@@ -101,7 +101,7 @@ static struct ibmveth_stat ibmveth_stats[] = {
 /* simple methods of getting data from the current rxq entry */
 static inline u32 ibmveth_rxq_flags(struct ibmveth_adapter *adapter)
 {
-	return be32_to_cpu(adapter->rx_queue.queue_addr[adapter->rx_queue.index].flags_off);
+	return be32_to_cpu(adapter->rx_queue[0].queue_addr[adapter->rx_queue[0].index].flags_off);
 }
 
 static inline int ibmveth_rxq_toggle(struct ibmveth_adapter *adapter)
@@ -112,7 +112,7 @@ static inline int ibmveth_rxq_toggle(struct ibmveth_adapter *adapter)
 
 static inline int ibmveth_rxq_pending_buffer(struct ibmveth_adapter *adapter)
 {
-	return ibmveth_rxq_toggle(adapter) == adapter->rx_queue.toggle;
+	return ibmveth_rxq_toggle(adapter) == adapter->rx_queue[0].toggle;
 }
 
 static inline int ibmveth_rxq_buffer_valid(struct ibmveth_adapter *adapter)
@@ -132,7 +132,7 @@ static inline int ibmveth_rxq_large_packet(struct ibmveth_adapter *adapter)
 
 static inline int ibmveth_rxq_frame_length(struct ibmveth_adapter *adapter)
 {
-	return be32_to_cpu(adapter->rx_queue.queue_addr[adapter->rx_queue.index].length);
+	return be32_to_cpu(adapter->rx_queue[0].queue_addr[adapter->rx_queue[0].index].length);
 }
 
 static inline int ibmveth_rxq_csum_good(struct ibmveth_adapter *adapter)
@@ -386,7 +386,7 @@ static void ibmveth_replenish_buffer_pool(struct ibmveth_adapter *adapter,
  */
 static void ibmveth_update_rx_no_buffer(struct ibmveth_adapter *adapter)
 {
-	__be64 *p = adapter->buffer_list_addr + 4096 - 8;
+	__be64 *p = adapter->buffer_list_addr[0] + 4096 - 8;
 
 	adapter->rx_no_buffer = be64_to_cpup(p);
 }
@@ -399,7 +399,7 @@ static void ibmveth_replenish_task(struct ibmveth_adapter *adapter)
 	adapter->replenish_task_cycles++;
 
 	for (i = (IBMVETH_NUM_BUFF_POOLS - 1); i >= 0; i--) {
-		struct ibmveth_buff_pool *pool = &adapter->rx_buff_pool[i];
+		struct ibmveth_buff_pool *pool = &adapter->rx_buff_pool[0][i];
 
 		if (pool->active &&
 		    (atomic_read(&pool->available) < pool->threshold))
@@ -463,12 +463,12 @@ static int ibmveth_remove_buffer_from_pool(struct ibmveth_adapter *adapter,
 	struct sk_buff *skb;
 
 	if (WARN_ON(pool >= IBMVETH_NUM_BUFF_POOLS) ||
-	    WARN_ON(index >= adapter->rx_buff_pool[pool].size)) {
+	    WARN_ON(index >= adapter->rx_buff_pool[0][pool].size)) {
 		schedule_work(&adapter->work);
 		return -EINVAL;
 	}
 
-	skb = adapter->rx_buff_pool[pool].skbuff[index];
+	skb = adapter->rx_buff_pool[0][pool].skbuff[index];
 	if (WARN_ON(!skb)) {
 		schedule_work(&adapter->work);
 		return -EFAULT;
@@ -482,24 +482,24 @@ static int ibmveth_remove_buffer_from_pool(struct ibmveth_adapter *adapter,
 		/* remove the skb pointer to mark free. actual freeing is done
 		 * by upper level networking after gro_receive
 		 */
-		adapter->rx_buff_pool[pool].skbuff[index] = NULL;
+		adapter->rx_buff_pool[0][pool].skbuff[index] = NULL;
 
 		dma_unmap_single(&adapter->vdev->dev,
-				 adapter->rx_buff_pool[pool].dma_addr[index],
-				 adapter->rx_buff_pool[pool].buff_size,
+				 adapter->rx_buff_pool[0][pool].dma_addr[index],
+				 adapter->rx_buff_pool[0][pool].buff_size,
 				 DMA_FROM_DEVICE);
 	}
 
-	free_index = adapter->rx_buff_pool[pool].producer_index;
-	adapter->rx_buff_pool[pool].producer_index++;
-	if (adapter->rx_buff_pool[pool].producer_index >=
-	    adapter->rx_buff_pool[pool].size)
-		adapter->rx_buff_pool[pool].producer_index = 0;
-	adapter->rx_buff_pool[pool].free_map[free_index] = index;
+	free_index = adapter->rx_buff_pool[0][pool].producer_index;
+	adapter->rx_buff_pool[0][pool].producer_index++;
+	if (adapter->rx_buff_pool[0][pool].producer_index >=
+	    adapter->rx_buff_pool[0][pool].size)
+		adapter->rx_buff_pool[0][pool].producer_index = 0;
+	adapter->rx_buff_pool[0][pool].free_map[free_index] = index;
 
 	mb();
 
-	atomic_dec(&(adapter->rx_buff_pool[pool].available));
+	atomic_dec(&adapter->rx_buff_pool[0][pool].available);
 
 	return 0;
 }
@@ -507,17 +507,17 @@ static int ibmveth_remove_buffer_from_pool(struct ibmveth_adapter *adapter,
 /* get the current buffer on the rx queue */
 static inline struct sk_buff *ibmveth_rxq_get_buffer(struct ibmveth_adapter *adapter)
 {
-	u64 correlator = adapter->rx_queue.queue_addr[adapter->rx_queue.index].correlator;
+	u64 correlator = adapter->rx_queue[0].queue_addr[adapter->rx_queue[0].index].correlator;
 	unsigned int pool = correlator >> 32;
 	unsigned int index = correlator & 0xffffffffUL;
 
 	if (WARN_ON(pool >= IBMVETH_NUM_BUFF_POOLS) ||
-	    WARN_ON(index >= adapter->rx_buff_pool[pool].size)) {
+	    WARN_ON(index >= adapter->rx_buff_pool[0][pool].size)) {
 		schedule_work(&adapter->work);
 		return NULL;
 	}
 
-	return adapter->rx_buff_pool[pool].skbuff[index];
+	return adapter->rx_buff_pool[0][pool].skbuff[index];
 }
 
 /**
@@ -538,14 +538,14 @@ static int ibmveth_rxq_harvest_buffer(struct ibmveth_adapter *adapter,
 	u64 cor;
 	int rc;
 
-	cor = adapter->rx_queue.queue_addr[adapter->rx_queue.index].correlator;
+	cor = adapter->rx_queue[0].queue_addr[adapter->rx_queue[0].index].correlator;
 	rc = ibmveth_remove_buffer_from_pool(adapter, cor, reuse);
 	if (unlikely(rc))
 		return rc;
 
-	if (++adapter->rx_queue.index == adapter->rx_queue.num_slots) {
-		adapter->rx_queue.index = 0;
-		adapter->rx_queue.toggle = !adapter->rx_queue.toggle;
+	if (++adapter->rx_queue[0].index == adapter->rx_queue[0].num_slots) {
+		adapter->rx_queue[0].index = 0;
+		adapter->rx_queue[0].toggle = !adapter->rx_queue[0].toggle;
 	}
 
 	return 0;
@@ -596,7 +596,7 @@ static int ibmveth_register_logical_lan(struct ibmveth_adapter *adapter,
 	 */
 retry:
 	rc = h_register_logical_lan(adapter->vdev->unit_address,
-				    adapter->buffer_list_dma, rxq_desc.desc,
+				    adapter->buffer_list_dma[0], rxq_desc.desc,
 				    adapter->filter_list_dma, mac_address);
 
 	if (rc != H_SUCCESS && try_again) {
@@ -624,14 +624,14 @@ static int ibmveth_open(struct net_device *netdev)
 
 	netdev_dbg(netdev, "open starting\n");
 
-	napi_enable(&adapter->napi);
+	napi_enable(&adapter->napi[0]);
 
 	for(i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++)
-		rxq_entries += adapter->rx_buff_pool[i].size;
+		rxq_entries += adapter->rx_buff_pool[0][i].size;
 
 	rc = -ENOMEM;
-	adapter->buffer_list_addr = (void*) get_zeroed_page(GFP_KERNEL);
-	if (!adapter->buffer_list_addr) {
+	adapter->buffer_list_addr[0] = (void *)get_zeroed_page(GFP_KERNEL);
+	if (!adapter->buffer_list_addr[0]) {
 		netdev_err(netdev, "unable to allocate list pages\n");
 		goto out;
 	}
@@ -644,17 +644,18 @@ static int ibmveth_open(struct net_device *netdev)
 
 	dev = &adapter->vdev->dev;
 
-	adapter->rx_queue.queue_len = sizeof(struct ibmveth_rx_q_entry) *
+	adapter->rx_queue[0].queue_len = sizeof(struct ibmveth_rx_q_entry) *
 						rxq_entries;
-	adapter->rx_queue.queue_addr =
-		dma_alloc_coherent(dev, adapter->rx_queue.queue_len,
-				   &adapter->rx_queue.queue_dma, GFP_KERNEL);
-	if (!adapter->rx_queue.queue_addr)
+	adapter->rx_queue[0].queue_addr =
+		dma_alloc_coherent(dev, adapter->rx_queue[0].queue_len,
+				   &adapter->rx_queue[0].queue_dma, GFP_KERNEL);
+	if (!adapter->rx_queue[0].queue_addr)
 		goto out_free_filter_list;
 
-	adapter->buffer_list_dma = dma_map_single(dev,
-			adapter->buffer_list_addr, 4096, DMA_BIDIRECTIONAL);
-	if (dma_mapping_error(dev, adapter->buffer_list_dma)) {
+	adapter->buffer_list_dma[0] = dma_map_single(dev,
+						     adapter->buffer_list_addr[0],
+						     4096, DMA_BIDIRECTIONAL);
+	if (dma_mapping_error(dev, adapter->buffer_list_dma[0])) {
 		netdev_err(netdev, "unable to map buffer list pages\n");
 		goto out_free_queue_mem;
 	}
@@ -671,19 +672,19 @@ static int ibmveth_open(struct net_device *netdev)
 			goto out_free_tx_ltb;
 	}
 
-	adapter->rx_queue.index = 0;
-	adapter->rx_queue.num_slots = rxq_entries;
-	adapter->rx_queue.toggle = 1;
+	adapter->rx_queue[0].index = 0;
+	adapter->rx_queue[0].num_slots = rxq_entries;
+	adapter->rx_queue[0].toggle = 1;
 
 	mac_address = ether_addr_to_u64(netdev->dev_addr);
 
 	rxq_desc.fields.flags_len = IBMVETH_BUF_VALID |
-					adapter->rx_queue.queue_len;
-	rxq_desc.fields.address = adapter->rx_queue.queue_dma;
+					adapter->rx_queue[0].queue_len;
+	rxq_desc.fields.address = adapter->rx_queue[0].queue_dma;
 
-	netdev_dbg(netdev, "buffer list @ 0x%p\n", adapter->buffer_list_addr);
+	netdev_dbg(netdev, "buffer list @ 0x%p\n", adapter->buffer_list_addr[0]);
 	netdev_dbg(netdev, "filter list @ 0x%p\n", adapter->filter_list_addr);
-	netdev_dbg(netdev, "receive q   @ 0x%p\n", adapter->rx_queue.queue_addr);
+	netdev_dbg(netdev, "receive q   @ 0x%p\n", adapter->rx_queue[0].queue_addr);
 
 	h_vio_signal(adapter->vdev->unit_address, VIO_IRQ_DISABLE);
 
@@ -694,7 +695,7 @@ static int ibmveth_open(struct net_device *netdev)
 			   lpar_rc);
 		netdev_err(netdev, "buffer TCE:0x%llx filter TCE:0x%llx rxq "
 			   "desc:0x%llx MAC:0x%llx\n",
-				     adapter->buffer_list_dma,
+				     adapter->buffer_list_dma[0],
 				     adapter->filter_list_dma,
 				     rxq_desc.desc,
 				     mac_address);
@@ -703,11 +704,11 @@ static int ibmveth_open(struct net_device *netdev)
 	}
 
 	for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) {
-		if (!adapter->rx_buff_pool[i].active)
+		if (!adapter->rx_buff_pool[0][i].active)
 			continue;
-		if (ibmveth_alloc_buffer_pool(&adapter->rx_buff_pool[i])) {
+		if (ibmveth_alloc_buffer_pool(&adapter->rx_buff_pool[0][i])) {
 			netdev_err(netdev, "unable to alloc pool\n");
-			adapter->rx_buff_pool[i].active = 0;
+			adapter->rx_buff_pool[0][i].active = 0;
 			rc = -ENOMEM;
 			goto out_free_buffer_pools;
 		}
@@ -739,9 +740,9 @@ static int ibmveth_open(struct net_device *netdev)
 
 out_free_buffer_pools:
 	while (--i >= 0) {
-		if (adapter->rx_buff_pool[i].active)
+		if (adapter->rx_buff_pool[0][i].active)
 			ibmveth_free_buffer_pool(adapter,
-						 &adapter->rx_buff_pool[i]);
+						 &adapter->rx_buff_pool[0][i]);
 	}
 out_unmap_filter_list:
 	dma_unmap_single(dev, adapter->filter_list_dma, 4096,
@@ -753,18 +754,18 @@ static int ibmveth_open(struct net_device *netdev)
 	}
 
 out_unmap_buffer_list:
-	dma_unmap_single(dev, adapter->buffer_list_dma, 4096,
+	dma_unmap_single(dev, adapter->buffer_list_dma[0], 4096,
 			 DMA_BIDIRECTIONAL);
 out_free_queue_mem:
-	dma_free_coherent(dev, adapter->rx_queue.queue_len,
-			  adapter->rx_queue.queue_addr,
-			  adapter->rx_queue.queue_dma);
+	dma_free_coherent(dev, adapter->rx_queue[0].queue_len,
+			  adapter->rx_queue[0].queue_addr,
+			  adapter->rx_queue[0].queue_dma);
 out_free_filter_list:
 	free_page((unsigned long)adapter->filter_list_addr);
 out_free_buffer_list:
-	free_page((unsigned long)adapter->buffer_list_addr);
+	free_page((unsigned long)adapter->buffer_list_addr[0]);
 out:
-	napi_disable(&adapter->napi);
+	napi_disable(&adapter->napi[0]);
 	return rc;
 }
 
@@ -777,7 +778,7 @@ static int ibmveth_close(struct net_device *netdev)
 
 	netdev_dbg(netdev, "close starting\n");
 
-	napi_disable(&adapter->napi);
+	napi_disable(&adapter->napi[0]);
 
 	netif_tx_stop_all_queues(netdev);
 
@@ -796,22 +797,22 @@ static int ibmveth_close(struct net_device *netdev)
 
 	ibmveth_update_rx_no_buffer(adapter);
 
-	dma_unmap_single(dev, adapter->buffer_list_dma, 4096,
+	dma_unmap_single(dev, adapter->buffer_list_dma[0], 4096,
 			 DMA_BIDIRECTIONAL);
-	free_page((unsigned long)adapter->buffer_list_addr);
+	free_page((unsigned long)adapter->buffer_list_addr[0]);
 
 	dma_unmap_single(dev, adapter->filter_list_dma, 4096,
 			 DMA_BIDIRECTIONAL);
 	free_page((unsigned long)adapter->filter_list_addr);
 
-	dma_free_coherent(dev, adapter->rx_queue.queue_len,
-			  adapter->rx_queue.queue_addr,
-			  adapter->rx_queue.queue_dma);
+	dma_free_coherent(dev, adapter->rx_queue[0].queue_len,
+			  adapter->rx_queue[0].queue_addr,
+			  adapter->rx_queue[0].queue_dma);
 
 	for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++)
-		if (adapter->rx_buff_pool[i].active)
+		if (adapter->rx_buff_pool[0][i].active)
 			ibmveth_free_buffer_pool(adapter,
-						 &adapter->rx_buff_pool[i]);
+						 &adapter->rx_buff_pool[0][i]);
 
 	for (i = 0; i < netdev->real_num_tx_queues; i++)
 		ibmveth_free_tx_ltb(adapter, i);
@@ -1449,7 +1450,7 @@ static void ibmveth_rx_csum_helper(struct sk_buff *skb,
 static int ibmveth_poll(struct napi_struct *napi, int budget)
 {
 	struct ibmveth_adapter *adapter =
-			container_of(napi, struct ibmveth_adapter, napi);
+			container_of(napi, struct ibmveth_adapter, napi[0]);
 	struct net_device *netdev = adapter->netdev;
 	int frames_processed = 0;
 	unsigned long lpar_rc;
@@ -1574,11 +1575,11 @@ static irqreturn_t ibmveth_interrupt(int irq, void *dev_instance)
 	struct ibmveth_adapter *adapter = netdev_priv(netdev);
 	unsigned long lpar_rc;
 
-	if (napi_schedule_prep(&adapter->napi)) {
+	if (napi_schedule_prep(&adapter->napi[0])) {
 		lpar_rc = h_vio_signal(adapter->vdev->unit_address,
 				       VIO_IRQ_DISABLE);
 		WARN_ON(lpar_rc != H_SUCCESS);
-		__napi_schedule(&adapter->napi);
+		__napi_schedule(&adapter->napi[0]);
 	}
 	return IRQ_HANDLED;
 }
@@ -1646,7 +1647,7 @@ static int ibmveth_change_mtu(struct net_device *dev, int new_mtu)
 	int need_restart = 0;
 
 	for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++)
-		if (new_mtu_oh <= adapter->rx_buff_pool[i].buff_size)
+		if (new_mtu_oh <= adapter->rx_buff_pool[0][i].buff_size)
 			break;
 
 	if (i == IBMVETH_NUM_BUFF_POOLS)
@@ -1661,9 +1662,9 @@ static int ibmveth_change_mtu(struct net_device *dev, int new_mtu)
 
 	/* Look for an active buffer pool that can hold the new MTU */
 	for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) {
-		adapter->rx_buff_pool[i].active = 1;
+		adapter->rx_buff_pool[0][i].active = 1;
 
-		if (new_mtu_oh <= adapter->rx_buff_pool[i].buff_size) {
+		if (new_mtu_oh <= adapter->rx_buff_pool[0][i].buff_size) {
 			WRITE_ONCE(dev->mtu, new_mtu);
 			vio_cmo_set_dev_desired(viodev,
 						ibmveth_get_desired_dma
@@ -1721,12 +1722,12 @@ static unsigned long ibmveth_get_desired_dma(struct vio_dev *vdev)
 
 	for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) {
 		/* add the size of the active receive buffers */
-		if (adapter->rx_buff_pool[i].active)
+		if (adapter->rx_buff_pool[0][i].active)
 			ret +=
-			    adapter->rx_buff_pool[i].size *
-			    IOMMU_PAGE_ALIGN(adapter->rx_buff_pool[i].
+			    adapter->rx_buff_pool[0][i].size *
+			    IOMMU_PAGE_ALIGN(adapter->rx_buff_pool[0][i].
 					     buff_size, tbl);
-		rxqentries += adapter->rx_buff_pool[i].size;
+		rxqentries += adapter->rx_buff_pool[0][i].size;
 	}
 	/* add the size of the receive queue entries */
 	ret += IOMMU_PAGE_ALIGN(
@@ -1845,7 +1846,7 @@ static int ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id)
 	adapter->mcastFilterSize = be32_to_cpu(*mcastFilterSize_p);
 	ibmveth_init_link_settings(netdev);
 
-	netif_napi_add_weight(netdev, &adapter->napi, ibmveth_poll, 16);
+	netif_napi_add_weight(netdev, &adapter->napi[0], ibmveth_poll, 16);
 
 	netdev->irq = dev->irq;
 	netdev->netdev_ops = &ibmveth_netdev_ops;
@@ -1877,6 +1878,10 @@ static int ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id)
 		netdev->features |= NETIF_F_FRAGLIST;
 	}
 
+	/* Initialize queue count - always 1 for now */
+	adapter->multi_queue = 0;
+	adapter->num_rx_queues = 1;
+
 	if (ret == H_SUCCESS &&
 	    (ret_attr & IBMVETH_ILLAN_RX_MULTI_BUFF_SUPPORT)) {
 		adapter->rx_buffers_per_hcall = IBMVETH_MAX_RX_PER_HCALL;
@@ -1899,10 +1904,10 @@ static int ibmveth_probe(struct vio_dev *dev, const struct vio_device_id *id)
 		memcpy(pool_count, pool_count_cmo, sizeof(pool_count));
 
 	for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) {
-		struct kobject *kobj = &adapter->rx_buff_pool[i].kobj;
+		struct kobject *kobj = &adapter->rx_buff_pool[0][i].kobj;
 		int error;
 
-		ibmveth_init_buffer_pool(&adapter->rx_buff_pool[i], i,
+		ibmveth_init_buffer_pool(&adapter->rx_buff_pool[0][i], i,
 					 pool_count[i], pool_size[i],
 					 pool_active[i]);
 		error = kobject_init_and_add(kobj, &ktype_veth_pool,
@@ -1950,7 +1955,7 @@ static void ibmveth_remove(struct vio_dev *dev)
 	cancel_work_sync(&adapter->work);
 
 	for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++)
-		kobject_put(&adapter->rx_buff_pool[i].kobj);
+		kobject_put(&adapter->rx_buff_pool[0][i].kobj);
 
 	unregister_netdev(netdev);
 
@@ -2036,11 +2041,11 @@ static ssize_t veth_pool_store(struct kobject *kobj, struct attribute *attr,
 			/* Make sure there is a buffer pool with buffers that
 			   can hold a packet of the size of the MTU */
 			for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) {
-				if (pool == &adapter->rx_buff_pool[i])
+				if (pool == &adapter->rx_buff_pool[0][i])
 					continue;
-				if (!adapter->rx_buff_pool[i].active)
+				if (!adapter->rx_buff_pool[0][i].active)
 					continue;
-				if (mtu <= adapter->rx_buff_pool[i].buff_size)
+				if (mtu <= adapter->rx_buff_pool[0][i].buff_size)
 					break;
 			}
 
@@ -2214,11 +2219,11 @@ static void ibmveth_remove_buffer_from_pool_test(struct kunit *test)
 
 	/* Set sane values for buffer pools */
 	for (int i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++)
-		ibmveth_init_buffer_pool(&adapter->rx_buff_pool[i], i,
+		ibmveth_init_buffer_pool(&adapter->rx_buff_pool[0][i], i,
 					 pool_count[i], pool_size[i],
 					 pool_active[i]);
 
-	pool = &adapter->rx_buff_pool[0];
+	pool = &adapter->rx_buff_pool[0][0];
 	pool->skbuff = kunit_kcalloc(test, pool->size, sizeof(void *), GFP_KERNEL);
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pool->skbuff);
 
@@ -2226,7 +2231,7 @@ static void ibmveth_remove_buffer_from_pool_test(struct kunit *test)
 	KUNIT_EXPECT_EQ(test, -EINVAL, ibmveth_remove_buffer_from_pool(adapter, correlator, false));
 	KUNIT_EXPECT_EQ(test, -EINVAL, ibmveth_remove_buffer_from_pool(adapter, correlator, true));
 
-	correlator = ((u64)0 << 32) | adapter->rx_buff_pool[0].size;
+	correlator = ((u64)0 << 32) | adapter->rx_buff_pool[0][0].size;
 	KUNIT_EXPECT_EQ(test, -EINVAL, ibmveth_remove_buffer_from_pool(adapter, correlator, false));
 	KUNIT_EXPECT_EQ(test, -EINVAL, ibmveth_remove_buffer_from_pool(adapter, correlator, true));
 
@@ -2259,30 +2264,32 @@ static void ibmveth_rxq_get_buffer_test(struct kunit *test)
 
 	INIT_WORK(&adapter->work, ibmveth_reset_kunit);
 
-	adapter->rx_queue.queue_len = 1;
-	adapter->rx_queue.index = 0;
-	adapter->rx_queue.queue_addr = kunit_kzalloc(test, sizeof(struct ibmveth_rx_q_entry),
-						     GFP_KERNEL);
-	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, adapter->rx_queue.queue_addr);
+	adapter->rx_queue[0].queue_len = 1;
+	adapter->rx_queue[0].index = 0;
+	adapter->rx_queue[0].queue_addr =
+		kunit_kzalloc(test, sizeof(struct ibmveth_rx_q_entry),
+			      GFP_KERNEL);
+	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, adapter->rx_queue[0].queue_addr);
 
 	/* Set sane values for buffer pools */
 	for (int i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++)
-		ibmveth_init_buffer_pool(&adapter->rx_buff_pool[i], i,
+		ibmveth_init_buffer_pool(&adapter->rx_buff_pool[0][i], i,
 					 pool_count[i], pool_size[i],
 					 pool_active[i]);
 
-	pool = &adapter->rx_buff_pool[0];
+	pool = &adapter->rx_buff_pool[0][0];
 	pool->skbuff = kunit_kcalloc(test, pool->size, sizeof(void *), GFP_KERNEL);
 	KUNIT_ASSERT_NOT_ERR_OR_NULL(test, pool->skbuff);
 
-	adapter->rx_queue.queue_addr[0].correlator = (u64)IBMVETH_NUM_BUFF_POOLS << 32 | 0;
+	adapter->rx_queue[0].queue_addr[0].correlator = (u64)IBMVETH_NUM_BUFF_POOLS << 32 | 0;
 	KUNIT_EXPECT_PTR_EQ(test, NULL, ibmveth_rxq_get_buffer(adapter));
 
-	adapter->rx_queue.queue_addr[0].correlator = (u64)0 << 32 | adapter->rx_buff_pool[0].size;
+	adapter->rx_queue[0].queue_addr[0].correlator =
+		(u64)0 << 32 | adapter->rx_buff_pool[0][0].size;
 	KUNIT_EXPECT_PTR_EQ(test, NULL, ibmveth_rxq_get_buffer(adapter));
 
 	pool->skbuff[0] = skb;
-	adapter->rx_queue.queue_addr[0].correlator = (u64)0 << 32 | 0;
+	adapter->rx_queue[0].queue_addr[0].correlator = (u64)0 << 32 | 0;
 	KUNIT_EXPECT_PTR_EQ(test, skb, ibmveth_rxq_get_buffer(adapter));
 
 	flush_work(&adapter->work);
diff --git a/drivers/net/ethernet/ibm/ibmveth.h b/drivers/net/ethernet/ibm/ibmveth.h
index 45cfb0d054e3..b17894695c2e 100644
--- a/drivers/net/ethernet/ibm/ibmveth.h
+++ b/drivers/net/ethernet/ibm/ibmveth.h
@@ -279,6 +279,8 @@ static inline long h_illan_attributes(unsigned long unit_address,
 #define IBMVETH_MAX_TX_BUF_SIZE (1024 * 64)
 #define IBMVETH_MAX_QUEUES 16U
 #define IBMVETH_DEFAULT_QUEUES 8U
+#define IBMVETH_MAX_RX_QUEUES 1U
+#define IBMVETH_DEFAULT_RX_QUEUES 1U
 #define IBMVETH_MAX_RX_PER_HCALL 8U
 
 static int pool_size[] = { 512, 1024 * 2, 1024 * 16, 1024 * 32, 1024 * 64 };
@@ -315,18 +317,22 @@ struct ibmveth_rx_q {
 struct ibmveth_adapter {
 	struct vio_dev *vdev;
 	struct net_device *netdev;
-	struct napi_struct napi;
+	struct napi_struct napi[IBMVETH_MAX_RX_QUEUES];
 	struct work_struct work;
 	unsigned int mcastFilterSize;
-	void *buffer_list_addr;
+	void *buffer_list_addr[IBMVETH_MAX_RX_QUEUES];
 	void *filter_list_addr;
 	void *tx_ltb_ptr[IBMVETH_MAX_QUEUES];
 	unsigned int tx_ltb_size;
 	dma_addr_t tx_ltb_dma[IBMVETH_MAX_QUEUES];
-	dma_addr_t buffer_list_dma;
+	dma_addr_t buffer_list_dma[IBMVETH_MAX_RX_QUEUES];
 	dma_addr_t filter_list_dma;
-	struct ibmveth_buff_pool rx_buff_pool[IBMVETH_NUM_BUFF_POOLS];
-	struct ibmveth_rx_q rx_queue;
+	struct ibmveth_buff_pool rx_buff_pool[IBMVETH_MAX_RX_QUEUES][IBMVETH_NUM_BUFF_POOLS];
+	struct ibmveth_rx_q rx_queue[IBMVETH_MAX_RX_QUEUES];
+	u64 queue_handle[IBMVETH_MAX_RX_QUEUES];
+	unsigned int queue_irq[IBMVETH_MAX_RX_QUEUES];
+	int multi_queue;
+	unsigned int num_rx_queues;
 	int rx_csum;
 	int large_send;
 	bool is_active_trunk;
-- 
2.39.3 (Apple Git-146)


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox