public inbox for stable@vger.kernel.org
 help / color / mirror / Atom feed
* [PATCH net v2 01/10] Revert "net/sched: Restrict conditions for adding duplicating netems to qdisc tree"
       [not found] <20260315001649.23931-1-stephen@networkplumber.org>
@ 2026-03-15  0:14 ` Stephen Hemminger
  2026-03-15  0:14 ` [PATCH net v2 02/10] net/sched: netem: add per-CPU recursion guard for duplication Stephen Hemminger
  2026-03-15  0:14 ` [PATCH net v2 04/10] net/sched: netem: restructure dequeue to avoid re-entrancy with child qdisc Stephen Hemminger
  2 siblings, 0 replies; 3+ messages in thread
From: Stephen Hemminger @ 2026-03-15  0:14 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, Ji-Soo Chung, Gerlinde, stable, Cong Wang

This reverts commit ec8e0e3d7adef940cdf9475e2352c0680189d14e.

The restriction breaks valid uses of netem such as using different
netem values on different branches of HTB. This even broke some
of the examples in the netem documentation.

The intent of blocking recursion is handled in the next patch.

Fixes: ec8e0e3d7adef ("net/sched: Restrict conditions for adding duplicating netems to qdisc tree")
Reported-by: Ji-Soo Chung <jschung2@proton.me>
Reported-by: Gerlinde <lrGerlinde@mailfence.com>
Link: https://bugzilla.kernel.org/show_bug.cgi?id=220774
Cc: stable@vger.kernel.org
Originally-by: Cong Wang <xiyou.wangcong@gmail.com>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 net/sched/sch_netem.c | 40 ----------------------------------------
 1 file changed, 40 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 5de1c932944a..0ccf74a9cb82 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -974,41 +974,6 @@ static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 	return 0;
 }
 
-static const struct Qdisc_class_ops netem_class_ops;
-
-static int check_netem_in_tree(struct Qdisc *sch, bool duplicates,
-			       struct netlink_ext_ack *extack)
-{
-	struct Qdisc *root, *q;
-	unsigned int i;
-
-	root = qdisc_root_sleeping(sch);
-
-	if (sch != root && root->ops->cl_ops == &netem_class_ops) {
-		if (duplicates ||
-		    ((struct netem_sched_data *)qdisc_priv(root))->duplicate)
-			goto err;
-	}
-
-	if (!qdisc_dev(root))
-		return 0;
-
-	hash_for_each(qdisc_dev(root)->qdisc_hash, i, q, hash) {
-		if (sch != q && q->ops->cl_ops == &netem_class_ops) {
-			if (duplicates ||
-			    ((struct netem_sched_data *)qdisc_priv(q))->duplicate)
-				goto err;
-		}
-	}
-
-	return 0;
-
-err:
-	NL_SET_ERR_MSG(extack,
-		       "netem: cannot mix duplicating netems with other netems in tree");
-	return -EINVAL;
-}
-
 /* Parse netlink message to set options */
 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 			struct netlink_ext_ack *extack)
@@ -1067,11 +1032,6 @@ static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 	q->gap = qopt->gap;
 	q->counter = 0;
 	q->loss = qopt->loss;
-
-	ret = check_netem_in_tree(sch, qopt->duplicate, extack);
-	if (ret)
-		goto unlock;
-
 	q->duplicate = qopt->duplicate;
 
 	/* for compatibility with earlier versions.
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH net v2 02/10] net/sched: netem: add per-CPU recursion guard for duplication
       [not found] <20260315001649.23931-1-stephen@networkplumber.org>
  2026-03-15  0:14 ` [PATCH net v2 01/10] Revert "net/sched: Restrict conditions for adding duplicating netems to qdisc tree" Stephen Hemminger
@ 2026-03-15  0:14 ` Stephen Hemminger
  2026-03-15  0:14 ` [PATCH net v2 04/10] net/sched: netem: restructure dequeue to avoid re-entrancy with child qdisc Stephen Hemminger
  2 siblings, 0 replies; 3+ messages in thread
From: Stephen Hemminger @ 2026-03-15  0:14 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, stable, William Liu, Savino Dicanosa

Add a per-CPU recursion depth counter to netem_enqueue(). When netem
duplicates a packet, the clone is re-enqueued at the root qdisc. If
the tree contains other netem instances, this can recurse without
bound, causing soft lockups and OOM.

This approach was previously considered but rejected on the grounds
that netem_dequeue calling enqueue on a child netem could bypass the
depth check. That concern does not apply: the child netem's
netem_enqueue() increments the same per-CPU counter, so the total
nesting depth across all netem instances in the call chain is tracked
correctly.

A depth limit of 4 is generous for any legitimate configuration.

Fixes: 0afb51e72855 ("[PKT_SCHED]: netem: reinsert for duplication")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=220774
Cc: stable@vger.kernel.org
Reported-by: William Liu <will@willsroot.io>
Reported-by: Savino Dicanosa <savy@syst3mfailure.io>
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 net/sched/sch_netem.c | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 0ccf74a9cb82..085fa3ad6f83 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -21,6 +21,7 @@
 #include <linux/rtnetlink.h>
 #include <linux/reciprocal_div.h>
 #include <linux/rbtree.h>
+#include <linux/percpu.h>
 
 #include <net/gso.h>
 #include <net/netlink.h>
@@ -29,6 +30,15 @@
 
 #define VERSION "1.3"
 
+/*
+ * Limit for recursion from duplication.
+ * Duplicated packets are re-enqueued at the root qdisc, which may
+ * reach this or another netem instance, causing nested calls to
+ * netem_enqueue(). This per-CPU counter limits the total depth.
+ */
+static DEFINE_PER_CPU(unsigned int, netem_enqueue_depth);
+#define NETEM_RECURSION_LIMIT	4
+
 /*	Network Emulation Queuing algorithm.
 	====================================
 
@@ -460,6 +470,14 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	/* Do not fool qdisc_drop_all() */
 	skb->prev = NULL;
 
+	/* Guard against recursion from duplication re-injection. */
+	if (unlikely(this_cpu_inc_return(netem_enqueue_depth) >
+		     NETEM_RECURSION_LIMIT)) {
+		this_cpu_dec(netem_enqueue_depth);
+		qdisc_drop(skb, sch, to_free);
+		return NET_XMIT_DROP;
+	}
+
 	/* Random duplication */
 	if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor, &q->prng))
 		++count;
@@ -474,6 +492,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (count == 0) {
 		qdisc_qstats_drop(sch);
 		__qdisc_drop(skb, to_free);
+		this_cpu_dec(netem_enqueue_depth);
 		return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 	}
 
@@ -529,6 +548,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		qdisc_drop_all(skb, sch, to_free);
 		if (skb2)
 			__qdisc_drop(skb2, to_free);
+		this_cpu_dec(netem_enqueue_depth);
 		return NET_XMIT_DROP;
 	}
 
@@ -643,8 +663,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		/* Parent qdiscs accounted for 1 skb of size @prev_len */
 		qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
 	} else if (!skb) {
+		this_cpu_dec(netem_enqueue_depth);
 		return NET_XMIT_DROP;
 	}
+	this_cpu_dec(netem_enqueue_depth);
 	return NET_XMIT_SUCCESS;
 }
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* [PATCH net v2 04/10] net/sched: netem: restructure dequeue to avoid re-entrancy with child qdisc
       [not found] <20260315001649.23931-1-stephen@networkplumber.org>
  2026-03-15  0:14 ` [PATCH net v2 01/10] Revert "net/sched: Restrict conditions for adding duplicating netems to qdisc tree" Stephen Hemminger
  2026-03-15  0:14 ` [PATCH net v2 02/10] net/sched: netem: add per-CPU recursion guard for duplication Stephen Hemminger
@ 2026-03-15  0:14 ` Stephen Hemminger
  2 siblings, 0 replies; 3+ messages in thread
From: Stephen Hemminger @ 2026-03-15  0:14 UTC (permalink / raw)
  To: netdev; +Cc: Stephen Hemminger, stable

netem_dequeue() currently enqueues time-ready packets into the child
qdisc during the dequeue call path. This creates several problems:

1. Parent qdiscs like HFSC track class active/inactive state based on
   qlen transitions. The child enqueue during netem's dequeue can cause
   qlen to increase while the parent is mid-dequeue, leading to
   double-insertion in HFSC's eltree (CVE-2025-37890, CVE-2025-38001).

2. If the child qdisc is non-work-conserving (e.g., TBF), it may refuse
   to release packets during its dequeue even though they were just
   enqueued. The parent then sees netem returning NULL despite having
   backlog, violating the work-conserving contract and causing stalls
   with parents like DRR that deactivate classes in this case.

Restructure netem_dequeue so that when a child qdisc is present, all
time-ready packets are transferred from the tfifo to the child in a
batch before asking the child for output. This ensures the child only
receives packets whose delay has already elapsed. The no-child path
(tfifo direct dequeue) is unchanged.

Fixes: 50612537e9ab ("netem: fix classful handling")
Cc: stable@vger.kernel.org
Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
---
 net/sched/sch_netem.c | 79 +++++++++++++++++++++++++++++--------------
 1 file changed, 54 insertions(+), 25 deletions(-)

diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 085fa3ad6f83..7488ff9f2933 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -726,7 +726,6 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 	struct netem_sched_data *q = qdisc_priv(sch);
 	struct sk_buff *skb;
 
-tfifo_dequeue:
 	skb = __qdisc_dequeue_head(&sch->q);
 	if (skb) {
 deliver:
@@ -734,24 +733,28 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 		qdisc_bstats_update(sch, skb);
 		return skb;
 	}
-	skb = netem_peek(q);
-	if (skb) {
-		u64 time_to_send;
+
+	/* If we have a child qdisc, transfer all time-ready packets
+	 * from the tfifo into the child, then dequeue from the child.
+	 * This avoids enqueueing into the child during the parent's
+	 * dequeue callback, which can confuse parents that track
+	 * active/inactive state based on qlen transitions (HFSC).
+	 */
+	if (q->qdisc) {
 		u64 now = ktime_get_ns();
 
-		/* if more time remaining? */
-		time_to_send = netem_skb_cb(skb)->time_to_send;
-		if (q->slot.slot_next && q->slot.slot_next < time_to_send)
-			get_slot_next(q, now);
+		while ((skb = netem_peek(q)) != NULL) {
+			u64 t = netem_skb_cb(skb)->time_to_send;
+
+			if (t > now)
+				break;
+			if (q->slot.slot_next && q->slot.slot_next > now)
+				break;
 
-		if (time_to_send <= now && q->slot.slot_next <= now) {
 			netem_erase_head(q, skb);
 			q->t_len--;
 			skb->next = NULL;
 			skb->prev = NULL;
-			/* skb->dev shares skb->rbnode area,
-			 * we need to restore its value.
-			 */
 			skb->dev = qdisc_dev(sch);
 
 			if (q->slot.slot_next) {
@@ -762,7 +765,7 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 					get_slot_next(q, now);
 			}
 
-			if (q->qdisc) {
+			{
 				unsigned int pkt_len = qdisc_pkt_len(skb);
 				struct sk_buff *to_free = NULL;
 				int err;
@@ -776,32 +779,58 @@ static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 					sch->q.qlen--;
 					qdisc_tree_reduce_backlog(sch, 1, pkt_len);
 				}
-				goto tfifo_dequeue;
 			}
+		}
+
+		skb = q->qdisc->ops->dequeue(q->qdisc);
+		if (skb) {
 			sch->q.qlen--;
 			goto deliver;
 		}
-
-		if (q->qdisc) {
-			skb = q->qdisc->ops->dequeue(q->qdisc);
-			if (skb) {
+	} else {
+		/* No child qdisc: dequeue directly from tfifo */
+		skb = netem_peek(q);
+		if (skb) {
+			u64 time_to_send;
+			u64 now = ktime_get_ns();
+
+			time_to_send = netem_skb_cb(skb)->time_to_send;
+			if (q->slot.slot_next &&
+			    q->slot.slot_next < time_to_send)
+				get_slot_next(q, now);
+
+			if (time_to_send <= now &&
+			    q->slot.slot_next <= now) {
+				netem_erase_head(q, skb);
+				q->t_len--;
+				skb->next = NULL;
+				skb->prev = NULL;
+				skb->dev = qdisc_dev(sch);
+
+				if (q->slot.slot_next) {
+					q->slot.packets_left--;
+					q->slot.bytes_left -=
+						qdisc_pkt_len(skb);
+					if (q->slot.packets_left <= 0 ||
+					    q->slot.bytes_left <= 0)
+						get_slot_next(q, now);
+				}
 				sch->q.qlen--;
 				goto deliver;
 			}
 		}
+	}
+
+	/* Schedule watchdog for next time-ready packet */
+	skb = netem_peek(q);
+	if (skb) {
+		u64 time_to_send = netem_skb_cb(skb)->time_to_send;
 
 		qdisc_watchdog_schedule_ns(&q->watchdog,
 					   max(time_to_send,
 					       q->slot.slot_next));
 	}
 
-	if (q->qdisc) {
-		skb = q->qdisc->ops->dequeue(q->qdisc);
-		if (skb) {
-			sch->q.qlen--;
-			goto deliver;
-		}
-	}
 	return NULL;
 }
 
-- 
2.51.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2026-03-15  0:17 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20260315001649.23931-1-stephen@networkplumber.org>
2026-03-15  0:14 ` [PATCH net v2 01/10] Revert "net/sched: Restrict conditions for adding duplicating netems to qdisc tree" Stephen Hemminger
2026-03-15  0:14 ` [PATCH net v2 02/10] net/sched: netem: add per-CPU recursion guard for duplication Stephen Hemminger
2026-03-15  0:14 ` [PATCH net v2 04/10] net/sched: netem: restructure dequeue to avoid re-entrancy with child qdisc Stephen Hemminger

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox