* [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands
@ 2023-10-01 14:50 Eric Dumazet
2023-10-01 14:50 ` [PATCH net-next 1/4] net_sched: sch_fq: remove q->ktime_cache Eric Dumazet
` (5 more replies)
0 siblings, 6 replies; 12+ messages in thread
From: Eric Dumazet @ 2023-10-01 14:50 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Willem de Bruijn, Soheil Hassas Yeganeh, Neal Cardwell,
Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev, eric.dumazet,
Eric Dumazet
As discussed at Netconf 2023 in Paris last week, this series gives
FQ the ability to replace pfifo_fast for most setups.
FQ provides fairness among flows, but malicious applications
can cause problems by using thousands of sockets.
Having 3 bands like pfifo_fast ensures that applications
using high prio packets (e.g. AF4) get guaranteed throughput
even if thousands of low priority flows are competing.
The added complexity in FQ does not matter in many cases when/if
the fastpath added in the prior series is used.
Eric Dumazet (4):
net_sched: sch_fq: remove q->ktime_cache
net_sched: export pfifo_fast prio2band[]
net_sched: sch_fq: add 3 bands and WRR scheduling
net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute
include/net/sch_generic.h | 1 +
include/uapi/linux/pkt_sched.h | 14 +-
net/sched/sch_fq.c | 263 ++++++++++++++++++++++++++-------
net/sched/sch_generic.c | 9 +-
4 files changed, 226 insertions(+), 61 deletions(-)
--
2.42.0.582.g8ccd20d70d-goog
* [PATCH net-next 1/4] net_sched: sch_fq: remove q->ktime_cache
2023-10-01 14:50 [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands Eric Dumazet
@ 2023-10-01 14:50 ` Eric Dumazet
2023-10-01 14:51 ` [PATCH net-next 2/4] net_sched: export pfifo_fast prio2band[] Eric Dumazet
` (4 subsequent siblings)
5 siblings, 0 replies; 12+ messages in thread
From: Eric Dumazet @ 2023-10-01 14:50 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Willem de Bruijn, Soheil Hassas Yeganeh, Neal Cardwell,
Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev, eric.dumazet,
Eric Dumazet
Now that both enqueue() and dequeue() need to use ktime_get_ns(),
there is no point wasting 8 bytes in struct fq_sched_data.
This makes room for future fields. ;)
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/sched/sch_fq.c | 28 +++++++++++++++-------------
1 file changed, 15 insertions(+), 13 deletions(-)
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 681bbf34b70763032c68d89003307ceec8ab46b4..91d71a538b71f9208f2507fd11443f784dffa966 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -106,7 +106,6 @@ struct fq_sched_data {
struct rb_root delayed; /* for rate limited flows */
u64 time_next_delayed_flow;
- u64 ktime_cache; /* copy of last ktime_get_ns() */
unsigned long unthrottle_latency_ns;
struct fq_flow internal; /* for non classified or high prio packets */
@@ -282,12 +281,13 @@ static void fq_gc(struct fq_sched_data *q,
*
* FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
*/
-static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb)
+static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb,
+ u64 now)
{
const struct fq_sched_data *q = qdisc_priv(sch);
const struct sock *sk;
- if (fq_skb_cb(skb)->time_to_send > q->ktime_cache)
+ if (fq_skb_cb(skb)->time_to_send > now)
return false;
if (sch->q.qlen != 0) {
@@ -317,7 +317,8 @@ static bool fq_fastpath_check(const struct Qdisc *sch, struct sk_buff *skb)
return true;
}
-static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
+static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
+ u64 now)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct rb_node **p, *parent;
@@ -360,7 +361,7 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
sk = (struct sock *)((hash << 1) | 1UL);
}
- if (fq_fastpath_check(sch, skb)) {
+ if (fq_fastpath_check(sch, skb, now)) {
q->internal.stat_fastpath_packets++;
return &q->internal;
}
@@ -497,9 +498,9 @@ static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
}
static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
- const struct fq_sched_data *q)
+ const struct fq_sched_data *q, u64 now)
{
- return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
+ return unlikely((s64)skb->tstamp > (s64)(now + q->horizon));
}
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
@@ -507,27 +508,28 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
{
struct fq_sched_data *q = qdisc_priv(sch);
struct fq_flow *f;
+ u64 now;
if (unlikely(sch->q.qlen >= sch->limit))
return qdisc_drop(skb, sch, to_free);
- q->ktime_cache = ktime_get_ns();
+ now = ktime_get_ns();
if (!skb->tstamp) {
- fq_skb_cb(skb)->time_to_send = q->ktime_cache;
+ fq_skb_cb(skb)->time_to_send = now;
} else {
/* Check if packet timestamp is too far in the future. */
- if (fq_packet_beyond_horizon(skb, q)) {
+ if (fq_packet_beyond_horizon(skb, q, now)) {
if (q->horizon_drop) {
q->stat_horizon_drops++;
return qdisc_drop(skb, sch, to_free);
}
q->stat_horizon_caps++;
- skb->tstamp = q->ktime_cache + q->horizon;
+ skb->tstamp = now + q->horizon;
}
fq_skb_cb(skb)->time_to_send = skb->tstamp;
}
- f = fq_classify(sch, skb);
+ f = fq_classify(sch, skb, now);
if (f != &q->internal) {
if (unlikely(f->qlen >= q->flow_plimit)) {
@@ -602,7 +604,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
goto out;
}
- q->ktime_cache = now = ktime_get_ns();
+ now = ktime_get_ns();
fq_check_throttled(q, now);
begin:
head = &q->new_flows;
--
2.42.0.582.g8ccd20d70d-goog
* [PATCH net-next 2/4] net_sched: export pfifo_fast prio2band[]
2023-10-01 14:50 [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands Eric Dumazet
2023-10-01 14:50 ` [PATCH net-next 1/4] net_sched: sch_fq: remove q->ktime_cache Eric Dumazet
@ 2023-10-01 14:51 ` Eric Dumazet
2023-10-01 14:51 ` [PATCH net-next 3/4] net_sched: sch_fq: add 3 bands and WRR scheduling Eric Dumazet
` (3 subsequent siblings)
5 siblings, 0 replies; 12+ messages in thread
From: Eric Dumazet @ 2023-10-01 14:51 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Willem de Bruijn, Soheil Hassas Yeganeh, Neal Cardwell,
Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev, eric.dumazet,
Eric Dumazet
pfifo_fast prio2band[] is renamed to sch_default_prio2band[]
and exported because we want to use it in FQ.
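For context, here is a purely illustrative sketch (not part of the patch)
of what this default mapping means in terms of the TC_PRIO_* values
defined in include/uapi/linux/pkt_sched.h:

#include <linux/pkt_sched.h>

/* Same table as sch_default_prio2band[], indexed by
 * skb->priority & TC_PRIO_MAX.
 */
static const unsigned char example_prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};

/* example_prio2band[TC_PRIO_CONTROL]    == 0  (highest prio band)
 * example_prio2band[TC_PRIO_BESTEFFORT] == 1
 * example_prio2band[TC_PRIO_BULK]       == 2  (lowest prio band)
 */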
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/net/sch_generic.h | 1 +
net/sched/sch_generic.c | 9 +++++----
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index f232512505f89622517f87a24b35c3a441c81b3d..c7318c73cfd632b730ca1e943c777f0475cd6458 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -587,6 +587,7 @@ static inline void sch_tree_unlock(struct Qdisc *q)
extern struct Qdisc noop_qdisc;
extern struct Qdisc_ops noop_qdisc_ops;
extern struct Qdisc_ops pfifo_fast_ops;
+extern const u8 sch_default_prio2band[TC_PRIO_MAX + 1];
extern struct Qdisc_ops mq_qdisc_ops;
extern struct Qdisc_ops noqueue_qdisc_ops;
extern const struct Qdisc_ops *default_qdisc_ops;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index 5d7e23f4cc0ee4c8c2c39cf10405f56fb6f0bfe1..4195a4bc26ca7932edb7b60b383f7887d960f3ca 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -694,9 +694,10 @@ struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
.owner = THIS_MODULE,
};
-static const u8 prio2band[TC_PRIO_MAX + 1] = {
- 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
+const u8 sch_default_prio2band[TC_PRIO_MAX + 1] = {
+ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};
+EXPORT_SYMBOL(sch_default_prio2band);
/* 3-band FIFO queue: old style, but should be a bit faster than
generic prio+fifo combination.
@@ -721,7 +722,7 @@ static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
struct sk_buff **to_free)
{
- int band = prio2band[skb->priority & TC_PRIO_MAX];
+ int band = sch_default_prio2band[skb->priority & TC_PRIO_MAX];
struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
struct skb_array *q = band2list(priv, band);
unsigned int pkt_len = qdisc_pkt_len(skb);
@@ -830,7 +831,7 @@ static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
- memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
+ memcpy(&opt.priomap, sch_default_prio2band, TC_PRIO_MAX + 1);
if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
goto nla_put_failure;
return skb->len;
--
2.42.0.582.g8ccd20d70d-goog
* [PATCH net-next 3/4] net_sched: sch_fq: add 3 bands and WRR scheduling
2023-10-01 14:50 [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands Eric Dumazet
2023-10-01 14:50 ` [PATCH net-next 1/4] net_sched: sch_fq: remove q->ktime_cache Eric Dumazet
2023-10-01 14:51 ` [PATCH net-next 2/4] net_sched: export pfifo_fast prio2band[] Eric Dumazet
@ 2023-10-01 14:51 ` Eric Dumazet
2023-10-01 22:15 ` Soheil Hassas Yeganeh
2023-10-02 11:46 ` Toke Høiland-Jørgensen
2023-10-01 14:51 ` [PATCH net-next 4/4] net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute Eric Dumazet
` (2 subsequent siblings)
5 siblings, 2 replies; 12+ messages in thread
From: Eric Dumazet @ 2023-10-01 14:51 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Willem de Bruijn, Soheil Hassas Yeganeh, Neal Cardwell,
Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev, eric.dumazet,
Eric Dumazet
Before Google adopted FQ for its production servers,
we had to ensure AF4 packets would get a higher share
than BE1 ones.
As discussed this week in Netconf 2023 in Paris, it is time
to upstream this for public use.
After this patch FQ can replace pfifo_fast, with the following
differences:
- FQ uses WRR instead of strict prio, to avoid starvation of
low priority packets.
- We make sure each band/prio tracks its own usage against sch->limit.
This was done to make sure a flood of low priority packets would not
prevent AF4 packets from being queued. Contributed by Willem.
- priomap can be changed, if needed (default values are the ones
coming from pfifo_fast).
In this patch, we set default band weights (see the sketch below) so that:
- high prio (band=0) packets get 90% of the bandwidth
if they compete with low prio (band=2) packets.
- high prio packets get 75% of the bandwidth
if they compete with medium prio (band=1) packets.
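For reference, here is a purely illustrative userspace sketch (not part
of the patch) of how the default quanta set in fq_init() below translate
into the shares quoted above when two bands are backlogged:

#include <stdio.h>

/* Default per-band quanta, mirroring fq_init() in this patch. */
static const int quantum[3] = { 9 << 16, 3 << 16, 1 << 16 };

int main(void)
{
	/* band 0 vs band 2: 589824 / (589824 + 65536) = 90% */
	printf("band0 vs band2: %d%%\n",
	       100 * quantum[0] / (quantum[0] + quantum[2]));
	/* band 0 vs band 1: 589824 / (589824 + 196608) = 75% */
	printf("band0 vs band1: %d%%\n",
	       100 * quantum[0] / (quantum[0] + quantum[1]));
	return 0;
}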
The following patch in this series adds the possibility to tune
the per-band weights.
As we added many fields in 'struct fq_sched_data', we made sure
to keep the first cache line read-mostly, and to avoid wasting
precious cache lines.
More optimizations are possible but will be sent separately.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/uapi/linux/pkt_sched.h | 11 +-
net/sched/sch_fq.c | 203 ++++++++++++++++++++++++++-------
2 files changed, 170 insertions(+), 44 deletions(-)
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 579f641846b87da05e5d4b09c1072c90220ca601..ec5ab44d41a2493130670870dc9e68c71187740f 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -941,15 +941,19 @@ enum {
TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */
+ TCA_FQ_PRIOMAP, /* prio2band */
+
__TCA_FQ_MAX
};
#define TCA_FQ_MAX (__TCA_FQ_MAX - 1)
+#define FQ_BANDS 3
+
struct tc_fq_qd_stats {
__u64 gc_flows;
- __u64 highprio_packets;
- __u64 tcp_retrans;
+ __u64 highprio_packets; /* obsolete */
+ __u64 tcp_retrans; /* obsolete */
__u64 throttled;
__u64 flows_plimit;
__u64 pkts_too_long;
@@ -963,6 +967,9 @@ struct tc_fq_qd_stats {
__u64 horizon_drops;
__u64 horizon_caps;
__u64 fastpath_packets;
+ __u64 band_drops[FQ_BANDS];
+ __u32 band_pkt_count[FQ_BANDS];
+ __u32 pad;
};
/* Heavy-Hitter Filter */
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 91d71a538b71f9208f2507fd11443f784dffa966..1bae145750a66f769bd30f1db09203f725801249 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -51,7 +51,8 @@
#include <net/tcp.h>
struct fq_skb_cb {
- u64 time_to_send;
+ u64 time_to_send;
+ u8 band;
};
static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
@@ -84,32 +85,28 @@ struct fq_flow {
u32 socket_hash; /* sk_hash */
int qlen; /* number of packets in flow queue */
-/* Second cache line, used in fq_dequeue() */
+/* Second cache line */
int credit;
- /* 32bit hole on 64bit arches */
-
+ int band;
struct fq_flow *next; /* next pointer in RR lists */
struct rb_node rate_node; /* anchor in q->delayed tree */
u64 time_next_packet;
-} ____cacheline_aligned_in_smp;
+};
struct fq_flow_head {
struct fq_flow *first;
struct fq_flow *last;
};
-struct fq_sched_data {
+struct fq_perband_flows {
struct fq_flow_head new_flows;
-
struct fq_flow_head old_flows;
+ int credit;
+ int quantum; /* based on band nr : 576KB, 192KB, 64KB */
+};
- struct rb_root delayed; /* for rate limited flows */
- u64 time_next_delayed_flow;
- unsigned long unthrottle_latency_ns;
-
- struct fq_flow internal; /* for non classified or high prio packets */
-
+struct fq_sched_data {
/* Read mostly cache line */
u32 quantum;
@@ -125,10 +122,21 @@ struct fq_sched_data {
u8 rate_enable;
u8 fq_trees_log;
u8 horizon_drop;
+ u8 prio2band[(TC_PRIO_MAX + 1) >> 2];
u32 timer_slack; /* hrtimer slack in ns */
/* Read/Write fields. */
+ unsigned int band_nr; /* band being serviced in fq_dequeue() */
+
+ struct fq_perband_flows band_flows[FQ_BANDS];
+
+ struct fq_flow internal; /* fastpath queue. */
+ struct rb_root delayed; /* for rate limited flows */
+ u64 time_next_delayed_flow;
+ unsigned long unthrottle_latency_ns;
+
+ u32 band_pkt_count[FQ_BANDS];
u32 flows;
u32 inactive_flows; /* Flows with no packet to send. */
u32 throttled_flows;
@@ -139,7 +147,7 @@ struct fq_sched_data {
/* Seldom used fields. */
- u64 stat_internal_packets; /* aka highprio */
+ u64 stat_band_drops[FQ_BANDS];
u64 stat_ce_mark;
u64 stat_horizon_drops;
u64 stat_horizon_caps;
@@ -148,6 +156,12 @@ struct fq_sched_data {
u64 stat_allocation_errors;
};
+/* return the i-th 2-bit value ("crumb") */
+static u8 fq_prio2band(const u8 *prio2band, unsigned int prio)
+{
+ return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3;
+}
+
/*
* f->tail and f->age share the same location.
* We can use the low order bit to differentiate if this location points
@@ -172,8 +186,19 @@ static bool fq_flow_is_throttled(const struct fq_flow *f)
return f->next == &throttled;
}
-static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
+enum new_flow {
+ NEW_FLOW,
+ OLD_FLOW
+};
+
+static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow,
+ enum new_flow list_sel)
{
+ struct fq_perband_flows *pband = &q->band_flows[flow->band];
+ struct fq_flow_head *head = (list_sel == NEW_FLOW) ?
+ &pband->new_flows :
+ &pband->old_flows;
+
if (head->first)
head->last->next = flow;
else
@@ -186,7 +211,7 @@ static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
{
rb_erase(&f->rate_node, &q->delayed);
q->throttled_flows--;
- fq_flow_add_tail(&q->old_flows, f);
+ fq_flow_add_tail(q, f, OLD_FLOW);
}
static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
@@ -326,11 +351,6 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
struct rb_root *root;
struct fq_flow *f;
- /* warning: no starvation prevention... */
- if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) {
- q->stat_internal_packets++; /* highprio packet */
- return &q->internal;
- }
/* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
* or a listener (SYNCOOKIE mode)
* 1) request sockets are not full blown,
@@ -509,9 +529,13 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
struct fq_sched_data *q = qdisc_priv(sch);
struct fq_flow *f;
u64 now;
+ u8 band;
- if (unlikely(sch->q.qlen >= sch->limit))
+ band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
+ if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
+ q->stat_band_drops[band]++;
return qdisc_drop(skb, sch, to_free);
+ }
now = ktime_get_ns();
if (!skb->tstamp) {
@@ -538,11 +562,14 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
}
if (fq_flow_is_detached(f)) {
- fq_flow_add_tail(&q->new_flows, f);
+ fq_flow_add_tail(q, f, NEW_FLOW);
if (time_after(jiffies, f->age + q->flow_refill_delay))
f->credit = max_t(u32, f->credit, q->quantum);
}
+ f->band = band;
+ q->band_pkt_count[band]++;
+ fq_skb_cb(skb)->band = band;
if (f->qlen == 0)
q->inactive_flows--;
}
@@ -584,13 +611,26 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
}
}
+static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband)
+{
+ if (pband->credit <= 0)
+ return NULL;
+
+ if (pband->new_flows.first)
+ return &pband->new_flows;
+
+ return pband->old_flows.first ? &pband->old_flows : NULL;
+}
+
static struct sk_buff *fq_dequeue(struct Qdisc *sch)
{
struct fq_sched_data *q = qdisc_priv(sch);
+ struct fq_perband_flows *pband;
struct fq_flow_head *head;
struct sk_buff *skb;
struct fq_flow *f;
unsigned long rate;
+ int retry;
u32 plen;
u64 now;
@@ -606,24 +646,31 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
now = ktime_get_ns();
fq_check_throttled(q, now);
+ retry = 0;
+ pband = &q->band_flows[q->band_nr];
begin:
- head = &q->new_flows;
- if (!head->first) {
- head = &q->old_flows;
- if (!head->first) {
- if (q->time_next_delayed_flow != ~0ULL)
- qdisc_watchdog_schedule_range_ns(&q->watchdog,
+ head = fq_pband_head_select(pband);
+ if (!head) {
+ while (++retry < FQ_BANDS) {
+ if (++q->band_nr == FQ_BANDS)
+ q->band_nr = 0;
+ pband = &q->band_flows[q->band_nr];
+ pband->credit = min(pband->credit + pband->quantum,
+ pband->quantum);
+ goto begin;
+ }
+ if (q->time_next_delayed_flow != ~0ULL)
+ qdisc_watchdog_schedule_range_ns(&q->watchdog,
q->time_next_delayed_flow,
q->timer_slack);
- return NULL;
- }
+ return NULL;
}
f = head->first;
-
+ retry = 0;
if (f->credit <= 0) {
f->credit += q->quantum;
head->first = f->next;
- fq_flow_add_tail(&q->old_flows, f);
+ fq_flow_add_tail(q, f, OLD_FLOW);
goto begin;
}
@@ -645,12 +692,13 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
}
if (--f->qlen == 0)
q->inactive_flows++;
+ q->band_pkt_count[fq_skb_cb(skb)->band]--;
fq_dequeue_skb(sch, f, skb);
} else {
head->first = f->next;
/* force a pass through old_flows to prevent starvation */
- if ((head == &q->new_flows) && q->old_flows.first) {
- fq_flow_add_tail(&q->old_flows, f);
+ if (head == &pband->new_flows) {
+ fq_flow_add_tail(q, f, OLD_FLOW);
} else {
fq_flow_set_detached(f);
}
@@ -658,6 +706,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
}
plen = qdisc_pkt_len(skb);
f->credit -= plen;
+ pband->credit -= plen;
if (!q->rate_enable)
goto out;
@@ -749,8 +798,10 @@ static void fq_reset(struct Qdisc *sch)
kmem_cache_free(fq_flow_cachep, f);
}
}
- q->new_flows.first = NULL;
- q->old_flows.first = NULL;
+ for (idx = 0; idx < FQ_BANDS; idx++) {
+ q->band_flows[idx].new_flows.first = NULL;
+ q->band_flows[idx].old_flows.first = NULL;
+ }
q->delayed = RB_ROOT;
q->flows = 0;
q->inactive_flows = 0;
@@ -864,8 +915,53 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
[TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 },
[TCA_FQ_HORIZON] = { .type = NLA_U32 },
[TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 },
+ [TCA_FQ_PRIOMAP] = {
+ .type = NLA_BINARY,
+ .len = sizeof(struct tc_prio_qopt),
+ },
};
+/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
+static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
+{
+ const int num_elems = TC_PRIO_MAX + 1;
+ int i;
+
+ memset(out, 0, num_elems / 4);
+ for (i = 0; i < num_elems; i++)
+ out[i / 4] |= in[i] << (2 * (i & 0x3));
+}
+
+static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
+{
+ const int num_elems = TC_PRIO_MAX + 1;
+ int i;
+
+ for (i = 0; i < num_elems; i++)
+ out[i] = fq_prio2band(in, i);
+}
+
+static int fq_load_priomap(struct fq_sched_data *q,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ const struct tc_prio_qopt *map = nla_data(attr);
+ int i;
+
+ if (map->bands != FQ_BANDS) {
+ NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
+ return -EINVAL;
+ }
+ for (i = 0; i < TC_PRIO_MAX + 1; i++) {
+ if (map->priomap[i] >= FQ_BANDS) {
+ NL_SET_ERR_MSG_MOD(extack, "Incorrect field in FQ priomap");
+ return -EINVAL;
+ }
+ }
+ fq_prio2band_compress_crumb(map->priomap, q->prio2band);
+ return 0;
+}
+
static int fq_change(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
@@ -940,6 +1036,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
}
+ if (!err && tb[TCA_FQ_PRIOMAP])
+ err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);
+
if (tb[TCA_FQ_ORPHAN_MASK])
q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
@@ -991,7 +1090,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
struct netlink_ext_ack *extack)
{
struct fq_sched_data *q = qdisc_priv(sch);
- int err;
+ int i, err;
sch->limit = 10000;
q->flow_plimit = 100;
@@ -1001,8 +1100,13 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
q->flow_max_rate = ~0UL;
q->time_next_delayed_flow = ~0ULL;
q->rate_enable = 1;
- q->new_flows.first = NULL;
- q->old_flows.first = NULL;
+ for (i = 0; i < FQ_BANDS; i++) {
+ q->band_flows[i].new_flows.first = NULL;
+ q->band_flows[i].old_flows.first = NULL;
+ }
+ q->band_flows[0].quantum = 9 << 16;
+ q->band_flows[1].quantum = 3 << 16;
+ q->band_flows[2].quantum = 1 << 16;
q->delayed = RB_ROOT;
q->fq_root = NULL;
q->fq_trees_log = ilog2(1024);
@@ -1017,6 +1121,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
/* Default ce_threshold of 4294 seconds */
q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
+ fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band);
qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
if (opt)
@@ -1031,6 +1136,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
struct fq_sched_data *q = qdisc_priv(sch);
u64 ce_threshold = q->ce_threshold;
+ struct tc_prio_qopt prio = {
+ .bands = FQ_BANDS,
+ };
u64 horizon = q->horizon;
struct nlattr *opts;
@@ -1062,6 +1170,10 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
goto nla_put_failure;
+ fq_prio2band_decompress_crumb(q->prio2band, prio.priomap);
+ if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
+ goto nla_put_failure;
+
return nla_nest_end(skb, opts);
nla_put_failure:
@@ -1072,11 +1184,14 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
struct fq_sched_data *q = qdisc_priv(sch);
struct tc_fq_qd_stats st;
+ int i;
+
+ st.pad = 0;
sch_tree_lock(sch);
st.gc_flows = q->stat_gc_flows;
- st.highprio_packets = q->stat_internal_packets;
+ st.highprio_packets = 0;
st.fastpath_packets = q->internal.stat_fastpath_packets;
st.tcp_retrans = 0;
st.throttled = q->stat_throttled;
@@ -1093,6 +1208,10 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
st.ce_mark = q->stat_ce_mark;
st.horizon_drops = q->stat_horizon_drops;
st.horizon_caps = q->stat_horizon_caps;
+ for (i = 0; i < FQ_BANDS; i++) {
+ st.band_drops[i] = q->stat_band_drops[i];
+ st.band_pkt_count[i] = q->band_pkt_count[i];
+ }
sch_tree_unlock(sch);
return gnet_stats_copy_app(d, &st, sizeof(st));
@@ -1120,7 +1239,7 @@ static int __init fq_module_init(void)
fq_flow_cachep = kmem_cache_create("fq_flow_cache",
sizeof(struct fq_flow),
- 0, 0, NULL);
+ 0, SLAB_HWCACHE_ALIGN, NULL);
if (!fq_flow_cachep)
return -ENOMEM;
--
2.42.0.582.g8ccd20d70d-goog
* [PATCH net-next 4/4] net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute
2023-10-01 14:50 [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands Eric Dumazet
` (2 preceding siblings ...)
2023-10-01 14:51 ` [PATCH net-next 3/4] net_sched: sch_fq: add 3 bands and WRR scheduling Eric Dumazet
@ 2023-10-01 14:51 ` Eric Dumazet
2023-10-02 11:47 ` Toke Høiland-Jørgensen
2023-10-01 15:10 ` [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands Dave Taht
2023-10-02 6:48 ` Willem de Bruijn
5 siblings, 1 reply; 12+ messages in thread
From: Eric Dumazet @ 2023-10-01 14:51 UTC (permalink / raw)
To: David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Willem de Bruijn, Soheil Hassas Yeganeh, Neal Cardwell,
Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev, eric.dumazet,
Eric Dumazet
This attribute can be used to tune the per-band weights,
which are also reported in "tc qdisc show" output:
qdisc fq 802f: parent 1:9 limit 100000p flow_limit 500p buckets 1024 orphan_mask 1023
quantum 8364b initial_quantum 41820b low_rate_threshold 550Kbit
refill_delay 40ms timer_slack 10us horizon 10s horizon_drop
bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 weights 589824 196608 65536
Sent 236460814 bytes 792991 pkt (dropped 0, overlimits 0 requeues 0)
rate 25816bit 10pps backlog 0b 0p requeues 0
flows 4 (inactive 4 throttled 0)
gc 0 throttled 19 latency 17.6us fastpath 773882
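For illustration only (not part of the patch), the TCA_FQ_WEIGHTS payload
is a flat array of FQ_BANDS signed 32-bit weights, one per band, each of
which must be at least FQ_MIN_WEIGHT. A minimal userspace sketch of the
default values could look like this:

#include <linux/types.h>

#define FQ_BANDS	3	/* mirrors include/uapi/linux/pkt_sched.h */
#define FQ_MIN_WEIGHT	16384	/* mirrors include/uapi/linux/pkt_sched.h */

/* Kernel default weights; a netlink-aware tool such as tc would carry
 * this array in a TCA_FQ_WEIGHTS attribute nested under TCA_OPTIONS.
 */
static const __s32 fq_weights[FQ_BANDS] = {
	9 << 16,	/* band 0 (high prio)   : 589824 */
	3 << 16,	/* band 1 (medium prio) : 196608 */
	1 << 16,	/* band 2 (low prio)    :  65536 */
};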
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
include/uapi/linux/pkt_sched.h | 3 +++
net/sched/sch_fq.c | 32 ++++++++++++++++++++++++++++++++
2 files changed, 35 insertions(+)
diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index ec5ab44d41a2493130670870dc9e68c71187740f..f762a10bfb78ed896d8a5b936045a956d97b3831 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -943,12 +943,15 @@ enum {
TCA_FQ_PRIOMAP, /* prio2band */
+ TCA_FQ_WEIGHTS, /* Weights for each band */
+
__TCA_FQ_MAX
};
#define TCA_FQ_MAX (__TCA_FQ_MAX - 1)
#define FQ_BANDS 3
+#define FQ_MIN_WEIGHT 16384
struct tc_fq_qd_stats {
__u64 gc_flows;
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 1bae145750a66f769bd30f1db09203f725801249..1a411fe36c79a86635f319c230a045d653571700 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -919,6 +919,10 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
.type = NLA_BINARY,
.len = sizeof(struct tc_prio_qopt),
},
+ [TCA_FQ_WEIGHTS] = {
+ .type = NLA_BINARY,
+ .len = FQ_BANDS * sizeof(s32),
+ },
};
/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
@@ -941,6 +945,24 @@ static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
out[i] = fq_prio2band(in, i);
}
+static int fq_load_weights(struct fq_sched_data *q,
+ const struct nlattr *attr,
+ struct netlink_ext_ack *extack)
+{
+ s32 *weights = nla_data(attr);
+ int i;
+
+ for (i = 0; i < FQ_BANDS; i++) {
+ if (weights[i] < FQ_MIN_WEIGHT) {
+ NL_SET_ERR_MSG_MOD(extack, "Incorrect weight");
+ return -EINVAL;
+ }
+ }
+ for (i = 0; i < FQ_BANDS; i++)
+ q->band_flows[i].quantum = weights[i];
+ return 0;
+}
+
static int fq_load_priomap(struct fq_sched_data *q,
const struct nlattr *attr,
struct netlink_ext_ack *extack)
@@ -1039,6 +1061,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
if (!err && tb[TCA_FQ_PRIOMAP])
err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);
+ if (!err && tb[TCA_FQ_WEIGHTS])
+ err = fq_load_weights(q, tb[TCA_FQ_WEIGHTS], extack);
+
if (tb[TCA_FQ_ORPHAN_MASK])
q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
@@ -1141,6 +1166,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
};
u64 horizon = q->horizon;
struct nlattr *opts;
+ s32 weights[3];
opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (opts == NULL)
@@ -1174,6 +1200,12 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
goto nla_put_failure;
+ weights[0] = q->band_flows[0].quantum;
+ weights[1] = q->band_flows[1].quantum;
+ weights[2] = q->band_flows[2].quantum;
+ if (nla_put(skb, TCA_FQ_WEIGHTS, sizeof(weights), &weights))
+ goto nla_put_failure;
+
return nla_nest_end(skb, opts);
nla_put_failure:
--
2.42.0.582.g8ccd20d70d-goog
* Re: [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands
2023-10-01 14:50 [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands Eric Dumazet
` (3 preceding siblings ...)
2023-10-01 14:51 ` [PATCH net-next 4/4] net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute Eric Dumazet
@ 2023-10-01 15:10 ` Dave Taht
2023-10-02 6:48 ` Willem de Bruijn
5 siblings, 0 replies; 12+ messages in thread
From: Dave Taht @ 2023-10-01 15:10 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Willem de Bruijn,
Soheil Hassas Yeganeh, Neal Cardwell, Jamal Hadi Salim, Cong Wang,
Jiri Pirko, netdev, eric.dumazet
On Sun, Oct 1, 2023 at 7:51 AM Eric Dumazet <edumazet@google.com> wrote:
>
> As discussed in Netconf 2023 in Paris last week, this series adds
> to FQ the possibility of replacing pfifo_fast for most setups.
>
> FQ provides fairness among flows, but malicious applications
> can cause problems by using thousands of sockets.
>
> Having 3 bands like pfifo_fast can make sure that applications
> using high prio packets (eg AF4) can get guaranteed throughput
> even if thousands of low priority flows are competing.
>
> Added complexity in FQ does not matter in many cases when/if
> fastpath added in the prior series is used.
>
> Eric Dumazet (4):
> net_sched: sch_fq: remove q->ktime_cache
> net_sched: export pfifo_fast prio2band[]
> net_sched: sch_fq: add 3 bands and WRR scheduling
> net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute
>
> include/net/sch_generic.h | 1 +
> include/uapi/linux/pkt_sched.h | 14 +-
> net/sched/sch_fq.c | 263 ++++++++++++++++++++++++++-------
> net/sched/sch_generic.c | 9 +-
> 4 files changed, 226 insertions(+), 61 deletions(-)
>
> --
> 2.42.0.582.g8ccd20d70d-goog
>
>
While I am delighted to see this, my concern is about udp traffic. I
have not paid much attention to how that is treated in sch_fq in
recent years; it was, originally, a second class citizen. I assume the
prio stuff here works on all protocols? Have similar pacing,
udp_notsent_lowat, etc. things been added to that? (I really don't
know, I am lagging 4 years behind on kernel developments.)
If that is not the case I would like the commit message clarified to
say something like "most tcp-mainly servers and clients, and not
routers, or applications leveraging udp without backpressure, such as
vpns, or voip, or quic applications." The confusion over the use
cases for sch_fq vs fq_codel or cake has been a PITA. I was very
pleased to see effective backpressure working on containers (circa
6.1).
Acked-by: Dave Taht <dave.taht@gmail.com>
--
Oct 30: https://netdevconf.info/0x17/news/the-maestro-and-the-music-bof.html
Dave Täht CSO, LibreQos
* Re: [PATCH net-next 3/4] net_sched: sch_fq: add 3 bands and WRR scheduling
2023-10-01 14:51 ` [PATCH net-next 3/4] net_sched: sch_fq: add 3 bands and WRR scheduling Eric Dumazet
@ 2023-10-01 22:15 ` Soheil Hassas Yeganeh
2023-10-02 11:46 ` Toke Høiland-Jørgensen
1 sibling, 0 replies; 12+ messages in thread
From: Soheil Hassas Yeganeh @ 2023-10-01 22:15 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Willem de Bruijn,
Neal Cardwell, Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev,
eric.dumazet
On Sun, Oct 1, 2023 at 10:51 AM Eric Dumazet <edumazet@google.com> wrote:
>
> Before Google adopted FQ for its production servers,
> we had to ensure AF4 packets would get a higher share
> than BE1 ones.
>
> As discussed this week in Netconf 2023 in Paris, it is time
> to upstream this for public use.
>
> After this patch FQ can replace pfifo_fast, with the following
> differences :
>
> - FQ uses WRR instead of strict prio, to avoid starvation of
> low priority packets.
>
> - We make sure each band/prio tracks its own usage against sch->limit.
> This was done to make sure flood of low priority packets would not
> prevent AF4 packets to be queued. Contributed by Willem.
>
> - priomap can be changed, if needed (default value are the ones
> coming from pfifo_fast).
>
> In this patch, we set default band weights so that :
>
> - high prio (band=0) packets get 90% of the bandwidth
> if they compete with low prio (band=2) packets.
>
> - high prio packets get 75% of the bandwidth
> if they compete with medium prio (band=1) packets.
>
> Following patch in this series adds the possibility to tune
> the per-band weights.
>
> As we added many fields in 'struct fq_sched_data', we had
> to make sure to have the first cache line read-mostly, and
> avoid wasting precious cache lines.
>
> More optimizations are possible but will be sent separately.
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
Thank you for upstreaming this feature!
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
> ---
> include/uapi/linux/pkt_sched.h | 11 +-
> net/sched/sch_fq.c | 203 ++++++++++++++++++++++++++-------
> 2 files changed, 170 insertions(+), 44 deletions(-)
>
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index 579f641846b87da05e5d4b09c1072c90220ca601..ec5ab44d41a2493130670870dc9e68c71187740f 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -941,15 +941,19 @@ enum {
>
> TCA_FQ_HORIZON_DROP, /* drop packets beyond horizon, or cap their EDT */
>
> + TCA_FQ_PRIOMAP, /* prio2band */
> +
> __TCA_FQ_MAX
> };
>
> #define TCA_FQ_MAX (__TCA_FQ_MAX - 1)
>
> +#define FQ_BANDS 3
> +
> struct tc_fq_qd_stats {
> __u64 gc_flows;
> - __u64 highprio_packets;
> - __u64 tcp_retrans;
> + __u64 highprio_packets; /* obsolete */
> + __u64 tcp_retrans; /* obsolete */
> __u64 throttled;
> __u64 flows_plimit;
> __u64 pkts_too_long;
> @@ -963,6 +967,9 @@ struct tc_fq_qd_stats {
> __u64 horizon_drops;
> __u64 horizon_caps;
> __u64 fastpath_packets;
> + __u64 band_drops[FQ_BANDS];
> + __u32 band_pkt_count[FQ_BANDS];
> + __u32 pad;
> };
>
> /* Heavy-Hitter Filter */
> diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
> index 91d71a538b71f9208f2507fd11443f784dffa966..1bae145750a66f769bd30f1db09203f725801249 100644
> --- a/net/sched/sch_fq.c
> +++ b/net/sched/sch_fq.c
> @@ -51,7 +51,8 @@
> #include <net/tcp.h>
>
> struct fq_skb_cb {
> - u64 time_to_send;
> + u64 time_to_send;
> + u8 band;
> };
>
> static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
> @@ -84,32 +85,28 @@ struct fq_flow {
> u32 socket_hash; /* sk_hash */
> int qlen; /* number of packets in flow queue */
>
> -/* Second cache line, used in fq_dequeue() */
> +/* Second cache line */
> int credit;
> - /* 32bit hole on 64bit arches */
> -
> + int band;
> struct fq_flow *next; /* next pointer in RR lists */
>
> struct rb_node rate_node; /* anchor in q->delayed tree */
> u64 time_next_packet;
> -} ____cacheline_aligned_in_smp;
> +};
>
> struct fq_flow_head {
> struct fq_flow *first;
> struct fq_flow *last;
> };
>
> -struct fq_sched_data {
> +struct fq_perband_flows {
> struct fq_flow_head new_flows;
> -
> struct fq_flow_head old_flows;
> + int credit;
> + int quantum; /* based on band nr : 576KB, 192KB, 64KB */
> +};
>
> - struct rb_root delayed; /* for rate limited flows */
> - u64 time_next_delayed_flow;
> - unsigned long unthrottle_latency_ns;
> -
> - struct fq_flow internal; /* for non classified or high prio packets */
> -
> +struct fq_sched_data {
> /* Read mostly cache line */
>
> u32 quantum;
> @@ -125,10 +122,21 @@ struct fq_sched_data {
> u8 rate_enable;
> u8 fq_trees_log;
> u8 horizon_drop;
> + u8 prio2band[(TC_PRIO_MAX + 1) >> 2];
> u32 timer_slack; /* hrtimer slack in ns */
>
> /* Read/Write fields. */
>
> + unsigned int band_nr; /* band being serviced in fq_dequeue() */
> +
> + struct fq_perband_flows band_flows[FQ_BANDS];
> +
> + struct fq_flow internal; /* fastpath queue. */
> + struct rb_root delayed; /* for rate limited flows */
> + u64 time_next_delayed_flow;
> + unsigned long unthrottle_latency_ns;
> +
> + u32 band_pkt_count[FQ_BANDS];
> u32 flows;
> u32 inactive_flows; /* Flows with no packet to send. */
> u32 throttled_flows;
> @@ -139,7 +147,7 @@ struct fq_sched_data {
>
> /* Seldom used fields. */
>
> - u64 stat_internal_packets; /* aka highprio */
> + u64 stat_band_drops[FQ_BANDS];
> u64 stat_ce_mark;
> u64 stat_horizon_drops;
> u64 stat_horizon_caps;
> @@ -148,6 +156,12 @@ struct fq_sched_data {
> u64 stat_allocation_errors;
> };
>
> +/* return the i-th 2-bit value ("crumb") */
> +static u8 fq_prio2band(const u8 *prio2band, unsigned int prio)
> +{
> + return (prio2band[prio / 4] >> (2 * (prio & 0x3))) & 0x3;
> +}
> +
> /*
> * f->tail and f->age share the same location.
> * We can use the low order bit to differentiate if this location points
> @@ -172,8 +186,19 @@ static bool fq_flow_is_throttled(const struct fq_flow *f)
> return f->next == &throttled;
> }
>
> -static void fq_flow_add_tail(struct fq_flow_head *head, struct fq_flow *flow)
> +enum new_flow {
> + NEW_FLOW,
> + OLD_FLOW
> +};
> +
> +static void fq_flow_add_tail(struct fq_sched_data *q, struct fq_flow *flow,
> + enum new_flow list_sel)
> {
> + struct fq_perband_flows *pband = &q->band_flows[flow->band];
> + struct fq_flow_head *head = (list_sel == NEW_FLOW) ?
> + &pband->new_flows :
> + &pband->old_flows;
> +
> if (head->first)
> head->last->next = flow;
> else
> @@ -186,7 +211,7 @@ static void fq_flow_unset_throttled(struct fq_sched_data *q, struct fq_flow *f)
> {
> rb_erase(&f->rate_node, &q->delayed);
> q->throttled_flows--;
> - fq_flow_add_tail(&q->old_flows, f);
> + fq_flow_add_tail(q, f, OLD_FLOW);
> }
>
> static void fq_flow_set_throttled(struct fq_sched_data *q, struct fq_flow *f)
> @@ -326,11 +351,6 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb,
> struct rb_root *root;
> struct fq_flow *f;
>
> - /* warning: no starvation prevention... */
> - if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) {
> - q->stat_internal_packets++; /* highprio packet */
> - return &q->internal;
> - }
> /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
> * or a listener (SYNCOOKIE mode)
> * 1) request sockets are not full blown,
> @@ -509,9 +529,13 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> struct fq_sched_data *q = qdisc_priv(sch);
> struct fq_flow *f;
> u64 now;
> + u8 band;
>
> - if (unlikely(sch->q.qlen >= sch->limit))
> + band = fq_prio2band(q->prio2band, skb->priority & TC_PRIO_MAX);
> + if (unlikely(q->band_pkt_count[band] >= sch->limit)) {
> + q->stat_band_drops[band]++;
> return qdisc_drop(skb, sch, to_free);
> + }
>
> now = ktime_get_ns();
> if (!skb->tstamp) {
> @@ -538,11 +562,14 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
> }
>
> if (fq_flow_is_detached(f)) {
> - fq_flow_add_tail(&q->new_flows, f);
> + fq_flow_add_tail(q, f, NEW_FLOW);
> if (time_after(jiffies, f->age + q->flow_refill_delay))
> f->credit = max_t(u32, f->credit, q->quantum);
> }
>
> + f->band = band;
> + q->band_pkt_count[band]++;
> + fq_skb_cb(skb)->band = band;
> if (f->qlen == 0)
> q->inactive_flows--;
> }
> @@ -584,13 +611,26 @@ static void fq_check_throttled(struct fq_sched_data *q, u64 now)
> }
> }
>
> +static struct fq_flow_head *fq_pband_head_select(struct fq_perband_flows *pband)
> +{
> + if (pband->credit <= 0)
> + return NULL;
> +
> + if (pband->new_flows.first)
> + return &pband->new_flows;
> +
> + return pband->old_flows.first ? &pband->old_flows : NULL;
> +}
> +
> static struct sk_buff *fq_dequeue(struct Qdisc *sch)
> {
> struct fq_sched_data *q = qdisc_priv(sch);
> + struct fq_perband_flows *pband;
> struct fq_flow_head *head;
> struct sk_buff *skb;
> struct fq_flow *f;
> unsigned long rate;
> + int retry;
> u32 plen;
> u64 now;
>
> @@ -606,24 +646,31 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
>
> now = ktime_get_ns();
> fq_check_throttled(q, now);
> + retry = 0;
> + pband = &q->band_flows[q->band_nr];
> begin:
> - head = &q->new_flows;
> - if (!head->first) {
> - head = &q->old_flows;
> - if (!head->first) {
> - if (q->time_next_delayed_flow != ~0ULL)
> - qdisc_watchdog_schedule_range_ns(&q->watchdog,
> + head = fq_pband_head_select(pband);
> + if (!head) {
> + while (++retry < FQ_BANDS) {
> + if (++q->band_nr == FQ_BANDS)
> + q->band_nr = 0;
> + pband = &q->band_flows[q->band_nr];
> + pband->credit = min(pband->credit + pband->quantum,
> + pband->quantum);
> + goto begin;
> + }
> + if (q->time_next_delayed_flow != ~0ULL)
> + qdisc_watchdog_schedule_range_ns(&q->watchdog,
> q->time_next_delayed_flow,
> q->timer_slack);
> - return NULL;
> - }
> + return NULL;
> }
> f = head->first;
> -
> + retry = 0;
> if (f->credit <= 0) {
> f->credit += q->quantum;
> head->first = f->next;
> - fq_flow_add_tail(&q->old_flows, f);
> + fq_flow_add_tail(q, f, OLD_FLOW);
> goto begin;
> }
>
> @@ -645,12 +692,13 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
> }
> if (--f->qlen == 0)
> q->inactive_flows++;
> + q->band_pkt_count[fq_skb_cb(skb)->band]--;
> fq_dequeue_skb(sch, f, skb);
> } else {
> head->first = f->next;
> /* force a pass through old_flows to prevent starvation */
> - if ((head == &q->new_flows) && q->old_flows.first) {
> - fq_flow_add_tail(&q->old_flows, f);
> + if (head == &pband->new_flows) {
> + fq_flow_add_tail(q, f, OLD_FLOW);
> } else {
> fq_flow_set_detached(f);
> }
> @@ -658,6 +706,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
> }
> plen = qdisc_pkt_len(skb);
> f->credit -= plen;
> + pband->credit -= plen;
>
> if (!q->rate_enable)
> goto out;
> @@ -749,8 +798,10 @@ static void fq_reset(struct Qdisc *sch)
> kmem_cache_free(fq_flow_cachep, f);
> }
> }
> - q->new_flows.first = NULL;
> - q->old_flows.first = NULL;
> + for (idx = 0; idx < FQ_BANDS; idx++) {
> + q->band_flows[idx].new_flows.first = NULL;
> + q->band_flows[idx].old_flows.first = NULL;
> + }
> q->delayed = RB_ROOT;
> q->flows = 0;
> q->inactive_flows = 0;
> @@ -864,8 +915,53 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
> [TCA_FQ_TIMER_SLACK] = { .type = NLA_U32 },
> [TCA_FQ_HORIZON] = { .type = NLA_U32 },
> [TCA_FQ_HORIZON_DROP] = { .type = NLA_U8 },
> + [TCA_FQ_PRIOMAP] = {
> + .type = NLA_BINARY,
> + .len = sizeof(struct tc_prio_qopt),
> + },
> };
>
> +/* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
> +static void fq_prio2band_compress_crumb(const u8 *in, u8 *out)
> +{
> + const int num_elems = TC_PRIO_MAX + 1;
> + int i;
> +
> + memset(out, 0, num_elems / 4);
> + for (i = 0; i < num_elems; i++)
> + out[i / 4] |= in[i] << (2 * (i & 0x3));
> +}
> +
> +static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
> +{
> + const int num_elems = TC_PRIO_MAX + 1;
> + int i;
> +
> + for (i = 0; i < num_elems; i++)
> + out[i] = fq_prio2band(in, i);
> +}
> +
> +static int fq_load_priomap(struct fq_sched_data *q,
> + const struct nlattr *attr,
> + struct netlink_ext_ack *extack)
> +{
> + const struct tc_prio_qopt *map = nla_data(attr);
> + int i;
> +
> + if (map->bands != FQ_BANDS) {
> + NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
> + return -EINVAL;
> + }
> + for (i = 0; i < TC_PRIO_MAX + 1; i++) {
> + if (map->priomap[i] >= FQ_BANDS) {
> + NL_SET_ERR_MSG_MOD(extack, "Incorrect field in FQ priomap");
> + return -EINVAL;
> + }
> + }
> + fq_prio2band_compress_crumb(map->priomap, q->prio2band);
> + return 0;
> +}
> +
> static int fq_change(struct Qdisc *sch, struct nlattr *opt,
> struct netlink_ext_ack *extack)
> {
> @@ -940,6 +1036,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt,
> q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
> }
>
> + if (!err && tb[TCA_FQ_PRIOMAP])
> + err = fq_load_priomap(q, tb[TCA_FQ_PRIOMAP], extack);
> +
> if (tb[TCA_FQ_ORPHAN_MASK])
> q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
>
> @@ -991,7 +1090,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
> struct netlink_ext_ack *extack)
> {
> struct fq_sched_data *q = qdisc_priv(sch);
> - int err;
> + int i, err;
>
> sch->limit = 10000;
> q->flow_plimit = 100;
> @@ -1001,8 +1100,13 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
> q->flow_max_rate = ~0UL;
> q->time_next_delayed_flow = ~0ULL;
> q->rate_enable = 1;
> - q->new_flows.first = NULL;
> - q->old_flows.first = NULL;
> + for (i = 0; i < FQ_BANDS; i++) {
> + q->band_flows[i].new_flows.first = NULL;
> + q->band_flows[i].old_flows.first = NULL;
> + }
> + q->band_flows[0].quantum = 9 << 16;
> + q->band_flows[1].quantum = 3 << 16;
> + q->band_flows[2].quantum = 1 << 16;
> q->delayed = RB_ROOT;
> q->fq_root = NULL;
> q->fq_trees_log = ilog2(1024);
> @@ -1017,6 +1121,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt,
> /* Default ce_threshold of 4294 seconds */
> q->ce_threshold = (u64)NSEC_PER_USEC * ~0U;
>
> + fq_prio2band_compress_crumb(sch_default_prio2band, q->prio2band);
> qdisc_watchdog_init_clockid(&q->watchdog, sch, CLOCK_MONOTONIC);
>
> if (opt)
> @@ -1031,6 +1136,9 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
> {
> struct fq_sched_data *q = qdisc_priv(sch);
> u64 ce_threshold = q->ce_threshold;
> + struct tc_prio_qopt prio = {
> + .bands = FQ_BANDS,
> + };
> u64 horizon = q->horizon;
> struct nlattr *opts;
>
> @@ -1062,6 +1170,10 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
> nla_put_u8(skb, TCA_FQ_HORIZON_DROP, q->horizon_drop))
> goto nla_put_failure;
>
> + fq_prio2band_decompress_crumb(q->prio2band, prio.priomap);
> + if (nla_put(skb, TCA_FQ_PRIOMAP, sizeof(prio), &prio))
> + goto nla_put_failure;
> +
> return nla_nest_end(skb, opts);
>
> nla_put_failure:
> @@ -1072,11 +1184,14 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
> {
> struct fq_sched_data *q = qdisc_priv(sch);
> struct tc_fq_qd_stats st;
> + int i;
> +
> + st.pad = 0;
>
> sch_tree_lock(sch);
>
> st.gc_flows = q->stat_gc_flows;
> - st.highprio_packets = q->stat_internal_packets;
> + st.highprio_packets = 0;
> st.fastpath_packets = q->internal.stat_fastpath_packets;
> st.tcp_retrans = 0;
> st.throttled = q->stat_throttled;
> @@ -1093,6 +1208,10 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
> st.ce_mark = q->stat_ce_mark;
> st.horizon_drops = q->stat_horizon_drops;
> st.horizon_caps = q->stat_horizon_caps;
> + for (i = 0; i < FQ_BANDS; i++) {
> + st.band_drops[i] = q->stat_band_drops[i];
> + st.band_pkt_count[i] = q->band_pkt_count[i];
> + }
> sch_tree_unlock(sch);
>
> return gnet_stats_copy_app(d, &st, sizeof(st));
> @@ -1120,7 +1239,7 @@ static int __init fq_module_init(void)
>
> fq_flow_cachep = kmem_cache_create("fq_flow_cache",
> sizeof(struct fq_flow),
> - 0, 0, NULL);
> + 0, SLAB_HWCACHE_ALIGN, NULL);
> if (!fq_flow_cachep)
> return -ENOMEM;
>
> --
> 2.42.0.582.g8ccd20d70d-goog
>
* Re: [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands
2023-10-01 14:50 [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands Eric Dumazet
` (4 preceding siblings ...)
2023-10-01 15:10 ` [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands Dave Taht
@ 2023-10-02 6:48 ` Willem de Bruijn
5 siblings, 0 replies; 12+ messages in thread
From: Willem de Bruijn @ 2023-10-02 6:48 UTC (permalink / raw)
To: Eric Dumazet
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Willem de Bruijn,
Soheil Hassas Yeganeh, Neal Cardwell, Jamal Hadi Salim, Cong Wang,
Jiri Pirko, netdev, eric.dumazet
On Sun, Oct 1, 2023 at 4:51 PM Eric Dumazet <edumazet@google.com> wrote:
>
> As discussed in Netconf 2023 in Paris last week, this series adds
> to FQ the possibility of replacing pfifo_fast for most setups.
>
> FQ provides fairness among flows, but malicious applications
> can cause problems by using thousands of sockets.
>
> Having 3 bands like pfifo_fast can make sure that applications
> using high prio packets (eg AF4) can get guaranteed throughput
> even if thousands of low priority flows are competing.
>
> Added complexity in FQ does not matter in many cases when/if
> fastpath added in the prior series is used.
>
> Eric Dumazet (4):
> net_sched: sch_fq: remove q->ktime_cache
> net_sched: export pfifo_fast prio2band[]
> net_sched: sch_fq: add 3 bands and WRR scheduling
> net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute
>
> include/net/sch_generic.h | 1 +
> include/uapi/linux/pkt_sched.h | 14 +-
> net/sched/sch_fq.c | 263 ++++++++++++++++++++++++++-------
> net/sched/sch_generic.c | 9 +-
> 4 files changed, 226 insertions(+), 61 deletions(-)
>
> --
> 2.42.0.582.g8ccd20d70d-goog
For the series:
Reviewed-by: Willem de Bruijn <willemb@google.com>
* Re: [PATCH net-next 3/4] net_sched: sch_fq: add 3 bands and WRR scheduling
2023-10-01 14:51 ` [PATCH net-next 3/4] net_sched: sch_fq: add 3 bands and WRR scheduling Eric Dumazet
2023-10-01 22:15 ` Soheil Hassas Yeganeh
@ 2023-10-02 11:46 ` Toke Høiland-Jørgensen
2023-10-02 12:23 ` Eric Dumazet
1 sibling, 1 reply; 12+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-10-02 11:46 UTC (permalink / raw)
To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Willem de Bruijn, Soheil Hassas Yeganeh, Neal Cardwell,
Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev, eric.dumazet,
Eric Dumazet
Eric Dumazet <edumazet@google.com> writes:
> Before Google adopted FQ for its production servers,
> we had to ensure AF4 packets would get a higher share
> than BE1 ones.
>
> As discussed this week in Netconf 2023 in Paris, it is time
> to upstream this for public use.
IIRC, when you mentioned this at Netconf you said the new behaviour
would probably need to be behind a flag, but I don't see that in this
series. What was the reason you decided to drop that?
[..]
> +static int fq_load_priomap(struct fq_sched_data *q,
> + const struct nlattr *attr,
> + struct netlink_ext_ack *extack)
> +{
> + const struct tc_prio_qopt *map = nla_data(attr);
> + int i;
> +
> + if (map->bands != FQ_BANDS) {
> + NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
> + return -EINVAL;
> + }
> + for (i = 0; i < TC_PRIO_MAX + 1; i++) {
> + if (map->priomap[i] >= FQ_BANDS) {
> + NL_SET_ERR_MSG_MOD(extack, "Incorrect field in FQ priomap");
Can we be a bit more specific than just "incorrect" here? Something like
"FQ priomap field %d maps to a too high band %d"?
-Toke
* Re: [PATCH net-next 4/4] net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute
2023-10-01 14:51 ` [PATCH net-next 4/4] net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute Eric Dumazet
@ 2023-10-02 11:47 ` Toke Høiland-Jørgensen
2023-10-02 12:24 ` Eric Dumazet
0 siblings, 1 reply; 12+ messages in thread
From: Toke Høiland-Jørgensen @ 2023-10-02 11:47 UTC (permalink / raw)
To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
Cc: Willem de Bruijn, Soheil Hassas Yeganeh, Neal Cardwell,
Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev, eric.dumazet,
Eric Dumazet
Eric Dumazet <edumazet@google.com> writes:
> This attribute can be used to tune the per band weight
> and report them in "tc qdisc show" output:
>
> qdisc fq 802f: parent 1:9 limit 100000p flow_limit 500p buckets 1024 orphan_mask 1023
> quantum 8364b initial_quantum 41820b low_rate_threshold 550Kbit
> refill_delay 40ms timer_slack 10us horizon 10s horizon_drop
> bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 weights 589824 196608 65536
> Sent 236460814 bytes 792991 pkt (dropped 0, overlimits 0 requeues 0)
> rate 25816bit 10pps backlog 0b 0p requeues 0
> flows 4 (inactive 4 throttled 0)
> gc 0 throttled 19 latency 17.6us fastpath 773882
>
> Signed-off-by: Eric Dumazet <edumazet@google.com>
> ---
> include/uapi/linux/pkt_sched.h | 3 +++
> net/sched/sch_fq.c | 32 ++++++++++++++++++++++++++++++++
> 2 files changed, 35 insertions(+)
>
> diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> index ec5ab44d41a2493130670870dc9e68c71187740f..f762a10bfb78ed896d8a5b936045a956d97b3831 100644
> --- a/include/uapi/linux/pkt_sched.h
> +++ b/include/uapi/linux/pkt_sched.h
> @@ -943,12 +943,15 @@ enum {
>
> TCA_FQ_PRIOMAP, /* prio2band */
>
> + TCA_FQ_WEIGHTS, /* Weights for each band */
> +
> __TCA_FQ_MAX
> };
>
> #define TCA_FQ_MAX (__TCA_FQ_MAX - 1)
>
> #define FQ_BANDS 3
> +#define FQ_MIN_WEIGHT 16384
>
> struct tc_fq_qd_stats {
> __u64 gc_flows;
> diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
> index 1bae145750a66f769bd30f1db09203f725801249..1a411fe36c79a86635f319c230a045d653571700 100644
> --- a/net/sched/sch_fq.c
> +++ b/net/sched/sch_fq.c
> @@ -919,6 +919,10 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
> .type = NLA_BINARY,
> .len = sizeof(struct tc_prio_qopt),
> },
> + [TCA_FQ_WEIGHTS] = {
> + .type = NLA_BINARY,
> + .len = FQ_BANDS * sizeof(s32),
> + },
> };
>
> /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
> @@ -941,6 +945,24 @@ static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
> out[i] = fq_prio2band(in, i);
> }
>
> +static int fq_load_weights(struct fq_sched_data *q,
> + const struct nlattr *attr,
> + struct netlink_ext_ack *extack)
> +{
> + s32 *weights = nla_data(attr);
> + int i;
> +
> + for (i = 0; i < FQ_BANDS; i++) {
> + if (weights[i] < FQ_MIN_WEIGHT) {
> + NL_SET_ERR_MSG_MOD(extack, "Incorrect weight");
As in the previous patch, can we be a bit more specific here? "Weight %d
less than minimum allowed %d"?
-Toke
* Re: [PATCH net-next 3/4] net_sched: sch_fq: add 3 bands and WRR scheduling
2023-10-02 11:46 ` Toke Høiland-Jørgensen
@ 2023-10-02 12:23 ` Eric Dumazet
0 siblings, 0 replies; 12+ messages in thread
From: Eric Dumazet @ 2023-10-02 12:23 UTC (permalink / raw)
To: Toke Høiland-Jørgensen
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Willem de Bruijn,
Soheil Hassas Yeganeh, Neal Cardwell, Jamal Hadi Salim, Cong Wang,
Jiri Pirko, netdev, eric.dumazet
On Mon, Oct 2, 2023 at 1:46 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Eric Dumazet <edumazet@google.com> writes:
>
> > Before Google adopted FQ for its production servers,
> > we had to ensure AF4 packets would get a higher share
> > than BE1 ones.
> >
> > As discussed this week in Netconf 2023 in Paris, it is time
> > to upstream this for public use.
>
> IIRC, when you mentioned this at Netconf you said the new behaviour
> would probably need to be behind a flag, but I don't see that in this
> series. What was the reason you decided to drop that?
Not a flag; this would add runtime costs.
"struct fq_sched_data" is very big and I try not to add fields unless
really necessary.
I mentioned at Netconf that we had been using this WRR mode for ~5 years,
without a flag.
>
> [..]
> > +static int fq_load_priomap(struct fq_sched_data *q,
> > + const struct nlattr *attr,
> > + struct netlink_ext_ack *extack)
> > +{
> > + const struct tc_prio_qopt *map = nla_data(attr);
> > + int i;
> > +
> > + if (map->bands != FQ_BANDS) {
> > + NL_SET_ERR_MSG_MOD(extack, "FQ only supports 3 bands");
> > + return -EINVAL;
> > + }
> > + for (i = 0; i < TC_PRIO_MAX + 1; i++) {
> > + if (map->priomap[i] >= FQ_BANDS) {
> > + NL_SET_ERR_MSG_MOD(extack, "Incorrect field in FQ priomap");
>
> Can we be a bit more specific than just "incorrect" here? Something like
> "FQ priomap field %d maps to a too high band %d"?
Maybe, but note sch_prio does not even set extack for this case.
This is mostly something that only fuzzers like syzbot could possibly
hit; iproute2 will not feed the kernel with such invalid values.
* Re: [PATCH net-next 4/4] net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute
2023-10-02 11:47 ` Toke Høiland-Jørgensen
@ 2023-10-02 12:24 ` Eric Dumazet
0 siblings, 0 replies; 12+ messages in thread
From: Eric Dumazet @ 2023-10-02 12:24 UTC (permalink / raw)
To: Toke Høiland-Jørgensen
Cc: David S . Miller, Jakub Kicinski, Paolo Abeni, Willem de Bruijn,
Soheil Hassas Yeganeh, Neal Cardwell, Jamal Hadi Salim, Cong Wang,
Jiri Pirko, netdev, eric.dumazet
On Mon, Oct 2, 2023 at 1:47 PM Toke Høiland-Jørgensen <toke@redhat.com> wrote:
>
> Eric Dumazet <edumazet@google.com> writes:
>
> > This attribute can be used to tune the per band weight
> > and report them in "tc qdisc show" output:
> >
> > qdisc fq 802f: parent 1:9 limit 100000p flow_limit 500p buckets 1024 orphan_mask 1023
> > quantum 8364b initial_quantum 41820b low_rate_threshold 550Kbit
> > refill_delay 40ms timer_slack 10us horizon 10s horizon_drop
> > bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1 weights 589824 196608 65536
> > Sent 236460814 bytes 792991 pkt (dropped 0, overlimits 0 requeues 0)
> > rate 25816bit 10pps backlog 0b 0p requeues 0
> > flows 4 (inactive 4 throttled 0)
> > gc 0 throttled 19 latency 17.6us fastpath 773882
> >
> > Signed-off-by: Eric Dumazet <edumazet@google.com>
> > ---
> > include/uapi/linux/pkt_sched.h | 3 +++
> > net/sched/sch_fq.c | 32 ++++++++++++++++++++++++++++++++
> > 2 files changed, 35 insertions(+)
> >
> > diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
> > index ec5ab44d41a2493130670870dc9e68c71187740f..f762a10bfb78ed896d8a5b936045a956d97b3831 100644
> > --- a/include/uapi/linux/pkt_sched.h
> > +++ b/include/uapi/linux/pkt_sched.h
> > @@ -943,12 +943,15 @@ enum {
> >
> > TCA_FQ_PRIOMAP, /* prio2band */
> >
> > + TCA_FQ_WEIGHTS, /* Weights for each band */
> > +
> > __TCA_FQ_MAX
> > };
> >
> > #define TCA_FQ_MAX (__TCA_FQ_MAX - 1)
> >
> > #define FQ_BANDS 3
> > +#define FQ_MIN_WEIGHT 16384
> >
> > struct tc_fq_qd_stats {
> > __u64 gc_flows;
> > diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
> > index 1bae145750a66f769bd30f1db09203f725801249..1a411fe36c79a86635f319c230a045d653571700 100644
> > --- a/net/sched/sch_fq.c
> > +++ b/net/sched/sch_fq.c
> > @@ -919,6 +919,10 @@ static const struct nla_policy fq_policy[TCA_FQ_MAX + 1] = {
> > .type = NLA_BINARY,
> > .len = sizeof(struct tc_prio_qopt),
> > },
> > + [TCA_FQ_WEIGHTS] = {
> > + .type = NLA_BINARY,
> > + .len = FQ_BANDS * sizeof(s32),
> > + },
> > };
> >
> > /* compress a u8 array with all elems <= 3 to an array of 2-bit fields */
> > @@ -941,6 +945,24 @@ static void fq_prio2band_decompress_crumb(const u8 *in, u8 *out)
> > out[i] = fq_prio2band(in, i);
> > }
> >
> > +static int fq_load_weights(struct fq_sched_data *q,
> > + const struct nlattr *attr,
> > + struct netlink_ext_ack *extack)
> > +{
> > + s32 *weights = nla_data(attr);
> > + int i;
> > +
> > + for (i = 0; i < FQ_BANDS; i++) {
> > + if (weights[i] < FQ_MIN_WEIGHT) {
> > + NL_SET_ERR_MSG_MOD(extack, "Incorrect weight");
>
> As in the previous patch, can we be a bit more specific here? "Weight %d
> less than minimum allowed %d"?
I guess I can do this. Again this is to prevent syzbot from doing bad things.
Thread overview: 12+ messages
2023-10-01 14:50 [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands Eric Dumazet
2023-10-01 14:50 ` [PATCH net-next 1/4] net_sched: sch_fq: remove q->ktime_cache Eric Dumazet
2023-10-01 14:51 ` [PATCH net-next 2/4] net_sched: export pfifo_fast prio2band[] Eric Dumazet
2023-10-01 14:51 ` [PATCH net-next 3/4] net_sched: sch_fq: add 3 bands and WRR scheduling Eric Dumazet
2023-10-01 22:15 ` Soheil Hassas Yeganeh
2023-10-02 11:46 ` Toke Høiland-Jørgensen
2023-10-02 12:23 ` Eric Dumazet
2023-10-01 14:51 ` [PATCH net-next 4/4] net_sched: sch_fq: add TCA_FQ_WEIGHTS attribute Eric Dumazet
2023-10-02 11:47 ` Toke Høiland-Jørgensen
2023-10-02 12:24 ` Eric Dumazet
2023-10-01 15:10 ` [PATCH net-next 0/4] net_sched: sch_fq: add WRR scheduling and 3 bands Dave Taht
2023-10-02 6:48 ` Willem de Bruijn