netdev.vger.kernel.org archive mirror
* [PATCH net-next 0/4] net_sched: sch_fq: round of improvements
@ 2023-09-20 12:54 Eric Dumazet
  2023-09-20 12:54 ` [PATCH net-next 1/4] net_sched: sch_fq: struct sched_data reorg Eric Dumazet
                   ` (3 more replies)
  0 siblings, 4 replies; 7+ messages in thread
From: Eric Dumazet @ 2023-09-20 12:54 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Willem de Bruijn, Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev,
	eric.dumazet, Eric Dumazet

For FQ's tenth anniversary, it was time to make it faster.

The FQ part (as in Fair Queue) is rather expensive, because
we have to classify packets, store them in a per-flow structure,
and add this per-flow structure to a hash table.

Most fq qdiscs are almost idle. Trying to share NIC bandwidth then
has no benefit, so the qdisc could behave like a FIFO.
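
A rough sketch of the idea (illustration only: the helper names below are
hypothetical, and the real checks are the ones added in patch 3/4):

/* Illustration only: when the qdisc is effectively idle, enqueue could
 * target the single internal FIFO and skip hashing/classification.
 */
static struct fq_flow *fq_pick_flow(struct Qdisc *sch, struct sk_buff *skb)
{
	struct fq_sched_data *q = qdisc_priv(sch);

	if (fq_qdisc_is_mostly_idle(sch, skb))	/* hypothetical check */
		return &q->internal;		/* FIFO-like fast path */

	return fq_classify(sch, skb);		/* regular per-flow classification */
}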

Eric Dumazet (4):
  net_sched: sch_fq: struct sched_data reorg
  net_sched: sch_fq: change how @inactive is tracked
  net_sched: sch_fq: add fast path for mostly idle qdisc
  net_sched: sch_fq: always garbage collect

 include/uapi/linux/pkt_sched.h |   1 +
 net/sched/sch_fq.c             | 127 ++++++++++++++++++++++++---------
 2 files changed, 96 insertions(+), 32 deletions(-)

-- 
2.42.0.459.ge4e396fd5e-goog


* [PATCH net-next 1/4] net_sched: sch_fq: struct sched_data reorg
  2023-09-20 12:54 [PATCH net-next 0/4] net_sched: sch_fq: round of improvements Eric Dumazet
@ 2023-09-20 12:54 ` Eric Dumazet
  2023-09-20 12:54 ` [PATCH net-next 2/4] net_sched: sch_fq: change how @inactive is tracked Eric Dumazet
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2023-09-20 12:54 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Willem de Bruijn, Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev,
	eric.dumazet, Eric Dumazet

q->flows is often modified, while q->timer_slack is read-mostly.

Exchange the two fields, so that the cache line containing
quantum, initial_quantum, and other critical parameters
stays clean (read-mostly).

Move q->watchdog next to q->stat_throttled.

Add comments explaining how the structure is split into
three different parts.

pahole output before the patch:

struct fq_sched_data {
	struct fq_flow_head        new_flows;            /*     0  0x10 */
	struct fq_flow_head        old_flows;            /*  0x10  0x10 */
	struct rb_root             delayed;              /*  0x20   0x8 */
	u64                        time_next_delayed_flow; /*  0x28   0x8 */
	u64                        ktime_cache;          /*  0x30   0x8 */
	unsigned long              unthrottle_latency_ns; /*  0x38   0x8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct fq_flow             internal __attribute__((__aligned__(64))); /*  0x40  0x80 */

	/* XXX last struct has 16 bytes of padding */

	/* --- cacheline 3 boundary (192 bytes) --- */
	u32                        quantum;              /*  0xc0   0x4 */
	u32                        initial_quantum;      /*  0xc4   0x4 */
	u32                        flow_refill_delay;    /*  0xc8   0x4 */
	u32                        flow_plimit;          /*  0xcc   0x4 */
	unsigned long              flow_max_rate;        /*  0xd0   0x8 */
	u64                        ce_threshold;         /*  0xd8   0x8 */
	u64                        horizon;              /*  0xe0   0x8 */
	u32                        orphan_mask;          /*  0xe8   0x4 */
	u32                        low_rate_threshold;   /*  0xec   0x4 */
	struct rb_root *           fq_root;              /*  0xf0   0x8 */
	u8                         rate_enable;          /*  0xf8   0x1 */
	u8                         fq_trees_log;         /*  0xf9   0x1 */
	u8                         horizon_drop;         /*  0xfa   0x1 */

	/* XXX 1 byte hole, try to pack */

<bad>	u32                        flows;                /*  0xfc   0x4 */
	/* --- cacheline 4 boundary (256 bytes) --- */
	u32                        inactive_flows;       /* 0x100   0x4 */
	u32                        throttled_flows;      /* 0x104   0x4 */
	u64                        stat_gc_flows;        /* 0x108   0x8 */
	u64                        stat_internal_packets; /* 0x110   0x8 */
	u64                        stat_throttled;       /* 0x118   0x8 */
	u64                        stat_ce_mark;         /* 0x120   0x8 */
	u64                        stat_horizon_drops;   /* 0x128   0x8 */
	u64                        stat_horizon_caps;    /* 0x130   0x8 */
	u64                        stat_flows_plimit;    /* 0x138   0x8 */
	/* --- cacheline 5 boundary (320 bytes) --- */
	u64                        stat_pkts_too_long;   /* 0x140   0x8 */
	u64                        stat_allocation_errors; /* 0x148   0x8 */
<bad>	u32                        timer_slack;          /* 0x150   0x4 */

	/* XXX 4 bytes hole, try to pack */

	struct qdisc_watchdog      watchdog;             /* 0x158  0x48 */

	/* size: 448, cachelines: 7, members: 34 */
	/* sum members: 411, holes: 2, sum holes: 5 */
	/* padding: 32 */
	/* paddings: 1, sum paddings: 16 */
	/* forced alignments: 1 */
};

pahole output after the patch:

struct fq_sched_data {
	struct fq_flow_head        new_flows;            /*     0  0x10 */
	struct fq_flow_head        old_flows;            /*  0x10  0x10 */
	struct rb_root             delayed;              /*  0x20   0x8 */
	u64                        time_next_delayed_flow; /*  0x28   0x8 */
	u64                        ktime_cache;          /*  0x30   0x8 */
	unsigned long              unthrottle_latency_ns; /*  0x38   0x8 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct fq_flow             internal __attribute__((__aligned__(64))); /*  0x40  0x80 */

	/* XXX last struct has 16 bytes of padding */

	/* --- cacheline 3 boundary (192 bytes) --- */
	u32                        quantum;              /*  0xc0   0x4 */
	u32                        initial_quantum;      /*  0xc4   0x4 */
	u32                        flow_refill_delay;    /*  0xc8   0x4 */
	u32                        flow_plimit;          /*  0xcc   0x4 */
	unsigned long              flow_max_rate;        /*  0xd0   0x8 */
	u64                        ce_threshold;         /*  0xd8   0x8 */
	u64                        horizon;              /*  0xe0   0x8 */
	u32                        orphan_mask;          /*  0xe8   0x4 */
	u32                        low_rate_threshold;   /*  0xec   0x4 */
	struct rb_root *           fq_root;              /*  0xf0   0x8 */
	u8                         rate_enable;          /*  0xf8   0x1 */
	u8                         fq_trees_log;         /*  0xf9   0x1 */
	u8                         horizon_drop;         /*  0xfa   0x1 */

	/* XXX 1 byte hole, try to pack */

<good>	u32                        timer_slack;          /*  0xfc   0x4 */
	/* --- cacheline 4 boundary (256 bytes) --- */
<good>	u32                        flows;                /* 0x100   0x4 */
	u32                        inactive_flows;       /* 0x104   0x4 */
	u32                        throttled_flows;      /* 0x108   0x4 */

	/* XXX 4 bytes hole, try to pack */

	u64                        stat_throttled;       /* 0x110   0x8 */
<better> struct qdisc_watchdog     watchdog;             /* 0x118  0x48 */
	/* --- cacheline 5 boundary (320 bytes) was 32 bytes ago --- */
	u64                        stat_gc_flows;        /* 0x160   0x8 */
	u64                        stat_internal_packets; /* 0x168   0x8 */
	u64                        stat_ce_mark;         /* 0x170   0x8 */
	u64                        stat_horizon_drops;   /* 0x178   0x8 */
	/* --- cacheline 6 boundary (384 bytes) --- */
	u64                        stat_horizon_caps;    /* 0x180   0x8 */
	u64                        stat_flows_plimit;    /* 0x188   0x8 */
	u64                        stat_pkts_too_long;   /* 0x190   0x8 */
	u64                        stat_allocation_errors; /* 0x198   0x8 */

	/* Force padding: */
	u64                        :64;
	u64                        :64;
	u64                        :64;
	u64                        :64;

	/* size: 448, cachelines: 7, members: 34 */
	/* sum members: 411, holes: 2, sum holes: 5 */
	/* padding: 32 */
	/* paddings: 1, sum paddings: 16 */
	/* forced alignments: 1 */
};
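
If desired, the intended split can also be sanity-checked at build time.
A hedged sketch (not part of this patch; assumes the usual kernel headers
are in scope), based on the offsets in the dump above:

/* Illustration only: the read-mostly parameters must not share a cache
 * line with the hot read/write counters.
 */
static_assert(offsetof(struct fq_sched_data, quantum) / SMP_CACHE_BYTES !=
	      offsetof(struct fq_sched_data, flows) / SMP_CACHE_BYTES,
	      "read-mostly fq parameters share a cache line with hot counters");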

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/sched/sch_fq.c | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index f59a2cb2c803d79bd1f0eb1806464a0220824f9e..230300aac3ed1c86c8a52f405a03f67b60848a05 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -104,6 +104,9 @@ struct fq_sched_data {
 	unsigned long	unthrottle_latency_ns;
 
 	struct fq_flow	internal;	/* for non classified or high prio packets */
+
+/* Read mostly cache line */
+
 	u32		quantum;
 	u32		initial_quantum;
 	u32		flow_refill_delay;
@@ -117,22 +120,27 @@ struct fq_sched_data {
 	u8		rate_enable;
 	u8		fq_trees_log;
 	u8		horizon_drop;
+	u32		timer_slack; /* hrtimer slack in ns */
+
+/* Read/Write fields. */
+
 	u32		flows;
 	u32		inactive_flows;
 	u32		throttled_flows;
 
+	u64		stat_throttled;
+	struct qdisc_watchdog watchdog;
 	u64		stat_gc_flows;
+
+/* Seldom used fields. */
+
 	u64		stat_internal_packets;
-	u64		stat_throttled;
 	u64		stat_ce_mark;
 	u64		stat_horizon_drops;
 	u64		stat_horizon_caps;
 	u64		stat_flows_plimit;
 	u64		stat_pkts_too_long;
 	u64		stat_allocation_errors;
-
-	u32		timer_slack; /* hrtimer slack in ns */
-	struct qdisc_watchdog watchdog;
 };
 
 /*
-- 
2.42.0.459.ge4e396fd5e-goog


* [PATCH net-next 2/4] net_sched: sch_fq: change how @inactive is tracked
  2023-09-20 12:54 [PATCH net-next 0/4] net_sched: sch_fq: round of improvements Eric Dumazet
  2023-09-20 12:54 ` [PATCH net-next 1/4] net_sched: sch_fq: struct sched_data reorg Eric Dumazet
@ 2023-09-20 12:54 ` Eric Dumazet
  2023-09-20 12:54 ` [PATCH net-next 3/4] net_sched: sch_fq: add fast path for mostly idle qdisc Eric Dumazet
  2023-09-20 12:54 ` [PATCH net-next 4/4] net_sched: sch_fq: always garbage collect Eric Dumazet
  3 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2023-09-20 12:54 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Willem de Bruijn, Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev,
	eric.dumazet, Eric Dumazet

Currently, when an fq qdisc has no more packets to send, it can still
have some flows stored in its RR lists (q->new_flows & q->old_flows).

This was a design choice, but it is a bit disturbing that
the inactive_flows counter does not include the count of empty flows
sitting in the RR lists.

As the next patch needs to know precisely whether there are active flows,
this change makes inactive_flows exact.

Before the patch, the following command on an empty qdisc could have returned:

lpaa17:~# tc -s -d qd sh dev eth1 | grep inactive
  flows 1322 (inactive 1316 throttled 0)
  flows 1330 (inactive 1325 throttled 0)
  flows 1193 (inactive 1190 throttled 0)
  flows 1208 (inactive 1202 throttled 0)

After the patch, we now have:

lpaa17:~# tc -s -d qd sh dev eth1 | grep inactive
  flows 1322 (inactive 1322 throttled 0)
  flows 1330 (inactive 1330 throttled 0)
  flows 1193 (inactive 1193 throttled 0)
  flows 1208 (inactive 1208 throttled 0)
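
With inactive_flows now exact, "no flow currently has a packet to send"
becomes an O(1) test. A hedged sketch of the check the next patch builds on
(the helper name is illustrative; the real test lives in fq_fastpath_check(),
added in patch 3/4):

/* Illustration only: every flow is either empty (inactive) or waiting on
 * its pacing timer (throttled), so nothing is eligible to transmit now.
 */
static bool fq_no_flow_is_eligible(const struct fq_sched_data *q)
{
	return q->flows == q->inactive_flows + q->throttled_flows;
}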

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/sched/sch_fq.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 230300aac3ed1c86c8a52f405a03f67b60848a05..4af43a401dbb4111d5cfaddb4b83fc5c7b63b83d 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -125,7 +125,7 @@ struct fq_sched_data {
 /* Read/Write fields. */
 
 	u32		flows;
-	u32		inactive_flows;
+	u32		inactive_flows; /* Flows with no packet to send. */
 	u32		throttled_flows;
 
 	u64		stat_throttled;
@@ -402,9 +402,12 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
 static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
 			   struct sk_buff *skb)
 {
+	struct fq_sched_data *q = qdisc_priv(sch);
+
 	fq_erase_head(sch, flow, skb);
 	skb_mark_not_on_list(skb);
-	flow->qlen--;
+	if (--flow->qlen == 0)
+		q->inactive_flows++;
 	qdisc_qstats_backlog_dec(sch, skb);
 	sch->q.qlen--;
 }
@@ -484,13 +487,13 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		return qdisc_drop(skb, sch, to_free);
 	}
 
-	f->qlen++;
+	if (f->qlen++ == 0)
+		q->inactive_flows--;
 	qdisc_qstats_backlog_inc(sch, skb);
 	if (fq_flow_is_detached(f)) {
 		fq_flow_add_tail(&q->new_flows, f);
 		if (time_after(jiffies, f->age + q->flow_refill_delay))
 			f->credit = max_t(u32, f->credit, q->quantum);
-		q->inactive_flows--;
 	}
 
 	/* Note: this overwrites f->age */
@@ -597,7 +600,6 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 			fq_flow_add_tail(&q->old_flows, f);
 		} else {
 			fq_flow_set_detached(f);
-			q->inactive_flows++;
 		}
 		goto begin;
 	}
-- 
2.42.0.459.ge4e396fd5e-goog


* [PATCH net-next 3/4] net_sched: sch_fq: add fast path for mostly idle qdisc
  2023-09-20 12:54 [PATCH net-next 0/4] net_sched: sch_fq: round of improvements Eric Dumazet
  2023-09-20 12:54 ` [PATCH net-next 1/4] net_sched: sch_fq: struct sched_data reorg Eric Dumazet
  2023-09-20 12:54 ` [PATCH net-next 2/4] net_sched: sch_fq: change how @inactive is tracked Eric Dumazet
@ 2023-09-20 12:54 ` Eric Dumazet
  2023-09-20 14:09   ` kernel test robot
  2023-09-25  2:25   ` kernel test robot
  2023-09-20 12:54 ` [PATCH net-next 4/4] net_sched: sch_fq: always garbage collect Eric Dumazet
  3 siblings, 2 replies; 7+ messages in thread
From: Eric Dumazet @ 2023-09-20 12:54 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Willem de Bruijn, Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev,
	eric.dumazet, Eric Dumazet

TCQ_F_CAN_BYPASS can be used by a few qdiscs.

The idea is that if we queue a packet to an empty qdisc,
the following dequeue() will pick it up immediately.

FQ cannot use the generic TCQ_F_CAN_BYPASS code,
because some additional checks need to be performed.

This patch adds a similar fast path to FQ.

Most of the time, the qdisc is not throttled,
and many packets can avoid touching at least four cache lines
and consuming 128 bytes of memory to store the state of a flow.

After this patch, netperf can send UDP packets about 13% faster,
and pktgen goes 30% faster (when FQ is in the way) on a fast NIC.

TCP traffic is also improved, thanks to a reduction of cache line misses.

tc -s -d qd sh dev eth1
...
qdisc fq 8004: parent 1:2 limit 10000p flow_limit 100p buckets 1024
   orphan_mask 1023 quantum 3028b initial_quantum 15140b low_rate_threshold 550Kbit
   refill_delay 40ms timer_slack 10us horizon 10s horizon_drop
 Sent 5646784384 bytes 1985161 pkt (dropped 0, overlimits 0 requeues 0)
 backlog 0b 0p requeues 0
  flows 122 (inactive 122 throttled 0)
  gc 0 highprio 0 fastpath 659990 throttled 27762 latency 8.57us
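
For completeness, a user-space sketch (not part of the patch) of one case
that intentionally stays off the fast path: per the checks in
fq_fastpath_check() below, a non-TCP socket with SO_MAX_PACING_RATE set
keeps using the regular per-flow path, so its pacing rate is still honored.

#include <sys/socket.h>

#ifndef SO_MAX_PACING_RATE
#define SO_MAX_PACING_RATE 47		/* asm-generic value */
#endif

/* Hypothetical example: a UDP socket capped at ~10 MB/s. Its packets fail
 * the fast path checks on purpose, so fq keeps pacing them per flow.
 */
int open_paced_udp_socket(void)
{
	unsigned int rate = 10 * 1000 * 1000;	/* bytes per second */
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd >= 0)
		setsockopt(fd, SOL_SOCKET, SO_MAX_PACING_RATE,
			   &rate, sizeof(rate));
	return fd;
}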

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/uapi/linux/pkt_sched.h |   1 +
 net/sched/sch_fq.c             | 107 +++++++++++++++++++++++++--------
 2 files changed, 82 insertions(+), 26 deletions(-)

diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h
index 3f85ae5780563cdfb42fdb3a107ca2489d0830a4..579f641846b87da05e5d4b09c1072c90220ca601 100644
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -962,6 +962,7 @@ struct tc_fq_qd_stats {
 	__u64	ce_mark;		/* packets above ce_threshold */
 	__u64	horizon_drops;
 	__u64	horizon_caps;
+	__u64	fastpath_packets;
 };
 
 /* Heavy-Hitter Filter */
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 4af43a401dbb4111d5cfaddb4b83fc5c7b63b83d..ea510003bc385148397a690c5afd9387fba9796c 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -2,7 +2,7 @@
 /*
  * net/sched/sch_fq.c Fair Queue Packet Scheduler (per flow pacing)
  *
- *  Copyright (C) 2013-2015 Eric Dumazet <edumazet@google.com>
+ *  Copyright (C) 2013-2023 Eric Dumazet <edumazet@google.com>
  *
  *  Meant to be mostly used for locally generated traffic :
  *  Fast classification depends on skb->sk being set before reaching us.
@@ -73,7 +73,13 @@ struct fq_flow {
 		struct sk_buff *tail;	/* last skb in the list */
 		unsigned long  age;	/* (jiffies | 1UL) when flow was emptied, for gc */
 	};
-	struct rb_node	fq_node;	/* anchor in fq_root[] trees */
+	union {
+		struct rb_node	fq_node;	/* anchor in fq_root[] trees */
+		/* Following field is only used for q->internal,
+		 * because q->internal is not hashed in fq_root[]
+		 */
+		u64		stat_fastpath_packets;
+	};
 	struct sock	*sk;
 	u32		socket_hash;	/* sk_hash */
 	int		qlen;		/* number of packets in flow queue */
@@ -266,8 +272,56 @@ static void fq_gc(struct fq_sched_data *q,
 	kmem_cache_free_bulk(fq_flow_cachep, fcnt, tofree);
 }
 
-static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
+/* Fast path can be used if :
+ * 1) Packet tstamp is in the past.
+ * 2) FQ qlen == 0   OR
+ *   (no flow is currently eligible for transmit,
+ *    AND fast path queue has less than 8 packets)
+ * 3) No SO_MAX_PACING_RATE on the socket (if any).
+ * 4) No @maxrate attribute on this qdisc,
+ *
+ * FQ can not use generic TCQ_F_CAN_BYPASS infrastructure.
+ */
+static bool fq_fastpath_check(struct Qdisc *sch, struct sk_buff *skb)
 {
+	struct fq_sched_data *q = qdisc_priv(sch);
+	const struct sock *sk;
+
+	if (fq_skb_cb(skb)->time_to_send > q->ktime_cache)
+		return false;
+
+	if (sch->q.qlen != 0) {
+		/* Even if some packets are stored in this qdisc,
+		 * we can still enable fast path if all of them are
+		 * scheduled in the future (i.e. no flows are eligible)
+		 * or in the fast path queue.
+		 */
+		if (q->flows != q->inactive_flows + q->throttled_flows)
+			return false;
+
+		/* Do not allow fast path queue to explode, we want Fair Queue mode
+		 * under pressure.
+		 */
+		if (q->internal.qlen >= 8)
+			return false;
+	}
+
+	sk = skb->sk;
+	if (sk && sk_fullsock(sk) && !sk_is_tcp(sk) &&
+	    sk->sk_max_pacing_rate != ~0UL)
+		return false;
+
+	if (q->flow_max_rate != ~0UL)
+		return false;
+
+	q->internal.stat_fastpath_packets++;
+
+	return true;
+}
+
+static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
+{
+	struct fq_sched_data *q = qdisc_priv(sch);
 	struct rb_node **p, *parent;
 	struct sock *sk = skb->sk;
 	struct rb_root *root;
@@ -307,6 +361,9 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 		sk = (struct sock *)((hash << 1) | 1UL);
 	}
 
+	if (fq_fastpath_check(sch, skb))
+		return &q->internal;
+
 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
 
 	if (q->flows >= (2U << q->fq_trees_log) &&
@@ -402,12 +459,8 @@ static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
 static void fq_dequeue_skb(struct Qdisc *sch, struct fq_flow *flow,
 			   struct sk_buff *skb)
 {
-	struct fq_sched_data *q = qdisc_priv(sch);
-
 	fq_erase_head(sch, flow, skb);
 	skb_mark_not_on_list(skb);
-	if (--flow->qlen == 0)
-		q->inactive_flows++;
 	qdisc_qstats_backlog_dec(sch, skb);
 	sch->q.qlen--;
 }
@@ -450,6 +503,7 @@ static bool fq_packet_beyond_horizon(const struct sk_buff *skb,
 	return unlikely((s64)skb->tstamp > (s64)(q->ktime_cache + q->horizon));
 }
 
+
 static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 		      struct sk_buff **to_free)
 {
@@ -459,49 +513,46 @@ static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 	if (unlikely(sch->q.qlen >= sch->limit))
 		return qdisc_drop(skb, sch, to_free);
 
+	q->ktime_cache = ktime_get_ns();
 	if (!skb->tstamp) {
-		fq_skb_cb(skb)->time_to_send = q->ktime_cache = ktime_get_ns();
+		fq_skb_cb(skb)->time_to_send = q->ktime_cache;
 	} else {
-		/* Check if packet timestamp is too far in the future.
-		 * Try first if our cached value, to avoid ktime_get_ns()
-		 * cost in most cases.
-		 */
+		/* Check if packet timestamp is too far in the future. */
 		if (fq_packet_beyond_horizon(skb, q)) {
-			/* Refresh our cache and check another time */
-			q->ktime_cache = ktime_get_ns();
-			if (fq_packet_beyond_horizon(skb, q)) {
-				if (q->horizon_drop) {
+			if (q->horizon_drop) {
 					q->stat_horizon_drops++;
 					return qdisc_drop(skb, sch, to_free);
-				}
-				q->stat_horizon_caps++;
-				skb->tstamp = q->ktime_cache + q->horizon;
 			}
+			q->stat_horizon_caps++;
+			skb->tstamp = q->ktime_cache + q->horizon;
 		}
 		fq_skb_cb(skb)->time_to_send = skb->tstamp;
 	}
 
-	f = fq_classify(skb, q);
+	f = fq_classify(sch, skb);
 	if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
 		q->stat_flows_plimit++;
 		return qdisc_drop(skb, sch, to_free);
 	}
 
-	if (f->qlen++ == 0)
-		q->inactive_flows--;
-	qdisc_qstats_backlog_inc(sch, skb);
 	if (fq_flow_is_detached(f)) {
 		fq_flow_add_tail(&q->new_flows, f);
 		if (time_after(jiffies, f->age + q->flow_refill_delay))
 			f->credit = max_t(u32, f->credit, q->quantum);
 	}
 
-	/* Note: this overwrites f->age */
-	flow_queue_add(f, skb);
-
 	if (unlikely(f == &q->internal)) {
 		q->stat_internal_packets++;
+	} else {
+		if (f->qlen == 0)
+			q->inactive_flows--;
 	}
+queue:
+	f->qlen++;
+	/* Note: this overwrites f->age */
+	flow_queue_add(f, skb);
+
+	qdisc_qstats_backlog_inc(sch, skb);
 	sch->q.qlen++;
 
 	return NET_XMIT_SUCCESS;
@@ -549,6 +600,7 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 
 	skb = fq_peek(&q->internal);
 	if (unlikely(skb)) {
+		q->internal.qlen--;
 		fq_dequeue_skb(sch, &q->internal, skb);
 		goto out;
 	}
@@ -592,6 +644,8 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
 			INET_ECN_set_ce(skb);
 			q->stat_ce_mark++;
 		}
+		if (--f->qlen == 0)
+			q->inactive_flows++;
 		fq_dequeue_skb(sch, f, skb);
 	} else {
 		head->first = f->next;
@@ -1024,6 +1078,7 @@ static int fq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
 
 	st.gc_flows		  = q->stat_gc_flows;
 	st.highprio_packets	  = q->stat_internal_packets;
+	st.fastpath_packets	  = q->internal.stat_fastpath_packets;
 	st.tcp_retrans		  = 0;
 	st.throttled		  = q->stat_throttled;
 	st.flows_plimit		  = q->stat_flows_plimit;
-- 
2.42.0.459.ge4e396fd5e-goog


* [PATCH net-next 4/4] net_sched: sch_fq: always garbage collect
  2023-09-20 12:54 [PATCH net-next 0/4] net_sched: sch_fq: round of improvements Eric Dumazet
                   ` (2 preceding siblings ...)
  2023-09-20 12:54 ` [PATCH net-next 3/4] net_sched: sch_fq: add fast path for mostly idle qdisc Eric Dumazet
@ 2023-09-20 12:54 ` Eric Dumazet
  3 siblings, 0 replies; 7+ messages in thread
From: Eric Dumazet @ 2023-09-20 12:54 UTC (permalink / raw)
  To: David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: Willem de Bruijn, Jamal Hadi Salim, Cong Wang, Jiri Pirko, netdev,
	eric.dumazet, Eric Dumazet

FQ performs garbage collection at enqueue time, and only
if the number of flows is above a given threshold, which
is only reached after the qdisc has been used for a while.

Since an RB-tree traversal is needed to locate a flow,
it makes sense to perform gc all the time, to keep
the rb-trees smaller.

This reduces average storage costs in FQ by 50%,
and avoids one cache line miss at enqueue time when the
fast path added in the prior patch cannot be used.

Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 net/sched/sch_fq.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index ea510003bc385148397a690c5afd9387fba9796c..0fe2e90af14c2ef822a1455f3c308f0caeb53fe3 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -366,9 +366,7 @@ static struct fq_flow *fq_classify(struct Qdisc *sch, struct sk_buff *skb)
 
 	root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
 
-	if (q->flows >= (2U << q->fq_trees_log) &&
-	    q->inactive_flows > q->flows/2)
-		fq_gc(q, root, sk);
+	fq_gc(q, root, sk);
 
 	p = &root->rb_node;
 	parent = NULL;
-- 
2.42.0.459.ge4e396fd5e-goog


* Re: [PATCH net-next 3/4] net_sched: sch_fq: add fast path for mostly idle qdisc
  2023-09-20 12:54 ` [PATCH net-next 3/4] net_sched: sch_fq: add fast path for mostly idle qdisc Eric Dumazet
@ 2023-09-20 14:09   ` kernel test robot
  2023-09-25  2:25   ` kernel test robot
  1 sibling, 0 replies; 7+ messages in thread
From: kernel test robot @ 2023-09-20 14:09 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: oe-kbuild-all, Willem de Bruijn, Jamal Hadi Salim, Cong Wang,
	Jiri Pirko, netdev, eric.dumazet, Eric Dumazet

Hi Eric,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Eric-Dumazet/net_sched-sch_fq-struct-sched_data-reorg/20230920-205744
base:   net-next/main
patch link:    https://lore.kernel.org/r/20230920125418.3675569-4-edumazet%40google.com
patch subject: [PATCH net-next 3/4] net_sched: sch_fq: add fast path for mostly idle qdisc
config: m68k-allyesconfig (https://download.01.org/0day-ci/archive/20230920/202309202157.1I3oSZoS-lkp@intel.com/config)
compiler: m68k-linux-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20230920/202309202157.1I3oSZoS-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202309202157.1I3oSZoS-lkp@intel.com/

All warnings (new ones prefixed by >>):

   net/sched/sch_fq.c: In function 'fq_enqueue':
>> net/sched/sch_fq.c:550:1: warning: label 'queue' defined but not used [-Wunused-label]
     550 | queue:
         | ^~~~~


vim +/queue +550 net/sched/sch_fq.c

   505	
   506	
   507	static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
   508			      struct sk_buff **to_free)
   509	{
   510		struct fq_sched_data *q = qdisc_priv(sch);
   511		struct fq_flow *f;
   512	
   513		if (unlikely(sch->q.qlen >= sch->limit))
   514			return qdisc_drop(skb, sch, to_free);
   515	
   516		q->ktime_cache = ktime_get_ns();
   517		if (!skb->tstamp) {
   518			fq_skb_cb(skb)->time_to_send = q->ktime_cache;
   519		} else {
   520			/* Check if packet timestamp is too far in the future. */
   521			if (fq_packet_beyond_horizon(skb, q)) {
   522				if (q->horizon_drop) {
   523						q->stat_horizon_drops++;
   524						return qdisc_drop(skb, sch, to_free);
   525				}
   526				q->stat_horizon_caps++;
   527				skb->tstamp = q->ktime_cache + q->horizon;
   528			}
   529			fq_skb_cb(skb)->time_to_send = skb->tstamp;
   530		}
   531	
   532		f = fq_classify(sch, skb);
   533		if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
   534			q->stat_flows_plimit++;
   535			return qdisc_drop(skb, sch, to_free);
   536		}
   537	
   538		if (fq_flow_is_detached(f)) {
   539			fq_flow_add_tail(&q->new_flows, f);
   540			if (time_after(jiffies, f->age + q->flow_refill_delay))
   541				f->credit = max_t(u32, f->credit, q->quantum);
   542		}
   543	
   544		if (unlikely(f == &q->internal)) {
   545			q->stat_internal_packets++;
   546		} else {
   547			if (f->qlen == 0)
   548				q->inactive_flows--;
   549		}
 > 550	queue:
   551		f->qlen++;
   552		/* Note: this overwrites f->age */
   553		flow_queue_add(f, skb);
   554	
   555		qdisc_qstats_backlog_inc(sch, skb);
   556		sch->q.qlen++;
   557	
   558		return NET_XMIT_SUCCESS;
   559	}
   560	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


* Re: [PATCH net-next 3/4] net_sched: sch_fq: add fast path for mostly idle qdisc
  2023-09-20 12:54 ` [PATCH net-next 3/4] net_sched: sch_fq: add fast path for mostly idle qdisc Eric Dumazet
  2023-09-20 14:09   ` kernel test robot
@ 2023-09-25  2:25   ` kernel test robot
  1 sibling, 0 replies; 7+ messages in thread
From: kernel test robot @ 2023-09-25  2:25 UTC (permalink / raw)
  To: Eric Dumazet, David S . Miller, Jakub Kicinski, Paolo Abeni
  Cc: llvm, oe-kbuild-all, Willem de Bruijn, Jamal Hadi Salim,
	Cong Wang, Jiri Pirko, netdev, eric.dumazet, Eric Dumazet

Hi Eric,

kernel test robot noticed the following build warnings:

[auto build test WARNING on net-next/main]

url:    https://github.com/intel-lab-lkp/linux/commits/Eric-Dumazet/net_sched-sch_fq-struct-sched_data-reorg/20230920-205744
base:   net-next/main
patch link:    https://lore.kernel.org/r/20230920125418.3675569-4-edumazet%40google.com
patch subject: [PATCH net-next 3/4] net_sched: sch_fq: add fast path for mostly idle qdisc
config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20230925/202309251006.HEmH6uZd-lkp@intel.com/config)
compiler: clang version 16.0.4 (https://github.com/llvm/llvm-project.git ae42196bc493ffe877a7e3dff8be32035dea4d07)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20230925/202309251006.HEmH6uZd-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202309251006.HEmH6uZd-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> net/sched/sch_fq.c:550:1: warning: unused label 'queue' [-Wunused-label]
   queue:
   ^~~~~~
   1 warning generated.


vim +/queue +550 net/sched/sch_fq.c

   505	
   506	
   507	static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
   508			      struct sk_buff **to_free)
   509	{
   510		struct fq_sched_data *q = qdisc_priv(sch);
   511		struct fq_flow *f;
   512	
   513		if (unlikely(sch->q.qlen >= sch->limit))
   514			return qdisc_drop(skb, sch, to_free);
   515	
   516		q->ktime_cache = ktime_get_ns();
   517		if (!skb->tstamp) {
   518			fq_skb_cb(skb)->time_to_send = q->ktime_cache;
   519		} else {
   520			/* Check if packet timestamp is too far in the future. */
   521			if (fq_packet_beyond_horizon(skb, q)) {
   522				if (q->horizon_drop) {
   523						q->stat_horizon_drops++;
   524						return qdisc_drop(skb, sch, to_free);
   525				}
   526				q->stat_horizon_caps++;
   527				skb->tstamp = q->ktime_cache + q->horizon;
   528			}
   529			fq_skb_cb(skb)->time_to_send = skb->tstamp;
   530		}
   531	
   532		f = fq_classify(sch, skb);
   533		if (unlikely(f->qlen >= q->flow_plimit && f != &q->internal)) {
   534			q->stat_flows_plimit++;
   535			return qdisc_drop(skb, sch, to_free);
   536		}
   537	
   538		if (fq_flow_is_detached(f)) {
   539			fq_flow_add_tail(&q->new_flows, f);
   540			if (time_after(jiffies, f->age + q->flow_refill_delay))
   541				f->credit = max_t(u32, f->credit, q->quantum);
   542		}
   543	
   544		if (unlikely(f == &q->internal)) {
   545			q->stat_internal_packets++;
   546		} else {
   547			if (f->qlen == 0)
   548				q->inactive_flows--;
   549		}
 > 550	queue:
   551		f->qlen++;
   552		/* Note: this overwrites f->age */
   553		flow_queue_add(f, skb);
   554	
   555		qdisc_qstats_backlog_inc(sch, skb);
   556		sch->q.qlen++;
   557	
   558		return NET_XMIT_SUCCESS;
   559	}
   560	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
