public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Cosmin Ratiu <cratiu@nvidia.com>
To: <netdev@vger.kernel.org>
Cc: David Ahern <dsahern@kernel.org>,
	"David S . Miller" <davem@davemloft.net>,
	Eric Dumazet <edumazet@google.com>,
	Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
	Simon Horman <horms@kernel.org>, Cosmin Ratiu <cratiu@nvidia.com>
Subject: [PATCH net-next] ipv4: Flush the FIB once per dev nexthop removal
Date: Mon, 4 May 2026 16:36:26 +0300	[thread overview]
Message-ID: <20260504133626.4155933-1-cratiu@nvidia.com> (raw)

When a device is going down, all nexthops on it are removed, and for
each nexthop being removed the FIB table is flushed, which does a full
trie traversal looking for entries marked RTNH_F_DEAD and removing them.
The performance of this is O(N x R), with N being number of dev nexthops
and R being number of IPv4 routes.

The RTNL is held the entire time.

When there are many nexthops to be removed and many routing entries,
this can result in the RTNL being held for multiple minutes, which
causes unhappiness in other processes trying to acquire the RTNL (e.g.
systemd-networkd for DHCP renewals).

In a complicated deployment with multiple vxlan devices, each having
16K nexthops and a total of 128K ipv4 routes, this is exactly what
happens:

nexthop_flush_dev()                # loops over 16K nexthops
  -> remove_nexthop()
    -> __remove_nexthop()
      -> __remove_nexthop_fib()    # marks fi->fib_flags |= RTNH_F_DEAD
        -> fib_flush()             # for EACH nexthop!
	  -> fib_table_flush()     # walks the ENTIRE FIB, 128K entries

Change that so that a nexthop_flush_dev() does a single fib_flush()
after all nexthops are removed. This is done with:
- __remove_nexthop_fib() no longer flushes the FIB, instead returns
  whether a flush is needed and is marked with __must_check.
- __remove_nexthop() and remove_nexthop() propagate this return value up
  with __must_check.
- A new wrapper is defined, remove_one_nexthop() which calls
  remove_nexthop() and flushes if necessary.
- The two direct callers of __remove_nexthop() get a WARN_ON, since the
  nh about to be removed should not have any FIB entries referencing it
  when replacing or inserting a new one.
- Callers which need to remove a single nexthop were migrated to
  remove_one_nexthop().
- Callers which need to remove multiple nexthops keep track in a local
  bool whether a flush is needed and call flush once at the end.
- This is plumbed through group removal as well, so when removing a leaf
  nh causes a parent group to lose its last member, the group's flush is
  also deferred, accumulated via remove_nexthop_from_groups() ->
  remove_nh_grp_entry() -> remove_nexthop(). remove_nh_grp_entry() gets
  a __must_check as well.

This dramatically improves performance from O(N x R) to O(N + R).

Releasing a nexthop reference in remove_nexthop() now no longer frees
it. Instead, it is deleted when the last fib_info pointing to it gets
freed via free_fib_info_rcu(). All routing code is already careful not
to take into consideration routes marked with RTNH_F_DEAD.

Tested with:
DEV=eth2
ip link set up dev $DEV
ip link add testnh0 link $DEV type macvlan mode bridge
ip addr add 198.51.100.1/24 dev testnh0
ip link set testnh0 up

seq 1 65536 | \
sed 's/.*/nexthop add id & via 198.51.100.2 dev testnh0/' | \
ip -batch -

i=1
for a in $(seq 0 255); do
  for b in $(seq 0 255); do
    echo "route add 10.${a}.${b}.0/32 nhid $i"
    i=$((i + 1))
  done
done | ip -batch -

time ip link set testnh0 down
ip link del testnh0

Without this patch:
real	0m32.601s
user	0m0.000s
sys	0m32.511s

With this patch:
real	0m0.209s
user	0m0.000s
sys	0m0.153s

Signed-off-by: Cosmin Ratiu <cratiu@nvidia.com>
---
 net/ipv4/nexthop.c | 85 +++++++++++++++++++++++++++++-----------------
 1 file changed, 54 insertions(+), 31 deletions(-)

diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index f92fcc39fc4c..1822df80fca5 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -20,8 +20,8 @@
 #define NH_RES_DEFAULT_IDLE_TIMER	(120 * HZ)
 #define NH_RES_DEFAULT_UNBALANCED_TIMER	0	/* No forced rebalancing. */
 
-static void remove_nexthop(struct net *net, struct nexthop *nh,
-			   struct nl_info *nlinfo);
+static bool __must_check remove_nexthop(struct net *net, struct nexthop *nh,
+					struct nl_info *nlinfo);
 
 #define NH_DEV_HASHBITS  8
 #define NH_DEV_HASHSIZE (1U << NH_DEV_HASHBITS)
@@ -2016,9 +2016,9 @@ static void nh_hthr_group_rebalance(struct nh_group *nhg)
 	}
 }
 
-static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
-				struct nl_info *nlinfo,
-				struct list_head *deferred_free)
+static bool __must_check
+remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
+		    struct nl_info *nlinfo, struct list_head *deferred_free)
 {
 	struct nh_grp_entry *nhges, *new_nhges;
 	struct nexthop *nhp = nhge->nh_parent;
@@ -2033,10 +2033,8 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
 	newg = nhg->spare;
 
 	/* last entry, keep it visible and remove the parent */
-	if (nhg->num_nh == 1) {
-		remove_nexthop(net, nhp, nlinfo);
-		return;
-	}
+	if (nhg->num_nh == 1)
+		return remove_nexthop(net, nhp, nlinfo);
 
 	newg->has_v4 = false;
 	newg->is_multipath = nhg->is_multipath;
@@ -2093,22 +2091,26 @@ static void remove_nh_grp_entry(struct net *net, struct nh_grp_entry *nhge,
 
 	if (nlinfo)
 		nexthop_notify(RTM_NEWNEXTHOP, nhp, nlinfo);
+
+	return false;
 }
 
-static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+static bool remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
 				       struct nl_info *nlinfo)
 {
 	struct nh_grp_entry *nhge, *tmp;
+	bool need_flush = false;
 	LIST_HEAD(deferred_free);
 
 	/* If there is nothing to do, let's avoid the costly call to
 	 * synchronize_net()
 	 */
 	if (list_empty(&nh->grp_list))
-		return;
+		return false;
 
 	list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list)
-		remove_nh_grp_entry(net, nhge, nlinfo, &deferred_free);
+		need_flush |= remove_nh_grp_entry(net, nhge, nlinfo,
+						   &deferred_free);
 
 	/* make sure all see the newly published array before releasing rtnl */
 	synchronize_net();
@@ -2118,6 +2120,8 @@ static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
 		list_del(&nhge->nh_list);
 		free_percpu(nhge->stats);
 	}
+
+	return need_flush;
 }
 
 static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
@@ -2142,18 +2146,15 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
 }
 
 /* not called for nexthop replace */
-static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
+static bool __must_check __remove_nexthop_fib(struct net *net,
+					      struct nexthop *nh)
 {
+	bool need_flush = !list_empty(&nh->fi_list);
 	struct fib6_info *f6i;
-	bool do_flush = false;
 	struct fib_info *fi;
 
-	list_for_each_entry(fi, &nh->fi_list, nh_list) {
+	list_for_each_entry(fi, &nh->fi_list, nh_list)
 		fi->fib_flags |= RTNH_F_DEAD;
-		do_flush = true;
-	}
-	if (do_flush)
-		fib_flush(net);
 
 	spin_lock_bh(&nh->lock);
 
@@ -2173,12 +2174,14 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
 	}
 
 	spin_unlock_bh(&nh->lock);
+
+	return need_flush;
 }
 
-static void __remove_nexthop(struct net *net, struct nexthop *nh,
-			     struct nl_info *nlinfo)
+static bool __must_check __remove_nexthop(struct net *net, struct nexthop *nh,
+					  struct nl_info *nlinfo)
 {
-	__remove_nexthop_fib(net, nh);
+	bool need_flush = __remove_nexthop_fib(net, nh);
 
 	if (nh->is_group) {
 		remove_nexthop_group(nh, nlinfo);
@@ -2189,13 +2192,17 @@ static void __remove_nexthop(struct net *net, struct nexthop *nh,
 		if (nhi->fib_nhc.nhc_dev)
 			hlist_del(&nhi->dev_hash);
 
-		remove_nexthop_from_groups(net, nh, nlinfo);
+		need_flush |= remove_nexthop_from_groups(net, nh, nlinfo);
 	}
+
+	return need_flush;
 }
 
-static void remove_nexthop(struct net *net, struct nexthop *nh,
-			   struct nl_info *nlinfo)
+static bool __must_check remove_nexthop(struct net *net, struct nexthop *nh,
+					struct nl_info *nlinfo)
 {
+	bool need_flush;
+
 	call_nexthop_notifiers(net, NEXTHOP_EVENT_DEL, nh, NULL);
 
 	/* remove from the tree */
@@ -2204,10 +2211,19 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
 	if (nlinfo)
 		nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
 
-	__remove_nexthop(net, nh, nlinfo);
+	need_flush = __remove_nexthop(net, nh, nlinfo);
 	nh_base_seq_inc(net);
 
 	nexthop_put(nh);
+
+	return need_flush;
+}
+
+static void remove_one_nexthop(struct net *net, struct nexthop *nh,
+			       struct nl_info *nlinfo)
+{
+	if (remove_nexthop(net, nh, nlinfo))
+		fib_flush(net);
 }
 
 /* if any FIB entries reference this nexthop, any dst entries
@@ -2592,7 +2608,7 @@ static int replace_nexthop(struct net *net, struct nexthop *old,
 	if (!err) {
 		nh_rt_cache_flush(net, old, new);
 
-		__remove_nexthop(net, new, NULL);
+		WARN_ON(__remove_nexthop(net, new, NULL));
 		nexthop_put(new);
 	}
 
@@ -2701,6 +2717,7 @@ static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
 	struct hlist_head *head = &net->nexthop.devhash[hash];
 	struct hlist_node *n;
 	struct nh_info *nhi;
+	bool need_flush = false;
 
 	hlist_for_each_entry_safe(nhi, n, head, dev_hash) {
 		if (nhi->fib_nhc.nhc_dev != dev)
@@ -2710,22 +2727,28 @@ static void nexthop_flush_dev(struct net_device *dev, unsigned long event)
 		    (event == NETDEV_DOWN || event == NETDEV_CHANGE))
 			continue;
 
-		remove_nexthop(net, nhi->nh_parent, NULL);
+		need_flush |= remove_nexthop(net, nhi->nh_parent, NULL);
 	}
+
+	if (need_flush)
+		fib_flush(net);
 }
 
 /* rtnl; called when net namespace is deleted */
 static void flush_all_nexthops(struct net *net)
 {
 	struct rb_root *root = &net->nexthop.rb_root;
+	bool need_flush = false;
 	struct rb_node *node;
 	struct nexthop *nh;
 
 	while ((node = rb_first(root))) {
 		nh = rb_entry(node, struct nexthop, rb_node);
-		remove_nexthop(net, nh, NULL);
+		need_flush |= remove_nexthop(net, nh, NULL);
 		cond_resched();
 	}
+	if (need_flush)
+		fib_flush(net);
 }
 
 static struct nexthop *nexthop_create_group(struct net *net,
@@ -2994,7 +3017,7 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
 
 	err = insert_nexthop(net, nh, cfg, extack);
 	if (err) {
-		__remove_nexthop(net, nh, NULL);
+		WARN_ON(__remove_nexthop(net, nh, NULL));
 		nexthop_put(nh);
 		nh = ERR_PTR(err);
 	}
@@ -3363,7 +3386,7 @@ static int rtm_del_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
 
 	nh = nexthop_find_by_id(net, id);
 	if (nh)
-		remove_nexthop(net, nh, &nlinfo);
+		remove_one_nexthop(net, nh, &nlinfo);
 	else
 		err = -ENOENT;
 
-- 
2.53.0


                 reply	other threads:[~2026-05-04 13:37 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260504133626.4155933-1-cratiu@nvidia.com \
    --to=cratiu@nvidia.com \
    --cc=davem@davemloft.net \
    --cc=dsahern@kernel.org \
    --cc=edumazet@google.com \
    --cc=horms@kernel.org \
    --cc=kuba@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=pabeni@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox