From: Kuniyuki Iwashima <kuniyu@amazon.com>
To: "David S. Miller" <davem@davemloft.net>,
Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>
Cc: Kuniyuki Iwashima <kuniyu@amazon.com>,
Kuniyuki Iwashima <kuni1840@gmail.com>, <netdev@vger.kernel.org>
Subject: [PATCH v2 net-next 2/4] rtnetlink: Add per-netns RTNL.
Date: Wed, 2 Oct 2024 08:12:38 -0700 [thread overview]
Message-ID: <20241002151240.49813-3-kuniyu@amazon.com> (raw)
In-Reply-To: <20241002151240.49813-1-kuniyu@amazon.com>
The goal is to break RTNL down into per-netns mutex.
This patch adds per-netns mutex and its helper functions, rtnl_net_lock()
and rtnl_net_unlock().
rtnl_net_lock() acquires the global RTNL and per-netns RTNL mutex, and
rtnl_net_unlock() releases them.
We will replace 800+ rtnl_lock() with rtnl_net_lock() and finally removes
rtnl_lock() in rtnl_net_lock().
When we need to nest per-netns RTNL mutex, we will use __rtnl_net_lock(),
and its locking order is defined by rtnl_net_lock_cmp_fn() as follows:
1. init_net is first
2. netns address ascending order
Note that the conversion will be done under CONFIG_DEBUG_NET_SMALL_RTNL
with LOCKDEP so that we can carefully add the extra mutex without slowing
down RTNL operations during conversion.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/linux/rtnetlink.h | 13 +++++++++
include/net/net_namespace.h | 4 +++
net/Kconfig.debug | 15 ++++++++++
net/core/net_namespace.c | 6 ++++
net/core/rtnetlink.c | 58 +++++++++++++++++++++++++++++++++++++
5 files changed, 96 insertions(+)
diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index cdfc897f1e3c..f743c4f678bf 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h
@@ -46,6 +46,19 @@ extern int rtnl_is_locked(void);
extern int rtnl_lock_killable(void);
extern bool refcount_dec_and_rtnl_lock(refcount_t *r);
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+void __rtnl_net_lock(struct net *net);
+void __rtnl_net_unlock(struct net *net);
+void rtnl_net_lock(struct net *net);
+void rtnl_net_unlock(struct net *net);
+int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b);
+#else
+#define __rtnl_net_lock(net)
+#define __rtnl_net_unlock(net)
+#define rtnl_net_lock(net) rtnl_lock()
+#define rtnl_net_unlock(net) rtnl_unlock()
+#endif
+
extern wait_queue_head_t netdev_unregistering_wq;
extern atomic_t dev_unreg_count;
extern struct rw_semaphore pernet_ops_rwsem;
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index e67b483cc8bb..873c0f9fdac6 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -188,6 +188,10 @@ struct net {
#if IS_ENABLED(CONFIG_SMC)
struct netns_smc smc;
#endif
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+ /* Move to a better place when the config guard is removed. */
+ struct mutex rtnl_mutex;
+#endif
} __randomize_layout;
#include <linux/seq_file_net.h>
diff --git a/net/Kconfig.debug b/net/Kconfig.debug
index 5e3fffe707dd..277fab8c4d77 100644
--- a/net/Kconfig.debug
+++ b/net/Kconfig.debug
@@ -24,3 +24,18 @@ config DEBUG_NET
help
Enable extra sanity checks in networking.
This is mostly used by fuzzers, but is safe to select.
+
+config DEBUG_NET_SMALL_RTNL
+ bool "Add extra per-netns mutex inside RTNL"
+ depends on DEBUG_KERNEL && NET && LOCK_DEBUGGING_SUPPORT
+ select PROVE_LOCKING
+ default n
+ help
+ rtnl_lock() is being replaced with rtnl_net_lock() that
+ acquires the global RTNL and a small per-netns RTNL mutex.
+
+ During the conversion, rtnl_net_lock() just adds an extra
+ mutex in every RTNL scope and slows down the operations.
+
+ Once the conversion completes, rtnl_lock() will be removed
+ and rtnetlink will gain per-netns scalability.
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index e39479f1c9a4..105e3cd26763 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -334,6 +334,12 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
idr_init(&net->netns_ids);
spin_lock_init(&net->nsid_lock);
mutex_init(&net->ipv4.ra_mutex);
+
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+ mutex_init(&net->rtnl_mutex);
+ lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
+#endif
+
preinit_net_sysctl(net);
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f0a520987085..edf530441b65 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -179,6 +179,64 @@ bool lockdep_rtnl_is_held(void)
EXPORT_SYMBOL(lockdep_rtnl_is_held);
#endif /* #ifdef CONFIG_PROVE_LOCKING */
+#ifdef CONFIG_DEBUG_NET_SMALL_RTNL
+void __rtnl_net_lock(struct net *net)
+{
+ ASSERT_RTNL();
+
+ mutex_lock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_lock);
+
+void __rtnl_net_unlock(struct net *net)
+{
+ ASSERT_RTNL();
+
+ mutex_unlock(&net->rtnl_mutex);
+}
+EXPORT_SYMBOL(__rtnl_net_unlock);
+
+void rtnl_net_lock(struct net *net)
+{
+ rtnl_lock();
+ __rtnl_net_lock(net);
+}
+EXPORT_SYMBOL(rtnl_net_lock);
+
+void rtnl_net_unlock(struct net *net)
+{
+ __rtnl_net_unlock(net);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(rtnl_net_unlock);
+
+static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
+{
+ if (net_eq(net_a, net_b))
+ return 0;
+
+ /* always init_net first */
+ if (net_eq(net_a, &init_net))
+ return -1;
+
+ if (net_eq(net_b, &init_net))
+ return 1;
+
+ /* otherwise lock in ascending order */
+ return net_a < net_b ? -1 : 1;
+}
+
+int rtnl_net_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b)
+{
+ const struct net *net_a, *net_b;
+
+ net_a = container_of(a, struct net, rtnl_mutex.dep_map);
+ net_b = container_of(b, struct net, rtnl_mutex.dep_map);
+
+ return rtnl_net_cmp_locks(net_a, net_b);
+}
+#endif
+
static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
static inline int rtm_msgindex(int msgtype)
--
2.30.2
next prev parent reply other threads:[~2024-10-02 15:13 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-10-02 15:12 [PATCH v2 net-next 0/4] rtnetlink: Per-netns RTNL Kuniyuki Iwashima
2024-10-02 15:12 ` [PATCH v2 net-next 1/4] Revert "rtnetlink: add guard for RTNL" Kuniyuki Iwashima
2024-10-02 15:14 ` Eric Dumazet
2024-10-02 15:12 ` Kuniyuki Iwashima [this message]
2024-10-04 11:08 ` [PATCH v2 net-next 2/4] rtnetlink: Add per-netns RTNL Eric Dumazet
2024-10-04 20:21 ` Jakub Kicinski
2024-10-04 20:45 ` Kuniyuki Iwashima
2024-10-04 20:51 ` Kuniyuki Iwashima
2024-10-04 20:59 ` Eric Dumazet
2024-10-04 21:22 ` Kuniyuki Iwashima
2024-10-02 15:12 ` [PATCH v2 net-next 3/4] rtnetlink: Add assertion helpers for " Kuniyuki Iwashima
2024-10-04 11:10 ` Eric Dumazet
2024-10-02 15:12 ` [PATCH v2 net-next 4/4] rtnetlink: Add ASSERT_RTNL_NET() placeholder for netdev notifier Kuniyuki Iwashima
2024-10-04 11:14 ` Eric Dumazet
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20241002151240.49813-3-kuniyu@amazon.com \
--to=kuniyu@amazon.com \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=kuba@kernel.org \
--cc=kuni1840@gmail.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).