From: Eric Dumazet <edumazet@google.com>
To: "David S . Miller" <davem@davemloft.net>,
Jakub Kicinski <kuba@kernel.org>,
Paolo Abeni <pabeni@redhat.com>
Cc: Simon Horman <horms@kernel.org>,
Kuniyuki Iwashima <kuniyu@google.com>,
David Ahern <dsahern@kernel.org>,
netdev@vger.kernel.org, eric.dumazet@gmail.com,
Eric Dumazet <edumazet@google.com>
Subject: [PATCH v3 net-next 3/3] rtnetlink: do not acquire RTNL for RTM_GETLINK with RTEXT_FILTER_NAME_ONLY
Date: Wed, 20 May 2026 10:32:27 +0000 [thread overview]
Message-ID: <20260520103227.1133277-4-edumazet@google.com> (raw)
In-Reply-To: <20260520103227.1133277-1-edumazet@google.com>
When RTEXT_FILTER_NAME_ONLY is requested, rtnl_fill_ifinfo()
only dumps device attributes that do not need RTNL protection.
Many shell scripts invoke iproute2 commands specifying a device by
its name. After this patch, they will no longer add RTNL pressure.
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
net/core/rtnetlink.c | 82 ++++++++++++++++++++++++++++++--------------
1 file changed, 56 insertions(+), 26 deletions(-)
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index ae0254f19178735b2805a8189e81a960a49b2858..587bb8bbc73d0b2075ca508a5537200f65f74594 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -2068,7 +2068,6 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
struct nlmsghdr *nlh;
struct Qdisc *qdisc;
- ASSERT_RTNL();
nlh = nlmsg_put(skb, pid, seq, type, sizeof(*ifm), flags);
if (nlh == NULL)
return -EMSGSIZE;
@@ -2091,6 +2090,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
if (ext_filter_mask & RTEXT_FILTER_NAME_ONLY)
goto end;
+ ASSERT_RTNL();
if (tgt_netnsid >= 0 &&
nla_put_s32(skb, IFLA_TARGET_NETNSID, tgt_netnsid))
goto nla_put_failure;
@@ -3468,6 +3468,21 @@ static struct net_device *rtnl_dev_get(struct net *net,
return __dev_get_by_name(net, ifname);
}
+static struct net_device *rtnl_dev_get_rcu(struct net *net,
+ struct nlattr *tb[]) /* counterpart of rtnl_dev_get() that resolves the name with dev_get_by_name_rcu() */
+{
+ char ifname[ALTIFNAMSIZ]; /* one buffer shared by both name attributes */
+
+ if (tb[IFLA_IFNAME])
+ nla_strscpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); /* IFLA_IFNAME takes precedence; copy bounded by IFNAMSIZ */
+ else if (tb[IFLA_ALT_IFNAME])
+ nla_strscpy(ifname, tb[IFLA_ALT_IFNAME], ALTIFNAMSIZ); /* otherwise use the alternative name */
+ else
+ return NULL; /* neither name attribute was supplied */
+
+ return dev_get_by_name_rcu(net, ifname); /* rtnl_getlink() calls this between rcu_read_lock()/rcu_read_unlock() */
+}
+
static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
@@ -4187,14 +4202,15 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct netlink_ext_ack *extack)
{
struct net *net = sock_net(skb->sk);
+ struct nlattr *tb[IFLA_MAX + 1];
+ netdevice_tracker dev_tracker;
+ struct net_device *dev = NULL;
struct net *tgt_net = net;
+ u32 ext_filter_mask = 0;
struct ifinfomsg *ifm;
- struct nlattr *tb[IFLA_MAX+1];
- struct net_device *dev = NULL;
struct sk_buff *nskb;
int netnsid = -1;
int err;
- u32 ext_filter_mask = 0;
err = rtnl_valid_getlink_req(skb, nlh, tb, extack);
if (err < 0)
@@ -4214,43 +4230,56 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[IFLA_EXT_MASK])
ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
- err = -EINVAL;
ifm = nlmsg_data(nlh);
- if (ifm->ifi_index > 0)
- dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
- else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
- dev = rtnl_dev_get(tgt_net, tb);
- else
+ rcu_read_lock();
+ if (ifm->ifi_index > 0) {
+ dev = dev_get_by_index_rcu(tgt_net, ifm->ifi_index);
+ } else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME]) {
+ dev = rtnl_dev_get_rcu(tgt_net, tb);
+ } else {
+ rcu_read_unlock();
+ err = -EINVAL;
goto out;
+ }
+ netdev_hold(dev, &dev_tracker, GFP_ATOMIC);
+ rcu_read_unlock();
err = -ENODEV;
if (dev == NULL)
goto out;
+ if (!(ext_filter_mask & RTEXT_FILTER_NAME_ONLY)) {
+ rtnl_lock();
+ /* Synchronize the carrier state so we don't report a state
+ * that we're not actually going to honour immediately; if
+ * the driver just did a carrier off->on transition, we can
+ * only TX if link watch work has run, but without this we'd
+ * already report carrier on, even if it doesn't work yet.
+ */
+ linkwatch_sync_dev(dev);
+ }
+
err = -ENOBUFS;
nskb = nlmsg_new_large(if_nlmsg_size(dev, ext_filter_mask));
- if (nskb == NULL)
- goto out;
+ if (nskb)
+ err = rtnl_fill_ifinfo(nskb, dev, net,
+ RTM_NEWLINK, NETLINK_CB(skb).portid,
+ nlh->nlmsg_seq, 0, 0, ext_filter_mask,
+ 0, NULL, 0, netnsid, GFP_KERNEL);
- /* Synchronize the carrier state so we don't report a state
- * that we're not actually going to honour immediately; if
- * the driver just did a carrier off->on transition, we can
- * only TX if link watch work has run, but without this we'd
- * already report carrier on, even if it doesn't work yet.
- */
- linkwatch_sync_dev(dev);
+ if (!(ext_filter_mask & RTEXT_FILTER_NAME_ONLY))
+ rtnl_unlock();
- err = rtnl_fill_ifinfo(nskb, dev, net,
- RTM_NEWLINK, NETLINK_CB(skb).portid,
- nlh->nlmsg_seq, 0, 0, ext_filter_mask,
- 0, NULL, 0, netnsid, GFP_KERNEL);
if (err < 0) {
/* -EMSGSIZE implies BUG in if_nlmsg_size */
- WARN_ON(err == -EMSGSIZE);
+ WARN_ON_ONCE(err == -EMSGSIZE &&
+ !(ext_filter_mask & RTEXT_FILTER_NAME_ONLY));
kfree_skb(nskb);
- } else
+ } else {
err = rtnl_unicast(nskb, net, NETLINK_CB(skb).portid);
+ }
out:
+ netdev_put(dev, &dev_tracker);
if (netnsid >= 0)
put_net(tgt_net);
@@ -7116,7 +7145,8 @@ static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst =
{.msgtype = RTM_DELLINK, .doit = rtnl_dellink,
.flags = RTNL_FLAG_DOIT_PERNET_WIP},
{.msgtype = RTM_GETLINK, .doit = rtnl_getlink,
- .dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
+ .dumpit = rtnl_dump_ifinfo,
+ .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE | RTNL_FLAG_DOIT_UNLOCKED},
{.msgtype = RTM_SETLINK, .doit = rtnl_setlink,
.flags = RTNL_FLAG_DOIT_PERNET_WIP},
{.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all},
--
2.54.0.631.ge1b05301d1-goog
prev parent reply other threads:[~2026-05-20 10:32 UTC|newest]
Thread overview: 4+ messages / expand[flat|nested] mbox.gz Atom feed top
2026-05-20 10:32 [PATCH v3 net-next 0/3] rtnetlink: RTNL avoidance in rtnl_getlink() Eric Dumazet
2026-05-20 10:32 ` [PATCH v3 net-next 1/3] rtnetlink: use nla_nest_end_safe() in rtnl_fill_prop_list() Eric Dumazet
2026-05-20 10:32 ` [PATCH v3 net-next 2/3] net: defer netdev_name_node_alt_flush() call to netdev_run_todo() Eric Dumazet
2026-05-20 10:32 ` Eric Dumazet [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20260520103227.1133277-4-edumazet@google.com \
--to=edumazet@google.com \
--cc=davem@davemloft.net \
--cc=dsahern@kernel.org \
--cc=eric.dumazet@gmail.com \
--cc=horms@kernel.org \
--cc=kuba@kernel.org \
--cc=kuniyu@google.com \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox