* [PATCH v1 net-next 01/13] ipv6: Validate RTA_GATEWAY of RTA_MULTIPATH in rtm_to_fib6_config().
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 02/13] ipv6: Get rid of RTNL for SIOCDELRT and RTM_DELROUTE Kuniyuki Iwashima
` (12 subsequent siblings)
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
We will perform RTM_NEWROUTE and RTM_DELROUTE under RCU, and then
we want to perform some validation out of the RCU scope.
When creating / removing an IPv6 route with RTA_MULTIPATH,
inet6_rtm_newroute() / inet6_rtm_delroute() validates RTA_GATEWAY
in each multipath entry.
Let's do that in rtm_to_fib6_config().
Note that now RTM_DELROUTE returns an error for RTA_MULTIPATH with
0 entries, which was accepted but should result in -EINVAL as
RTM_ADDROUTE.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/route.c | 82 +++++++++++++++++++++++++-----------------------
1 file changed, 43 insertions(+), 39 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c3406a0d45bd..6c9c99fb02fe 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5016,6 +5016,44 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_FLOWLABEL] = { .type = NLA_BE32 },
};
+static int rtm_to_fib6_multipath_config(struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct rtnexthop *rtnh;
+ int remaining;
+
+ remaining = cfg->fc_mp_len;
+ rtnh = (struct rtnexthop *)cfg->fc_mp;
+
+ if (!rtnh_ok(rtnh, remaining)) {
+ NL_SET_ERR_MSG(extack, "Invalid nexthop configuration - no valid nexthops");
+ return -EINVAL;
+ }
+
+ do {
+ int attrlen = rtnh_attrlen(rtnh);
+
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs;
+
+ attrs = rtnh_attrs(rtnh);
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ if (nla) {
+ if (nla_len(nla) < sizeof(cfg->fc_gateway)) {
+ NL_SET_ERR_MSG(extack,
+ "Invalid IPv6 address in RTA_GATEWAY");
+ return -EINVAL;
+ }
+ }
+ }
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ } while (rtnh_ok(rtnh, remaining));
+
+ return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len,
+ extack, true);
+}
+
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
struct fib6_config *cfg,
struct netlink_ext_ack *extack)
@@ -5130,9 +5168,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
- err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
- cfg->fc_mp_len,
- extack, true);
+ err = rtm_to_fib6_multipath_config(cfg, extack);
if (err < 0)
goto errout;
}
@@ -5252,19 +5288,6 @@ static bool ip6_route_mpath_should_notify(const struct fib6_info *rt)
return should_notify;
}
-static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
- struct netlink_ext_ack *extack)
-{
- if (nla_len(nla) < sizeof(*gw)) {
- NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
- return -EINVAL;
- }
-
- *gw = nla_get_in6_addr(nla);
-
- return 0;
-}
-
static int ip6_route_multipath_add(struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
@@ -5305,18 +5328,11 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla) {
- err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
- extack);
- if (err)
- goto cleanup;
-
+ r_cfg.fc_gateway = nla_get_in6_addr(nla);
r_cfg.fc_flags |= RTF_GATEWAY;
}
- r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
- /* RTA_ENCAP_TYPE length checked in
- * lwtunnel_valid_encap_type_attr
- */
+ r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
if (nla)
r_cfg.fc_encap_type = nla_get_u16(nla);
@@ -5349,12 +5365,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
rtnh = rtnh_next(rtnh, &remaining);
}
- if (list_empty(&rt6_nh_list)) {
- NL_SET_ERR_MSG(extack,
- "Invalid nexthop configuration - no valid nexthops");
- return -EINVAL;
- }
-
/* for add and replace send one notification with all nexthops.
* Skip the notification in fib6_add_rt2node and send one with
* the full route when done
@@ -5476,21 +5486,15 @@ static int ip6_route_multipath_del(struct fib6_config *cfg,
nla = nla_find(attrs, attrlen, RTA_GATEWAY);
if (nla) {
- err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
- extack);
- if (err) {
- last_err = err;
- goto next_rtnh;
- }
-
+ r_cfg.fc_gateway = nla_get_in6_addr(nla);
r_cfg.fc_flags |= RTF_GATEWAY;
}
}
+
err = ip6_route_del(&r_cfg, extack);
if (err)
last_err = err;
-next_rtnh:
rtnh = rtnh_next(rtnh, &remaining);
}
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 02/13] ipv6: Get rid of RTNL for SIOCDELRT and RTM_DELROUTE.
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 01/13] ipv6: Validate RTA_GATEWAY of RTA_MULTIPATH in rtm_to_fib6_config() Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 03/13] ipv6: Move some validation from ip6_route_info_create() to rtm_to_fib6_config() Kuniyuki Iwashima
` (11 subsequent siblings)
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
Basically, removing an IPv6 route does not require RTNL because
the IPv6 routing tables are protected by per table lock. Also,
ip6_route_del() already relies on RCU and the table lock.
inet6_rtm_delroute() calls nexthop_find_by_id() to check if the
nexthop specified by RTA_NH_ID exists. nexthop uses rbtree and
the top-down walk can be safely performed under RCU.
Let's call nexthop_find_by_id() under RCU and get rid of RTNL
from inet6_rtm_delroute() and SIOCDELRT.
Even if the nexthop is removed after rcu_read_unlock() in
inet6_rtm_delroute(), __remove_nexthop_fib() cleans up the routes
tied to the nexthop, and ip6_route_del() returns -ESRCH. So the
request was at least valid as of nexthop_find_by_id(), and it's just
a matter of timing.
Note that we need to pass false to lwtunnel_valid_encap_type_attr().
The following patches also use the newroute bool.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/route.c | 36 ++++++++++++++++++++++--------------
1 file changed, 22 insertions(+), 14 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6c9c99fb02fe..b737b242079e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4482,19 +4482,20 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
rtmsg_to_fib6_config(net, rtmsg, &cfg);
- rtnl_lock();
switch (cmd) {
case SIOCADDRT:
+ rtnl_lock();
/* Only do the default setting of fc_metric in route adding */
if (cfg.fc_metric == 0)
cfg.fc_metric = IP6_RT_PRIO_USER;
err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
+ rtnl_unlock();
break;
case SIOCDELRT:
err = ip6_route_del(&cfg, NULL);
break;
}
- rtnl_unlock();
+
return err;
}
@@ -5017,7 +5018,8 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
};
static int rtm_to_fib6_multipath_config(struct fib6_config *cfg,
- struct netlink_ext_ack *extack)
+ struct netlink_ext_ack *extack,
+ bool newroute)
{
struct rtnexthop *rtnh;
int remaining;
@@ -5051,15 +5053,16 @@ static int rtm_to_fib6_multipath_config(struct fib6_config *cfg,
} while (rtnh_ok(rtnh, remaining));
return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len,
- extack, true);
+ extack, newroute);
}
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
struct fib6_config *cfg,
struct netlink_ext_ack *extack)
{
- struct rtmsg *rtm;
+ bool newroute = nlh->nlmsg_type == RTM_NEWROUTE;
struct nlattr *tb[RTA_MAX+1];
+ struct rtmsg *rtm;
unsigned int pref;
int err;
@@ -5168,7 +5171,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
- err = rtm_to_fib6_multipath_config(cfg, extack);
+ err = rtm_to_fib6_multipath_config(cfg, extack, newroute);
if (err < 0)
goto errout;
}
@@ -5188,7 +5191,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
- extack, true);
+ extack, newroute);
if (err < 0)
goto errout;
}
@@ -5511,15 +5514,20 @@ static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
if (err < 0)
return err;
- if (cfg.fc_nh_id &&
- !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id)) {
- NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
- return -EINVAL;
+ if (cfg.fc_nh_id) {
+ rcu_read_lock();
+ err = !nexthop_find_by_id(sock_net(skb->sk), cfg.fc_nh_id);
+ rcu_read_unlock();
+
+ if (err) {
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ return -EINVAL;
+ }
}
- if (cfg.fc_mp)
+ if (cfg.fc_mp) {
return ip6_route_multipath_del(&cfg, extack);
- else {
+ } else {
cfg.fc_delete_all_nh = 1;
return ip6_route_del(&cfg, extack);
}
@@ -6731,7 +6739,7 @@ static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_o
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE,
.doit = inet6_rtm_newroute},
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE,
- .doit = inet6_rtm_delroute},
+ .doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
.doit = inet6_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
};
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 03/13] ipv6: Move some validation from ip6_route_info_create() to rtm_to_fib6_config().
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 01/13] ipv6: Validate RTA_GATEWAY of RTA_MULTIPATH in rtm_to_fib6_config() Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 02/13] ipv6: Get rid of RTNL for SIOCDELRT and RTM_DELROUTE Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 04/13] ipv6: Check GATEWAY in rtm_to_fib6_multipath_config() Kuniyuki Iwashima
` (10 subsequent siblings)
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
ip6_route_info_create() is called from 3 functions:
* ip6_route_add()
* ip6_route_multipath_add()
* addrconf_f6i_alloc()
addrconf_f6i_alloc() does not need validation for struct fib6_config in
ip6_route_info_create().
ip6_route_multipath_add() calls ip6_route_info_create() for multiple
routes with slightly different fib6_config instances, which is copied
from the base config passed from userspace. So, we need not validate
the same config repeatedly.
Let's move such validation into rtm_to_fib6_config().
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/route.c | 79 +++++++++++++++++++++++++-----------------------
1 file changed, 42 insertions(+), 37 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b737b242079e..baad02c099ff 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3705,38 +3705,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
int err = -EINVAL;
int addr_type;
- /* RTF_PCPU is an internal flag; can not be set by userspace */
- if (cfg->fc_flags & RTF_PCPU) {
- NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
- goto out;
- }
-
- /* RTF_CACHE is an internal flag; can not be set by userspace */
- if (cfg->fc_flags & RTF_CACHE) {
- NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
- goto out;
- }
-
- if (cfg->fc_type > RTN_MAX) {
- NL_SET_ERR_MSG(extack, "Invalid route type");
- goto out;
- }
-
- if (cfg->fc_dst_len > 128) {
- NL_SET_ERR_MSG(extack, "Invalid prefix length");
- goto out;
- }
- if (cfg->fc_src_len > 128) {
- NL_SET_ERR_MSG(extack, "Invalid source address length");
- goto out;
- }
-#ifndef CONFIG_IPV6_SUBTREES
- if (cfg->fc_src_len) {
- NL_SET_ERR_MSG(extack,
- "Specifying source address requires IPV6_SUBTREES to be enabled");
- goto out;
- }
-#endif
if (cfg->fc_nh_id) {
nh = nexthop_find_by_id(net, cfg->fc_nh_id);
if (!nh) {
@@ -3801,11 +3769,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
rt->fib6_src.plen = cfg->fc_src_len;
#endif
if (nh) {
- if (rt->fib6_src.plen) {
- NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
- err = -EINVAL;
- goto out_free;
- }
if (!nexthop_get(nh)) {
NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
err = -ENOENT;
@@ -5205,6 +5168,48 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
}
}
+ if (newroute) {
+ /* RTF_PCPU is an internal flag; can not be set by userspace */
+ if (cfg->fc_flags & RTF_PCPU) {
+ NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
+ goto errout;
+ }
+
+ /* RTF_CACHE is an internal flag; can not be set by userspace */
+ if (cfg->fc_flags & RTF_CACHE) {
+ NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
+ goto errout;
+ }
+
+ if (cfg->fc_type > RTN_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid route type");
+ goto errout;
+ }
+
+ if (cfg->fc_dst_len > 128) {
+ NL_SET_ERR_MSG(extack, "Invalid prefix length");
+ goto errout;
+ }
+
+#ifdef CONFIG_IPV6_SUBTREES
+ if (cfg->fc_src_len > 128) {
+ NL_SET_ERR_MSG(extack, "Invalid source address length");
+ goto errout;
+ }
+
+ if (cfg->fc_nh_id && cfg->fc_src_len) {
+ NL_SET_ERR_MSG(extack, "Nexthops can not be used with source routing");
+ goto errout;
+ }
+#else
+ if (cfg->fc_src_len) {
+ NL_SET_ERR_MSG(extack,
+ "Specifying source address requires IPV6_SUBTREES to be enabled");
+ goto errout;
+ }
+#endif
+ }
+
err = 0;
errout:
return err;
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 04/13] ipv6: Check GATEWAY in rtm_to_fib6_multipath_config().
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
` (2 preceding siblings ...)
2025-03-21 4:00 ` [PATCH v1 net-next 03/13] ipv6: Move some validation from ip6_route_info_create() to rtm_to_fib6_config() Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 05/13] ipv6: Move nexthop_find_by_id() after fib6_info_alloc() Kuniyuki Iwashima
` (9 subsequent siblings)
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
In ip6_route_multipath_add(), we call rt6_qualify_for_ecmp() for each
entry. If it returns false, the request fails.
rt6_qualify_for_ecmp() returns false if either of the conditions below
is true:
1. f6i->fib6_flags has RTF_ADDRCONF
2. f6i->nh is not NULL
3. f6i->fib6_nh->fib_nh_gw_family is AF_UNSPEC
1 is unnecessary because rtm_to_fib6_config() never sets RTF_ADDRCONF
to cfg->fc_flags.
2. is equivalent with cfg->fc_nh_id.
3. can be replaced by checking RTF_GATEWAY in the base and each multipath
entry because AF_INET6 is set to f6i->fib6_nh->fib_nh_gw_family only when
cfg.fc_is_fdb is true or RTF_GATEWAY is set, but the former is always
false.
Let's perform the equivalent checks in rtm_to_fib6_multipath_config().
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/route.c | 16 +++++++++-------
1 file changed, 9 insertions(+), 7 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index baad02c099ff..b51793ee7a18 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4996,6 +4996,7 @@ static int rtm_to_fib6_multipath_config(struct fib6_config *cfg,
}
do {
+ bool has_gateway = cfg->fc_flags & RTF_GATEWAY;
int attrlen = rtnh_attrlen(rtnh);
if (attrlen > 0) {
@@ -5009,9 +5010,17 @@ static int rtm_to_fib6_multipath_config(struct fib6_config *cfg,
"Invalid IPv6 address in RTA_GATEWAY");
return -EINVAL;
}
+
+ has_gateway = true;
}
}
+ if (newroute && (cfg->fc_nh_id || !has_gateway)) {
+ NL_SET_ERR_MSG(extack,
+ "Device only routes can not be added for IPv6 using the multipath API.");
+ return -EINVAL;
+ }
+
rtnh = rtnh_next(rtnh, &remaining);
} while (rtnh_ok(rtnh, remaining));
@@ -5353,13 +5362,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
rt = NULL;
goto cleanup;
}
- if (!rt6_qualify_for_ecmp(rt)) {
- err = -EINVAL;
- NL_SET_ERR_MSG(extack,
- "Device only routes can not be added for IPv6 using the multipath API.");
- fib6_info_release(rt);
- goto cleanup;
- }
rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 05/13] ipv6: Move nexthop_find_by_id() after fib6_info_alloc().
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
` (3 preceding siblings ...)
2025-03-21 4:00 ` [PATCH v1 net-next 04/13] ipv6: Check GATEWAY in rtm_to_fib6_multipath_config() Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 06/13] ipv6: Split ip6_route_info_create() Kuniyuki Iwashima
` (8 subsequent siblings)
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT.
Then, we must perform two lookups for nexthop and dev under RCU
to guarantee their lifetime.
ip6_route_info_create() calls nexthop_find_by_id() first if
RTA_NH_ID is specified, and then allocates struct fib6_info.
nexthop_find_by_id() must be called under RCU, but we do not want
to use GFP_ATOMIC for memory allocation here, which will be likely
to fail in ip6_route_multipath_add().
Let's move nexthop_find_by_id() after the memory allocation.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/route.c | 34 ++++++++++++++++++----------------
1 file changed, 18 insertions(+), 16 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index b51793ee7a18..28d38282a19e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3699,24 +3699,11 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
{
struct net *net = cfg->fc_nlinfo.nl_net;
struct fib6_info *rt = NULL;
- struct nexthop *nh = NULL;
struct fib6_table *table;
struct fib6_nh *fib6_nh;
- int err = -EINVAL;
+ int err = -ENOBUFS;
int addr_type;
- if (cfg->fc_nh_id) {
- nh = nexthop_find_by_id(net, cfg->fc_nh_id);
- if (!nh) {
- NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
- goto out;
- }
- err = fib6_check_nexthop(nh, cfg, extack);
- if (err)
- goto out;
- }
-
- err = -ENOBUFS;
if (cfg->fc_nlinfo.nlh &&
!(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
table = fib6_get_table(net, cfg->fc_table);
@@ -3732,7 +3719,7 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
goto out;
err = -ENOMEM;
- rt = fib6_info_alloc(gfp_flags, !nh);
+ rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id);
if (!rt)
goto out;
@@ -3768,12 +3755,27 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
rt->fib6_src.plen = cfg->fc_src_len;
#endif
- if (nh) {
+
+ if (cfg->fc_nh_id) {
+ struct nexthop *nh;
+
+ nh = nexthop_find_by_id(net, cfg->fc_nh_id);
+ if (!nh) {
+ err = -EINVAL;
+ NL_SET_ERR_MSG(extack, "Nexthop id does not exist");
+ goto out_free;
+ }
+
+ err = fib6_check_nexthop(nh, cfg, extack);
+ if (err)
+ goto out_free;
+
if (!nexthop_get(nh)) {
NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
err = -ENOENT;
goto out_free;
}
+
rt->nh = nh;
fib6_nh = nexthop_fib6_nh(rt->nh);
} else {
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 06/13] ipv6: Split ip6_route_info_create().
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
` (4 preceding siblings ...)
2025-03-21 4:00 ` [PATCH v1 net-next 05/13] ipv6: Move nexthop_find_by_id() after fib6_info_alloc() Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 07/13] ipv6: Preallocate rt->fib6_nh->rt6i_pcpu in ip6_route_info_create() Kuniyuki Iwashima
` (7 subsequent siblings)
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT.
Then, we want to allocate everything as possible before entering
the RCU section.
The RCU section will start in the middle of ip6_route_info_create(),
and this is problematic for ip6_route_multipath_add() that calls
ip6_route_info_create() multiple times.
Let's split ip6_route_info_create() into two parts; one for memory
allocation and another for nexthop setup.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/route.c | 95 +++++++++++++++++++++++++++++++-----------------
1 file changed, 62 insertions(+), 33 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 28d38282a19e..6299cfd9f12a 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3694,15 +3694,13 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
}
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
- gfp_t gfp_flags,
- struct netlink_ext_ack *extack)
+ gfp_t gfp_flags,
+ struct netlink_ext_ack *extack)
{
struct net *net = cfg->fc_nlinfo.nl_net;
- struct fib6_info *rt = NULL;
struct fib6_table *table;
- struct fib6_nh *fib6_nh;
- int err = -ENOBUFS;
- int addr_type;
+ struct fib6_info *rt;
+ int err;
if (cfg->fc_nlinfo.nlh &&
!(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
@@ -3714,22 +3712,22 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
} else {
table = fib6_new_table(net, cfg->fc_table);
}
+ if (!table) {
+ err = -ENOBUFS;
+ goto err;
+ }
- if (!table)
- goto out;
-
- err = -ENOMEM;
rt = fib6_info_alloc(gfp_flags, !cfg->fc_nh_id);
- if (!rt)
- goto out;
+ if (!rt) {
+ err = -ENOMEM;
+ goto err;
+ }
rt->fib6_metrics = ip_fib_metrics_init(cfg->fc_mx, cfg->fc_mx_len,
extack);
if (IS_ERR(rt->fib6_metrics)) {
err = PTR_ERR(rt->fib6_metrics);
- /* Do not leave garbage there. */
- rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
- goto out_free;
+ goto free;
}
if (cfg->fc_flags & RTF_ADDRCONF)
@@ -3737,12 +3735,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
if (cfg->fc_flags & RTF_EXPIRES)
fib6_set_expires(rt, jiffies +
- clock_t_to_jiffies(cfg->fc_expires));
+ clock_t_to_jiffies(cfg->fc_expires));
if (cfg->fc_protocol == RTPROT_UNSPEC)
cfg->fc_protocol = RTPROT_BOOT;
- rt->fib6_protocol = cfg->fc_protocol;
+ rt->fib6_protocol = cfg->fc_protocol;
rt->fib6_table = table;
rt->fib6_metric = cfg->fc_metric;
rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
@@ -3755,6 +3753,20 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
rt->fib6_src.plen = cfg->fc_src_len;
#endif
+ return rt;
+free:
+ kfree(rt);
+err:
+ return ERR_PTR(err);
+}
+
+static int ip6_route_info_create_nh(struct fib6_info *rt,
+ struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct net *net = cfg->fc_nlinfo.nl_net;
+ struct fib6_nh *fib6_nh;
+ int err;
if (cfg->fc_nh_id) {
struct nexthop *nh;
@@ -3779,9 +3791,11 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
rt->nh = nh;
fib6_nh = nexthop_fib6_nh(rt->nh);
} else {
- err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack);
+ int addr_type;
+
+ err = fib6_nh_init(net, rt->fib6_nh, cfg, GFP_ATOMIC, extack);
if (err)
- goto out;
+ goto out_release;
fib6_nh = rt->fib6_nh;
@@ -3800,21 +3814,20 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
NL_SET_ERR_MSG(extack, "Invalid source address");
err = -EINVAL;
- goto out;
+ goto out_release;
}
rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
rt->fib6_prefsrc.plen = 128;
- } else
- rt->fib6_prefsrc.plen = 0;
+ }
- return rt;
-out:
+ return 0;
+out_release:
fib6_info_release(rt);
- return ERR_PTR(err);
+ return err;
out_free:
ip_fib_metrics_put(rt->fib6_metrics);
kfree(rt);
- return ERR_PTR(err);
+ return err;
}
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
@@ -3827,6 +3840,10 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
if (IS_ERR(rt))
return PTR_ERR(rt);
+ err = ip6_route_info_create_nh(rt, cfg, extack);
+ if (err)
+ return err;
+
err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
fib6_info_release(rt);
@@ -4550,6 +4567,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
.fc_ignore_dev_down = true,
};
struct fib6_info *f6i;
+ int err;
if (anycast) {
cfg.fc_type = RTN_ANYCAST;
@@ -4560,14 +4578,19 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net,
}
f6i = ip6_route_info_create(&cfg, gfp_flags, extack);
- if (!IS_ERR(f6i)) {
- f6i->dst_nocount = true;
+ if (IS_ERR(f6i))
+ return f6i;
- if (!anycast &&
- (READ_ONCE(net->ipv6.devconf_all->disable_policy) ||
- READ_ONCE(idev->cnf.disable_policy)))
- f6i->dst_nopolicy = true;
- }
+ err = ip6_route_info_create_nh(f6i, &cfg, extack);
+ if (err)
+ return ERR_PTR(err);
+
+ f6i->dst_nocount = true;
+
+ if (!anycast &&
+ (READ_ONCE(net->ipv6.devconf_all->disable_policy) ||
+ READ_ONCE(idev->cnf.disable_policy)))
+ f6i->dst_nopolicy = true;
return f6i;
}
@@ -5365,6 +5388,12 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
goto cleanup;
}
+ err = ip6_route_info_create_nh(rt, &r_cfg, extack);
+ if (err) {
+ rt = NULL;
+ goto cleanup;
+ }
+
rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 07/13] ipv6: Preallocate rt->fib6_nh->rt6i_pcpu in ip6_route_info_create().
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
` (5 preceding siblings ...)
2025-03-21 4:00 ` [PATCH v1 net-next 06/13] ipv6: Split ip6_route_info_create() Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 08/13] ipv6: Preallocate nhc_pcpu_rth_output " Kuniyuki Iwashima
` (6 subsequent siblings)
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
ip6_route_info_create_nh() will be called under RCU.
Then, fib6_nh_init() is also under RCU, but per-cpu memory allocation
is very likely to fail with GFP_ATOMIC while bluk-adding IPv6 routes
and we will see a bunch of this message in dmesg.
percpu: allocation failed, size=8 align=8 atomic=1, atomic alloc failed, no space left
percpu: allocation failed, size=8 align=8 atomic=1, atomic alloc failed, no space left
Let's preallocate rt->fib6_nh->rt6i_pcpu in ip6_route_info_create().
If something fails before the original memory allocation in
fib6_nh_init(), ip6_route_info_create_nh() calls fib6_info_release(),
which releases the preallocated per-cpu memory.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/route.c | 25 ++++++++++++++++++++++---
1 file changed, 22 insertions(+), 3 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 6299cfd9f12a..d50377131506 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3630,10 +3630,12 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh,
goto out;
pcpu_alloc:
- fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
if (!fib6_nh->rt6i_pcpu) {
- err = -ENOMEM;
- goto out;
+ fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+ if (!fib6_nh->rt6i_pcpu) {
+ err = -ENOMEM;
+ goto out;
+ }
}
fib6_nh->fib_nh_dev = dev;
@@ -3693,6 +3695,15 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
}
}
+static int fib6_nh_prealloc_percpu(struct fib6_nh *fib6_nh, gfp_t gfp_flags)
+{
+ fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
+ if (!fib6_nh->rt6i_pcpu)
+ return -ENOMEM;
+
+ return 0;
+}
+
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
gfp_t gfp_flags,
struct netlink_ext_ack *extack)
@@ -3730,6 +3741,12 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
goto free;
}
+ if (!cfg->fc_nh_id) {
+ err = fib6_nh_prealloc_percpu(&rt->fib6_nh[0], gfp_flags);
+ if (err)
+ goto free_metrics;
+ }
+
if (cfg->fc_flags & RTF_ADDRCONF)
rt->dst_nocount = true;
@@ -3754,6 +3771,8 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
rt->fib6_src.plen = cfg->fc_src_len;
#endif
return rt;
+free_metrics:
+ ip_fib_metrics_put(rt->fib6_metrics);
free:
kfree(rt);
err:
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 08/13] ipv6: Preallocate nhc_pcpu_rth_output in ip6_route_info_create().
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
` (6 preceding siblings ...)
2025-03-21 4:00 ` [PATCH v1 net-next 07/13] ipv6: Preallocate rt->fib6_nh->rt6i_pcpu in ip6_route_info_create() Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 09/13] ipv6: Don't pass net to ip6_route_info_append() Kuniyuki Iwashima
` (5 subsequent siblings)
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
ip6_route_info_create_nh() will be called under RCU.
It calls fib_nh_common_init() and allocates nhc->nhc_pcpu_rth_output.
As with the reason for rt->fib6_nh->rt6i_pcpu, we want to avoid
GFP_ATOMIC allocation for nhc->nhc_pcpu_rth_output under RCU.
Let's preallocate it in ip6_route_info_create().
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv4/fib_semantics.c | 10 ++++++----
net/ipv6/route.c | 9 +++++++++
2 files changed, 15 insertions(+), 4 deletions(-)
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index f68bb9e34c34..5326f1501af0 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -617,10 +617,12 @@ int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc,
{
int err;
- nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
- gfp_flags);
- if (!nhc->nhc_pcpu_rth_output)
- return -ENOMEM;
+ if (!nhc->nhc_pcpu_rth_output) {
+ nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
+ gfp_flags);
+ if (!nhc->nhc_pcpu_rth_output)
+ return -ENOMEM;
+ }
if (encap) {
struct lwtunnel_state *lwtstate;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d50377131506..e65e2c8b7125 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3697,10 +3697,19 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh)
static int fib6_nh_prealloc_percpu(struct fib6_nh *fib6_nh, gfp_t gfp_flags)
{
+ struct fib_nh_common *nhc = &fib6_nh->nh_common;
+
fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags);
if (!fib6_nh->rt6i_pcpu)
return -ENOMEM;
+ nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *,
+ gfp_flags);
+ if (!nhc->nhc_pcpu_rth_output) {
+ free_percpu(fib6_nh->rt6i_pcpu);
+ return -ENOMEM;
+ }
+
return 0;
}
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 09/13] ipv6: Don't pass net to ip6_route_info_append().
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
` (7 preceding siblings ...)
2025-03-21 4:00 ` [PATCH v1 net-next 08/13] ipv6: Preallocate nhc_pcpu_rth_output " Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 10/13] ipv6: Factorise ip6_route_multipath_add() Kuniyuki Iwashima
` (4 subsequent siblings)
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
net is not used in ip6_route_info_append() after commit 36f19d5b4f99
("net/ipv6: Remove extra call to ip6_convert_metrics for multipath case").
Let's remove the argument.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/route.c | 6 ++----
1 file changed, 2 insertions(+), 4 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index e65e2c8b7125..26e5a372a9cd 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5283,8 +5283,7 @@ struct rt6_nh {
struct list_head next;
};
-static int ip6_route_info_append(struct net *net,
- struct list_head *rt6_nh_list,
+static int ip6_route_info_append(struct list_head *rt6_nh_list,
struct fib6_info *rt,
struct fib6_config *r_cfg)
{
@@ -5424,8 +5423,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
- err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
- rt, &r_cfg);
+ err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
if (err) {
fib6_info_release(rt);
goto cleanup;
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 10/13] ipv6: Factorise ip6_route_multipath_add().
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
` (8 preceding siblings ...)
2025-03-21 4:00 ` [PATCH v1 net-next 09/13] ipv6: Don't pass net to ip6_route_info_append() Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 11/13] ipv6: Protect fib6_link_table() with spinlock Kuniyuki Iwashima
` (3 subsequent siblings)
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT.
Then, the RCU section will start before ip6_route_info_create_nh()
in ip6_route_multipath_add(), but ip6_route_info_create() is called
in the same loop and will sleep.
Let's split the loop into ip6_route_mpath_info_create() and
ip6_route_mpath_info_create_nh().
Note that ip6_route_info_append() is now integrated into
ip6_route_mpath_info_create_nh() because we need to call different
free functions for nexthops that passed ip6_route_info_create_nh().
In case of failure, the remaining nexthops that ip6_route_info_create_nh()
has not been called for will be freed by ip6_route_mpath_info_cleanup().
OTOH, if a nexthop passes ip6_route_info_create_nh(), it will be linked
to a local temporary list, which will be spliced back to rt6_nh_list.
In case of failure, these nexthops will be released by fib6_info_release()
in ip6_route_multipath_add().
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv6/route.c | 205 ++++++++++++++++++++++++++++++-----------------
1 file changed, 130 insertions(+), 75 deletions(-)
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 26e5a372a9cd..a209d8c8ff75 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -5281,29 +5281,131 @@ struct rt6_nh {
struct fib6_info *fib6_info;
struct fib6_config r_cfg;
struct list_head next;
+ int weight;
};
-static int ip6_route_info_append(struct list_head *rt6_nh_list,
- struct fib6_info *rt,
- struct fib6_config *r_cfg)
+static void ip6_route_mpath_info_cleanup(struct list_head *rt6_nh_list)
{
- struct rt6_nh *nh;
- int err = -EEXIST;
+ struct rt6_nh *nh, *nh_next;
- list_for_each_entry(nh, rt6_nh_list, next) {
- /* check if fib6_info already exists */
- if (rt6_duplicate_nexthop(nh->fib6_info, rt))
- return err;
+ list_for_each_entry_safe(nh, nh_next, rt6_nh_list, next) {
+ struct fib6_info *rt = nh->fib6_info;
+
+ if (rt) {
+ free_percpu(rt->fib6_nh->nh_common.nhc_pcpu_rth_output);
+ free_percpu(rt->fib6_nh->rt6i_pcpu);
+ ip_fib_metrics_put(rt->fib6_metrics);
+ kfree(rt);
+ }
+
+ list_del(&nh->next);
+ kfree(nh);
}
+}
- nh = kzalloc(sizeof(*nh), GFP_KERNEL);
- if (!nh)
- return -ENOMEM;
- nh->fib6_info = rt;
- memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
- list_add_tail(&nh->next, rt6_nh_list);
+static int ip6_route_mpath_info_create(struct list_head *rt6_nh_list,
+ struct fib6_config *cfg,
+ struct netlink_ext_ack *extack)
+{
+ struct rtnexthop *rtnh;
+ int remaining;
+ int err;
+
+ remaining = cfg->fc_mp_len;
+ rtnh = (struct rtnexthop *)cfg->fc_mp;
+
+ /* Parse a Multipath Entry and build a list (rt6_nh_list) of
+ * fib6_info structs per nexthop
+ */
+ while (rtnh_ok(rtnh, remaining)) {
+ struct fib6_config r_cfg;
+ struct fib6_info *rt;
+ struct rt6_nh *nh;
+ int attrlen;
+
+ nh = kzalloc(sizeof(*nh), GFP_KERNEL);
+ if (!nh) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ list_add_tail(&nh->next, rt6_nh_list);
+
+ memcpy(&r_cfg, cfg, sizeof(*cfg));
+ if (rtnh->rtnh_ifindex)
+ r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
+
+ attrlen = rtnh_attrlen(rtnh);
+ if (attrlen > 0) {
+ struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
+
+ nla = nla_find(attrs, attrlen, RTA_GATEWAY);
+ if (nla) {
+ r_cfg.fc_gateway = nla_get_in6_addr(nla);
+ r_cfg.fc_flags |= RTF_GATEWAY;
+ }
+
+ r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
+ nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
+ if (nla)
+ r_cfg.fc_encap_type = nla_get_u16(nla);
+ }
+
+ r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
+
+ rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
+ if (IS_ERR(rt)) {
+ err = PTR_ERR(rt);
+ goto err;
+ }
+
+ nh->fib6_info = rt;
+ nh->weight = rtnh->rtnh_hops + 1;
+ memcpy(&nh->r_cfg, &r_cfg, sizeof(r_cfg));
+
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
return 0;
+err:
+ ip6_route_mpath_info_cleanup(rt6_nh_list);
+ return err;
+}
+
+static int ip6_route_mpath_info_create_nh(struct list_head *rt6_nh_list,
+ struct netlink_ext_ack *extack)
+{
+ struct rt6_nh *nh, *nh_next, *nh_tmp;
+ LIST_HEAD(tmp);
+ int err;
+
+ list_for_each_entry_safe(nh, nh_next, rt6_nh_list, next) {
+ struct fib6_info *rt = nh->fib6_info;
+
+ err = ip6_route_info_create_nh(rt, &nh->r_cfg, extack);
+ if (err) {
+ nh->fib6_info = NULL;
+ goto err;
+ }
+
+ rt->fib6_nh->fib_nh_weight = nh->weight;
+
+ list_move_tail(&nh->next, &tmp);
+
+ list_for_each_entry(nh_tmp, rt6_nh_list, next) {
+ /* check if fib6_info already exists */
+ if (rt6_duplicate_nexthop(nh_tmp->fib6_info, rt)) {
+ err = -EEXIST;
+ goto err;
+ }
+ }
+ }
+out:
+ list_splice(&tmp, rt6_nh_list);
+ return err;
+err:
+ ip6_route_mpath_info_cleanup(rt6_nh_list);
+ goto out;
}
static void ip6_route_mpath_notify(struct fib6_info *rt,
@@ -5362,75 +5464,28 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
{
struct fib6_info *rt_notif = NULL, *rt_last = NULL;
struct nl_info *info = &cfg->fc_nlinfo;
- struct fib6_config r_cfg;
- struct rtnexthop *rtnh;
- struct fib6_info *rt;
- struct rt6_nh *err_nh;
struct rt6_nh *nh, *nh_safe;
+ LIST_HEAD(rt6_nh_list);
+ struct rt6_nh *err_nh;
__u16 nlflags;
- int remaining;
- int attrlen;
- int err = 1;
int nhn = 0;
- int replace = (cfg->fc_nlinfo.nlh &&
- (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
- LIST_HEAD(rt6_nh_list);
+ int replace;
+ int err;
+
+ replace = (cfg->fc_nlinfo.nlh &&
+ (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
nlflags |= NLM_F_APPEND;
- remaining = cfg->fc_mp_len;
- rtnh = (struct rtnexthop *)cfg->fc_mp;
-
- /* Parse a Multipath Entry and build a list (rt6_nh_list) of
- * fib6_info structs per nexthop
- */
- while (rtnh_ok(rtnh, remaining)) {
- memcpy(&r_cfg, cfg, sizeof(*cfg));
- if (rtnh->rtnh_ifindex)
- r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
-
- attrlen = rtnh_attrlen(rtnh);
- if (attrlen > 0) {
- struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
-
- nla = nla_find(attrs, attrlen, RTA_GATEWAY);
- if (nla) {
- r_cfg.fc_gateway = nla_get_in6_addr(nla);
- r_cfg.fc_flags |= RTF_GATEWAY;
- }
-
- r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
- nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
- if (nla)
- r_cfg.fc_encap_type = nla_get_u16(nla);
- }
-
- r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
- rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
- if (IS_ERR(rt)) {
- err = PTR_ERR(rt);
- rt = NULL;
- goto cleanup;
- }
-
- err = ip6_route_info_create_nh(rt, &r_cfg, extack);
- if (err) {
- rt = NULL;
- goto cleanup;
- }
-
- rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1;
-
- err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
- if (err) {
- fib6_info_release(rt);
- goto cleanup;
- }
+ err = ip6_route_mpath_info_create(&rt6_nh_list, cfg, extack);
+ if (err)
+ return err;
- rtnh = rtnh_next(rtnh, &remaining);
- }
+ err = ip6_route_mpath_info_create_nh(&rt6_nh_list, extack);
+ if (err)
+ goto cleanup;
/* for add and replace send one notification with all nexthops.
* Skip the notification in fib6_add_rt2node and send one with
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 11/13] ipv6: Protect fib6_link_table() with spinlock.
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
` (9 preceding siblings ...)
2025-03-21 4:00 ` [PATCH v1 net-next 10/13] ipv6: Factorise ip6_route_multipath_add() Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 12/13] ipv6: Protect nh->f6i_list with spinlock and flag Kuniyuki Iwashima
` (2 subsequent siblings)
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT.
If the request specifies a new table ID, fib6_new_table() is
called to create a new routing table.
Two concurrent requests could specify the same table ID, so we
need a lock to protect net->ipv6.fib_table_hash[h].
Let's add a spinlock to protect the hash bucket linkage.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/netns/ipv6.h | 1 +
net/ipv6/ip6_fib.c | 26 +++++++++++++++++++++-----
2 files changed, 22 insertions(+), 5 deletions(-)
diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h
index 5f2cfd84570a..47dc70d8100a 100644
--- a/include/net/netns/ipv6.h
+++ b/include/net/netns/ipv6.h
@@ -72,6 +72,7 @@ struct netns_ipv6 {
struct rt6_statistics *rt6_stats;
struct timer_list ip6_fib_timer;
struct hlist_head *fib_table_hash;
+ spinlock_t fib_table_hash_lock;
struct fib6_table *fib6_main_tbl;
struct list_head fib6_walkers;
rwlock_t fib6_walker_lock;
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index c134ba202c4c..dab091f70f2b 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -249,19 +249,33 @@ static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
struct fib6_table *fib6_new_table(struct net *net, u32 id)
{
- struct fib6_table *tb;
+ struct fib6_table *tb, *new_tb;
if (id == 0)
id = RT6_TABLE_MAIN;
+
tb = fib6_get_table(net, id);
if (tb)
return tb;
- tb = fib6_alloc_table(net, id);
- if (tb)
- fib6_link_table(net, tb);
+ new_tb = fib6_alloc_table(net, id);
+ if (!new_tb)
+ return NULL;
+
+ spin_lock_bh(&net->ipv6.fib_table_hash_lock);
+
+ tb = fib6_get_table(net, id);
+ if (unlikely(tb)) {
+ spin_unlock_bh(&net->ipv6.fib_table_hash_lock);
+ kfree(new_tb);
+ return tb;
+ }
- return tb;
+ fib6_link_table(net, new_tb);
+
+ spin_unlock_bh(&net->ipv6.fib_table_hash_lock);
+
+ return new_tb;
}
EXPORT_SYMBOL_GPL(fib6_new_table);
@@ -2423,6 +2437,8 @@ static int __net_init fib6_net_init(struct net *net)
if (!net->ipv6.fib_table_hash)
goto out_rt6_stats;
+ spin_lock_init(&net->ipv6.fib_table_hash_lock);
+
net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
GFP_KERNEL);
if (!net->ipv6.fib6_main_tbl)
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 12/13] ipv6: Protect nh->f6i_list with spinlock and flag.
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
` (10 preceding siblings ...)
2025-03-21 4:00 ` [PATCH v1 net-next 11/13] ipv6: Protect fib6_link_table() with spinlock Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 4:00 ` [PATCH v1 net-next 13/13] ipv6: Get rid of RTNL for SIOCADDRT and RTM_NEWROUTE Kuniyuki Iwashima
2025-03-21 14:07 ` [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Stanislav Fomichev
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
We will get rid of RTNL from RTM_NEWROUTE and SIOCADDRT.
Then, we may be going to add a route tied to a dying nexthop.
The nexthop itself is not freed during the RCU graceful period,
but if we link a route after __remove_nexthop_fib() is called for
the nexthop, the route will be leaked.
To avoid the race between IPv6 route addition under RCU vs nexthop
deletion under RTNL, let's add a dead flag and protect it and
nh->f6i_list with a spinlock.
__remove_nexthop_fib() acquires the nexthop's spinlock and sets false
to nh->dead, then calls ip6_del_rt() for the linked route one by one
without the spinlock because fib6_purge_rt() acquires it later.
While adding an IPv6 route, fib6_add() acquires the nexthop lock and
checks the dead flag just before inserting the route.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
include/net/nexthop.h | 2 ++
net/ipv4/nexthop.c | 20 +++++++++++++++++---
net/ipv6/ip6_fib.c | 25 ++++++++++++++++++++-----
3 files changed, 39 insertions(+), 8 deletions(-)
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index d9fb44e8b321..572e69cda476 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -152,6 +152,8 @@ struct nexthop {
u8 protocol; /* app managing this nh */
u8 nh_flags;
bool is_group;
+ bool dead;
+ spinlock_t lock; /* protect dead and f6i_list */
refcount_t refcnt;
struct rcu_head rcu;
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 01df7dd795f0..94eab81bfe54 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -541,6 +541,7 @@ static struct nexthop *nexthop_alloc(void)
INIT_LIST_HEAD(&nh->f6i_list);
INIT_LIST_HEAD(&nh->grp_list);
INIT_LIST_HEAD(&nh->fdb_list);
+ spin_lock_init(&nh->lock);
}
return nh;
}
@@ -2105,7 +2106,7 @@ static void remove_nexthop_group(struct nexthop *nh, struct nl_info *nlinfo)
/* not called for nexthop replace */
static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
{
- struct fib6_info *f6i, *tmp;
+ struct fib6_info *f6i;
bool do_flush = false;
struct fib_info *fi;
@@ -2116,13 +2117,26 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
if (do_flush)
fib_flush(net);
- /* ip6_del_rt removes the entry from this list hence the _safe */
- list_for_each_entry_safe(f6i, tmp, &nh->f6i_list, nh_list) {
+ spin_lock_bh(&nh->lock);
+
+ nh->dead = true;
+
+ while (1) {
+ f6i = list_first_entry_or_null(&nh->f6i_list, typeof(*f6i), nh_list);
+ if (!f6i)
+ break;
+
+ spin_unlock_bh(&nh->lock);
+
/* __ip6_del_rt does a release, so do a hold here */
fib6_info_hold(f6i);
ipv6_stub->ip6_del_rt(net, f6i,
!READ_ONCE(net->ipv4.sysctl_nexthop_compat_mode));
+
+ spin_lock_bh(&nh->lock);
}
+
+ spin_unlock_bh(&nh->lock);
}
static void __remove_nexthop(struct net *net, struct nexthop *nh,
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index dab091f70f2b..a1aab33b2558 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -1048,8 +1048,12 @@ static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
rt6_flush_exceptions(rt);
fib6_drop_pcpu_from(rt, table);
- if (rt->nh && !list_empty(&rt->nh_list))
- list_del_init(&rt->nh_list);
+ if (rt->nh) {
+ spin_lock(&rt->nh->lock);
+ if (!list_empty(&rt->nh_list))
+ list_del_init(&rt->nh_list);
+ spin_unlock(&rt->nh->lock);
+ }
if (refcount_read(&rt->fib6_ref) != 1) {
/* This route is used as dummy address holder in some split
@@ -1499,10 +1503,21 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
}
#endif
- err = fib6_add_rt2node(fn, rt, info, extack);
+ if (rt->nh) {
+ spin_lock(&rt->nh->lock);
+ if (rt->nh->dead) {
+ NL_SET_ERR_MSG(extack, "Nexthop has been deleted");
+ err = -EINVAL;
+ } else {
+ err = fib6_add_rt2node(fn, rt, info, extack);
+ if (!err)
+ list_add(&rt->nh_list, &rt->nh->f6i_list);
+ }
+ spin_unlock(&rt->nh->lock);
+ } else {
+ err = fib6_add_rt2node(fn, rt, info, extack);
+ }
if (!err) {
- if (rt->nh)
- list_add(&rt->nh_list, &rt->nh->f6i_list);
__fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));
if (rt->fib6_flags & RTF_EXPIRES)
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* [PATCH v1 net-next 13/13] ipv6: Get rid of RTNL for SIOCADDRT and RTM_NEWROUTE.
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
` (11 preceding siblings ...)
2025-03-21 4:00 ` [PATCH v1 net-next 12/13] ipv6: Protect nh->f6i_list with spinlock and flag Kuniyuki Iwashima
@ 2025-03-21 4:00 ` Kuniyuki Iwashima
2025-03-21 14:07 ` [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Stanislav Fomichev
13 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 4:00 UTC (permalink / raw)
To: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni
Cc: Simon Horman, Kuniyuki Iwashima, Kuniyuki Iwashima, netdev
Now we are ready to remove RTNL from SIOCADDRT and RTM_NEWROUTE.
The remaining things to do are
1. pass false to lwtunnel_valid_encap_type_attr()
2. use rcu_dereference_rtnl() in fib6_check_nexthop()
3. place rcu_read_lock() before ip6_route_info_create_nh().
Let's complete RTNL-free conversion.
When each CPU-X adds 100000 routes on table-X in a batch on
c7a.metal-48xl EC2 instance with 192 CPUs,
without this series:
$ sudo ./route_test.sh
...
added 19200000 routes (100000 routes * 192 tables).
Time elapsed: 189154 milliseconds.
with this series:
$ sudo ./route_test.sh
...
added 19200000 routes (100000 routes * 192 tables).
Time elapsed: 62531 milliseconds.
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
---
net/ipv4/nexthop.c | 4 ++--
net/ipv6/route.c | 18 ++++++++++++------
2 files changed, 14 insertions(+), 8 deletions(-)
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 94eab81bfe54..dec1a107aa0b 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -1543,12 +1543,12 @@ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg,
if (nh->is_group) {
struct nh_group *nhg;
- nhg = rtnl_dereference(nh->nh_grp);
+ nhg = rcu_dereference_rtnl(nh->nh_grp);
if (nhg->has_v4)
goto no_v4_nh;
is_fdb_nh = nhg->fdb_nh;
} else {
- nhi = rtnl_dereference(nh->nh_info);
+ nhi = rcu_dereference_rtnl(nh->nh_info);
if (nhi->family == AF_INET)
goto no_v4_nh;
is_fdb_nh = nhi->fdb_nh;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index a209d8c8ff75..d1d60415d1aa 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3868,12 +3868,16 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
if (IS_ERR(rt))
return PTR_ERR(rt);
+ rcu_read_lock();
+
err = ip6_route_info_create_nh(rt, cfg, extack);
if (err)
- return err;
+ goto unlock;
err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
fib6_info_release(rt);
+unlock:
+ rcu_read_unlock();
return err;
}
@@ -4494,12 +4498,10 @@ int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg)
switch (cmd) {
case SIOCADDRT:
- rtnl_lock();
/* Only do the default setting of fc_metric in route adding */
if (cfg.fc_metric == 0)
cfg.fc_metric = IP6_RT_PRIO_USER;
err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
- rtnl_unlock();
break;
case SIOCDELRT:
err = ip6_route_del(&cfg, NULL);
@@ -5078,7 +5080,7 @@ static int rtm_to_fib6_multipath_config(struct fib6_config *cfg,
} while (rtnh_ok(rtnh, remaining));
return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len,
- extack, newroute);
+ extack, false);
}
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -5216,7 +5218,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
err = lwtunnel_valid_encap_type(cfg->fc_encap_type,
- extack, newroute);
+ extack, false);
if (err < 0)
goto errout;
}
@@ -5483,6 +5485,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
if (err)
return err;
+ rcu_read_lock();
+
err = ip6_route_mpath_info_create_nh(&rt6_nh_list, extack);
if (err)
goto cleanup;
@@ -5574,6 +5578,8 @@ static int ip6_route_multipath_add(struct fib6_config *cfg,
}
cleanup:
+ rcu_read_unlock();
+
list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
fib6_info_release(nh->fib6_info);
list_del(&nh->next);
@@ -6856,7 +6862,7 @@ static void bpf_iter_unregister(void)
static const struct rtnl_msg_handler ip6_route_rtnl_msg_handlers[] __initconst_or_module = {
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_NEWROUTE,
- .doit = inet6_rtm_newroute},
+ .doit = inet6_rtm_newroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_DELROUTE,
.doit = inet6_rtm_delroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
{.owner = THIS_MODULE, .protocol = PF_INET6, .msgtype = RTM_GETROUTE,
--
2.48.1
^ permalink raw reply related [flat|nested] 16+ messages in thread* Re: [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table.
2025-03-21 4:00 [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Kuniyuki Iwashima
` (12 preceding siblings ...)
2025-03-21 4:00 ` [PATCH v1 net-next 13/13] ipv6: Get rid of RTNL for SIOCADDRT and RTM_NEWROUTE Kuniyuki Iwashima
@ 2025-03-21 14:07 ` Stanislav Fomichev
2025-03-21 16:50 ` Kuniyuki Iwashima
13 siblings, 1 reply; 16+ messages in thread
From: Stanislav Fomichev @ 2025-03-21 14:07 UTC (permalink / raw)
To: Kuniyuki Iwashima
Cc: David S. Miller, David Ahern, Eric Dumazet, Jakub Kicinski,
Paolo Abeni, Simon Horman, Kuniyuki Iwashima, netdev
On 03/20, Kuniyuki Iwashima wrote:
> IPv6 routing tables are protected by each table's lock and work in
> the interrupt context, which means we basically don't need RTNL to
> modify an IPv6 routing table itself.
>
> Currently, the control paths require RTNL because we may need to
> perform device and nexthop lookups; we must prevent dev/nexthop from
> going away from the netns.
>
> This, however, can be achieved by RCU as well.
>
> If we are in the RCU critical section while adding an IPv6 route,
> synchronize_net() in netif_change_net_namespace() and
> unregister_netdevice_many_notify() guarantee that the dev will not be
> moved to another netns or removed.
>
> Also, nexthop is guaranteed not to be freed during the RCU grace period.
>
> If we care about a race between nexthop removal and IPv6 route addition,
> we can get rid of RTNL from the control paths.
>
> Patch 1 moves a validation for RTA_MULTIPATH earlier.
> Patch 2 removes RTNL for SIOCDELRT and RTM_DELROUTE.
> Patch 3 ~ 10 move validation and memory allocation earlier.
> Patch 11 prevents a race between two requests for the same table.
> Patch 12 prevents the race mentioned above.
> Patch 13 removes RTNL for SIOCADDRT and RTM_NEWROUTE.
>
>
> Test:
>
> The script [0] lets each CPU-X create 100000 routes on table-X in a
> batch.
>
> On c7a.metal-48xl EC2 instance with 192 CPUs,
>
> With this series:
>
> $ sudo ./route_test.sh
> start adding routes
> added 19200000 routes (100000 routes * 192 tables).
> Time elapsed: 189154 milliseconds.
>
> Without series:
>
> $ sudo ./route_test.sh
> start adding routes
> added 19200000 routes (100000 routes * 192 tables).
> Time elapsed: 62531 milliseconds.
>
> I changed the number of routes (1000 ~ 100000 per CPU/table) and
> constantly saw it complete 3x faster with this series.
>
>
> [0]
> #!/bin/bash
>
> mkdir tmp
>
> NS="test"
> ip netns add $NS
> ip -n $NS link add veth0 type veth peer veth1
> ip -n $NS link set veth0 up
> ip -n $NS link set veth1 up
>
> TABLES=()
> for i in $(seq $(nproc)); do
> TABLES+=("$i")
> done
>
> ROUTES=()
> for i in {1..100}; do
> for j in {1..1000}; do
> ROUTES+=("2001:$i:$j::/64")
> done
> done
>
> for TABLE in "${TABLES[@]}"; do
> FILE="./tmp/batch-table-$TABLE.txt"
> > $FILE
> for ROUTE in "${ROUTES[@]}"; do
> echo "route add $ROUTE dev veth0 table $TABLE" >> $FILE
> done
> done
>
> echo "start adding routes"
>
> START_TIME=$(date +%s%3N)
> for TABLE in "${TABLES[@]}"; do
> ip -n $NS -6 -batch "./tmp/batch-table-$TABLE.txt" &
> done
>
> wait
> END_TIME=$(date +%s%3N)
> ELAPSED_TIME=$((END_TIME - START_TIME))
>
> echo "added $((${#ROUTES[@]} * ${#TABLES[@]})) routes (${#ROUTES[@]} routes * ${#TABLES[@]} tables)."
> echo "Time elapsed: ${ELAPSED_TIME} milliseconds."
> echo $(ip -n $NS -6 route show table all | wc -l) # Just for debug
>
> ip netns del $NS
> rm -fr ./tmp/
Lockdep is not supper happy about some patch:
https://netdev-3.bots.linux.dev/vmksft-forwarding-dbg/results/42463/38-gre-multipath-nh-res-sh/stderr
---
pw-bot: cr
^ permalink raw reply [flat|nested] 16+ messages in thread* Re: [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table.
2025-03-21 14:07 ` [PATCH v1 net-next 00/13] ipv6: No RTNL for IPv6 routing table Stanislav Fomichev
@ 2025-03-21 16:50 ` Kuniyuki Iwashima
0 siblings, 0 replies; 16+ messages in thread
From: Kuniyuki Iwashima @ 2025-03-21 16:50 UTC (permalink / raw)
To: stfomichev
Cc: davem, dsahern, edumazet, horms, kuba, kuni1840, kuniyu, netdev,
pabeni
From: Stanislav Fomichev <stfomichev@gmail.com>
Date: Fri, 21 Mar 2025 07:07:15 -0700
> On 03/20, Kuniyuki Iwashima wrote:
> > IPv6 routing tables are protected by each table's lock and work in
> > the interrupt context, which means we basically don't need RTNL to
> > modify an IPv6 routing table itself.
> >
> > Currently, the control paths require RTNL because we may need to
> > perform device and nexthop lookups; we must prevent dev/nexthop from
> > going away from the netns.
> >
> > This, however, can be achieved by RCU as well.
> >
> > If we are in the RCU critical section while adding an IPv6 route,
> > synchronize_net() in netif_change_net_namespace() and
> > unregister_netdevice_many_notify() guarantee that the dev will not be
> > moved to another netns or removed.
> >
> > Also, nexthop is guaranteed not to be freed during the RCU grace period.
> >
> > If we care about a race between nexthop removal and IPv6 route addition,
> > we can get rid of RTNL from the control paths.
> >
> > Patch 1 moves a validation for RTA_MULTIPATH earlier.
> > Patch 2 removes RTNL for SIOCDELRT and RTM_DELROUTE.
> > Patch 3 ~ 10 move validation and memory allocation earlier.
> > Patch 11 prevents a race between two requests for the same table.
> > Patch 12 prevents the race mentioned above.
> > Patch 13 removes RTNL for SIOCADDRT and RTM_NEWROUTE.
> >
> >
> > Test:
> >
> > The script [0] lets each CPU-X create 100000 routes on table-X in a
> > batch.
> >
> > On c7a.metal-48xl EC2 instance with 192 CPUs,
> >
> > With this series:
> >
> > $ sudo ./route_test.sh
> > start adding routes
> > added 19200000 routes (100000 routes * 192 tables).
> > Time elapsed: 189154 milliseconds.
> >
> > Without series:
> >
> > $ sudo ./route_test.sh
> > start adding routes
> > added 19200000 routes (100000 routes * 192 tables).
> > Time elapsed: 62531 milliseconds.
> >
> > I changed the number of routes (1000 ~ 100000 per CPU/table) and
> > constantly saw it complete 3x faster with this series.
> >
> >
> > [0]
> > #!/bin/bash
> >
> > mkdir tmp
> >
> > NS="test"
> > ip netns add $NS
> > ip -n $NS link add veth0 type veth peer veth1
> > ip -n $NS link set veth0 up
> > ip -n $NS link set veth1 up
> >
> > TABLES=()
> > for i in $(seq $(nproc)); do
> > TABLES+=("$i")
> > done
> >
> > ROUTES=()
> > for i in {1..100}; do
> > for j in {1..1000}; do
> > ROUTES+=("2001:$i:$j::/64")
> > done
> > done
> >
> > for TABLE in "${TABLES[@]}"; do
> > FILE="./tmp/batch-table-$TABLE.txt"
> > > $FILE
> > for ROUTE in "${ROUTES[@]}"; do
> > echo "route add $ROUTE dev veth0 table $TABLE" >> $FILE
> > done
> > done
> >
> > echo "start adding routes"
> >
> > START_TIME=$(date +%s%3N)
> > for TABLE in "${TABLES[@]}"; do
> > ip -n $NS -6 -batch "./tmp/batch-table-$TABLE.txt" &
> > done
> >
> > wait
> > END_TIME=$(date +%s%3N)
> > ELAPSED_TIME=$((END_TIME - START_TIME))
> >
> > echo "added $((${#ROUTES[@]} * ${#TABLES[@]})) routes (${#ROUTES[@]} routes * ${#TABLES[@]} tables)."
> > echo "Time elapsed: ${ELAPSED_TIME} milliseconds."
> > echo $(ip -n $NS -6 route show table all | wc -l) # Just for debug
> >
> > ip netns del $NS
> > rm -fr ./tmp/
>
> Lockdep is not supper happy about some patch:
> https://netdev-3.bots.linux.dev/vmksft-forwarding-dbg/results/42463/38-gre-multipath-nh-res-sh/stderr
Looks like I need to extend the RCU critical section in
ip6_route_del() in patch 2:
---8<---
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index d1d60415d1aa..b6434532858f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4104,9 +4104,9 @@ static int ip6_route_del(struct fib6_config *cfg,
if (rt->nh) {
if (!fib6_info_hold_safe(rt))
continue;
- rcu_read_unlock();
- return __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ err = __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ break;
}
if (cfg->fc_nh_id)
continue;
@@ -4121,13 +4121,13 @@ static int ip6_route_del(struct fib6_config *cfg,
continue;
if (!fib6_info_hold_safe(rt))
continue;
- rcu_read_unlock();
/* if gateway was specified only delete the one hop */
if (cfg->fc_flags & RTF_GATEWAY)
- return __ip6_del_rt(rt, &cfg->fc_nlinfo);
-
- return __ip6_del_rt_siblings(rt, cfg);
+ err = __ip6_del_rt(rt, &cfg->fc_nlinfo);
+ else
+ err = __ip6_del_rt_siblings(rt, cfg);
+ break;
}
}
rcu_read_unlock();
---8<---
Thanks!
^ permalink raw reply related [flat|nested] 16+ messages in thread