From: dsahern@kernel.org
To: netdev@vger.kernel.org
Cc: roopa@cumulusnetworks.com, sharpd@cumulusnetworks.com,
idosch@mellanox.com, davem@davemloft.net,
David Ahern <dsahern@gmail.com>
Subject: [PATCH RFC net-next 17/18] net: Add support for nexthop groups
Date: Fri, 31 Aug 2018 17:49:52 -0700 [thread overview]
Message-ID: <20180901004954.7145-18-dsahern@kernel.org> (raw)
In-Reply-To: <20180901004954.7145-1-dsahern@kernel.org>
From: David Ahern <dsahern@gmail.com>
Allow the creation of nexthop groups which reference other nexthop
objects to create multipath routes.
TO-DO: Add mpath support to IPv6
Signed-off-by: David Ahern <dsahern@gmail.com>
---
include/net/nexthop.h | 77 +++++--
net/ipv4/fib_semantics.c | 5 +-
net/ipv4/nexthop.c | 511 ++++++++++++++++++++++++++++++++++++++++++-----
net/ipv4/route.c | 16 +-
4 files changed, 540 insertions(+), 69 deletions(-)
diff --git a/include/net/nexthop.h b/include/net/nexthop.h
index 759bb39e4ea7..654b67192337 100644
--- a/include/net/nexthop.h
+++ b/include/net/nexthop.h
@@ -28,6 +28,23 @@
struct nexthop;
+struct nh_grp_entry { /* one member slot of a nexthop group */
+ struct nexthop *nh; /* member nexthop; set to NULL when the entry is removed */
+ u32 weight; /* relative weight; a requested weight of 0 is stored as 1 */
+ atomic_t upper_bound; /* hash threshold written by nh_group_rebalance(); -1 for an empty slot */
+
+ struct list_head nh_list; /* link in the member nexthop's grp_list */
+ struct nexthop *nh_parent; /* nexthop of group with this entry */
+};
+
+/* Payload of a group nexthop: counters, a type flag and one slot per member.
+ * The structure is allocated with room for num_nh trailing entries.
+ */
+struct nh_group {
+	u16			num_nh_set;	/* slots whose nh is still set */
+	u16			num_nh;		/* slots allocated in nh_entries */
+	u8			mpath:1,	/* group is a multipath group */
+				unused:7;
+	struct nh_grp_entry	nh_entries[];	/* C99 flexible array member */
+};
+
struct nh_info {
struct hlist_node dev_hash;
struct net *net;
@@ -47,6 +64,7 @@ struct nh_info {
struct nexthop {
struct rb_node rb_node;
+ struct list_head grp_list; /* nh group entries using this nh */
struct list_head fi_list; /* v4 entries using nh */
struct list_head f6i_list; /* v6 entries using nh */
@@ -54,12 +72,15 @@ struct nexthop {
u8 protocol;
u8 nh_flags;
+ u8 is_group:1,
+ unused:7;
refcount_t refcnt;
struct rcu_head rcu;
union {
struct nh_info __rcu *nh_info;
+ struct nh_group __rcu *nh_grp;
};
};
@@ -81,6 +102,9 @@ struct nh_config {
struct in6_addr ipv6;
} gw;
+ struct nlattr *nh_grp;
+ u16 nh_grp_type;
+
u32 nlflags;
struct nl_info nlinfo;
};
@@ -88,42 +112,61 @@ struct nh_config {
void nexthop_get(struct nexthop *nh);
void nexthop_put(struct nexthop *nh);
+/* Nexthops compare by identity: true only when both pointers name the
+ * same object.
+ */
+static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2)
+{
+	if (nh1 != nh2)
+		return false;
+
+	return true;
+}
+
/* caller is holding rtnl; no reference taken to nexthop */
struct nexthop *nexthop_find_by_id(struct net *net, u32 id);
-static inline bool nexthop_cmp(struct nexthop *nh1, struct nexthop *nh2)
+/* called with rcu lock */
+static inline bool nexthop_is_multipath(const struct nexthop *nh)
{
- return nh1 == nh2;
+ if (nh->is_group) {
+ struct nh_group *nh_grp;
+
+ nh_grp = rcu_dereference(nh->nh_grp);
+ return !!nh_grp->mpath;
+ }
+ return false;
}
+struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel);
+
+/* called with rcu lock */
static inline int nexthop_num_path(struct nexthop *nh)
{
+ if (nexthop_is_multipath(nh)) {
+ struct nh_group *nh_grp;
+
+ nh_grp = rcu_dereference(nh->nh_grp);
+ return nh_grp->num_nh_set;
+ }
+
return 1;
}
-/* called with rcu lock */
+void nexthop_select_path(struct net *net, struct fib_result *res, int hash);
+
static inline bool nexthop_has_gw(struct nexthop *nh)
{
- struct nh_info *nhi;
-
- nhi = rcu_dereference(nh->nh_info);
- return !!nhi->has_gw;
+ return !!nh->nh_info->has_gw;
}
-/* called with rcu lock */
static inline bool nexthop_is_blackhole(struct nexthop *nh)
{
- struct nh_info *nhi;
-
- nhi = rcu_dereference(nh->nh_info);
- return !!nhi->reject_nh;
+ return !nexthop_is_multipath(nh) && !!nh->nh_info->reject_nh;
}
static inline struct fib_nh *nexthop_fib_nh(struct nexthop *nh, int nhsel)
{
struct nh_info *nhi;
- nhi = rcu_dereference(nh->nh_info);
+ if (nexthop_is_multipath(nh))
+ nh = nexthop_mpath_select(nh, nhsel);
+
+ nhi = nh->nh_info;
if (nhi->family == AF_INET ||
nhi->family == AF_UNSPEC) /* dev only re-uses IPv4 struct */
return &nhi->fib_nh;
@@ -164,11 +207,11 @@ static inline __be32 fib_info_nh_gw(struct fib_info *fi)
*/
static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh)
{
- struct nh_info *nhi;
+ if (nexthop_is_multipath(nh))
+ nh = nexthop_mpath_select(nh, 0);
- nhi = rcu_dereference(nh->nh_info);
- if (nhi->family == AF_INET6)
- return &nhi->fib6_nh;
+ if (nh->nh_info->family == AF_INET6)
+ return &nh->nh_info->fib6_nh;
return NULL;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index c91cdafd40ec..0ddf14512bb3 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1821,7 +1821,10 @@ void fib_select_path(struct net *net, struct fib_result *res,
goto check_saddr;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi->fib_nhs > 1) {
+ if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) {
+ h = fib_multipath_hash(net, fl4, skb, NULL);
+ nexthop_select_path(net, res, h);
+ } else if (res->fi->fib_nhs > 1) {
h = fib_multipath_hash(net, fl4, skb, NULL);
fib_select_multipath(res, h);
}
diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c
index 1e77fa94e562..f0b4151c661a 100644
--- a/net/ipv4/nexthop.c
+++ b/net/ipv4/nexthop.c
@@ -35,6 +35,8 @@ static const struct nla_policy rtm_nh_policy[NHA_MAX + 1] = {
[NHA_TABLE_ID] = { .type = NLA_U32 },
[NHA_BLACKHOLE] = { .type = NLA_FLAG },
[NHA_MASTER] = { .type = NLA_U32 },
+ [NHA_GROUP_TYPE] = { .type = NLA_U16 },
+ [NHA_GROUPS] = { .type = NLA_FLAG },
};
static unsigned int nh_dev_hashfn(unsigned int val)
@@ -67,19 +69,35 @@ static void nexthop_devhash_add(struct net *net, struct nh_info *nhi)
static void nexthop_free_rcu(struct rcu_head *head)
{
struct nexthop *nh = container_of(head, struct nexthop, rcu);
- struct nh_info *nhi;
- nhi = rcu_dereference_raw(nh->nh_info);
- switch (nhi->family) {
- case AF_INET:
- case AF_UNSPEC:
- fib_nh_release(nhi->net, &nhi->fib_nh);
- break;
- case AF_INET6:
- fib6_nh_release(&nhi->fib6_nh);
- break;
+ if (nh->is_group) {
+ struct nh_group *nh_grp;
+ int i;
+
+ nh_grp = rcu_dereference_raw(nh->nh_grp);
+ for (i = 0; i < nh_grp->num_nh; ++i) {
+ if (!nh_grp->nh_entries[i].nh)
+ continue;
+
+ list_del(&nh_grp->nh_entries[i].nh_list);
+ nexthop_put(nh_grp->nh_entries[i].nh);
+ }
+ kfree(nh_grp);
+ } else {
+ struct nh_info *nhi;
+
+ nhi = rcu_dereference_raw(nh->nh_info);
+ switch (nhi->family) {
+ case AF_INET:
+ case AF_UNSPEC:
+ fib_nh_release(nhi->net, &nhi->fib_nh);
+ break;
+ case AF_INET6:
+ fib6_nh_release(&nhi->fib6_nh);
+ break;
+ }
+ kfree(nhi);
}
- kfree(nhi);
kfree(nh);
}
@@ -89,6 +107,33 @@ static struct nexthop *nexthop_alloc(void)
return kzalloc(sizeof(struct nexthop), GFP_KERNEL);
}
+/* Allocate a group nexthop with room for @num_nh member entries.
+ *
+ * The nexthop and its variable-sized nh_group are two separate, zeroed
+ * allocations. On success the nexthop has is_group set and both entry
+ * counters initialised to @num_nh.
+ *
+ * Returns the new nexthop or ERR_PTR(-ENOMEM); the result is never NULL,
+ * so callers must test it with IS_ERR().
+ */
+static struct nexthop *nexthop_grp_alloc(u16 num_nh)
+{
+	struct nh_group *nh_grp;
+	struct nexthop *nh;
+	size_t sz;
+
+	nh = nexthop_alloc();
+	if (!nh)
+		return ERR_PTR(-ENOMEM);
+
+	/* only the group header and its entries live in this allocation;
+	 * no part of struct nexthop is stored here
+	 */
+	sz = sizeof(*nh_grp) + sizeof(nh_grp->nh_entries[0]) * num_nh;
+	nh_grp = kzalloc(sz, GFP_KERNEL);
+	if (!nh_grp) {
+		kfree(nh);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	nh->is_group = 1;
+	nh_grp->num_nh = num_nh;
+	nh_grp->num_nh_set = num_nh;
+	rcu_assign_pointer(nh->nh_grp, nh_grp);
+
+	return nh;
+}
+
+
static void nh_base_seq_inc(struct net *net)
{
while (++net->nexthop.seq == 0)
@@ -173,23 +218,166 @@ static size_t nh_nlmsg_size_ipv4(struct nh_info *nhi)
static size_t nh_nlmsg_size(struct nexthop *nh)
{
- struct nh_info *nhi = rtnl_dereference(nh->nh_info);
size_t sz = nla_total_size(4); /* NHA_ID */
- /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
- * are mutually exclusive
- */
- sz += nla_total_size(4); /* NHA_OIF */
+ if (nh->is_group) {
+ struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+ /* NHA_GROUP carries one uapi struct nexthop_grp per populated
+ * entry, not the kernel-internal struct nh_group
+ */
+ size_t sz2 = sizeof(struct nexthop_grp) * nh_grp->num_nh_set;
- if (nhi->family == AF_INET)
- sz += nh_nlmsg_size_ipv4(nhi);
+ sz += nla_total_size(sz2) /* NHA_GROUP */
+ + nla_total_size(2); /* NHA_GROUP_TYPE */
+ } else {
+ struct nh_info *nhi = rtnl_dereference(nh->nh_info);
- else if (nhi->family == AF_INET6)
- sz += nh_nlmsg_size_ipv6(nhi);
+ /* covers NHA_BLACKHOLE since NHA_OIF and BLACKHOLE
+ * are mutually exclusive
+ */
+ sz += nla_total_size(4); /* NHA_OIF */
+
+ if (nhi->family == AF_INET)
+ sz += nh_nlmsg_size_ipv4(nhi);
+ else if (nhi->family == AF_INET6)
+ sz += nh_nlmsg_size_ipv6(nhi);
+ }
return sz;
}
+/* Decide whether @nh may become a member of a nexthop group.
+ *
+ * A multipath group is refused (no group within a group) and so is a
+ * blackhole nexthop; the reason is reported through @extack.
+ * Returns true when @nh is acceptable.
+ */
+static bool valid_group_nh(struct nexthop *nh, struct netlink_ext_ack *extack)
+{
+	struct nh_info *nhi;
+
+	if (nh->is_group) {
+		struct nh_group *grp = rtnl_dereference(nh->nh_grp);
+
+		if (!grp->mpath)
+			return true;
+
+		/* nested multipath is not supported */
+		NL_SET_ERR_MSG(extack,
+			       "Multipath group can not be a nexthop within a group");
+		return false;
+	}
+
+	nhi = rtnl_dereference(nh->nh_info);
+	if (!nhi->reject_nh)
+		return true;
+
+	NL_SET_ERR_MSG(extack,
+		       "Blackhole nexthop can not be used in a group");
+	return false;
+}
+
+
+/* Validate the NHA_GROUP attribute of a request.
+ *
+ * The payload must be a non-empty array of struct nexthop_grp. Every id in
+ * it must name an existing nexthop that valid_group_nh() accepts, and no
+ * attribute numbered above NHA_GROUP may be present in @tb.
+ *
+ * Returns 0 when the attribute is acceptable, -EINVAL (with @extack set)
+ * otherwise.
+ */
+static int nh_check_attr_group(struct net *net, struct nlattr *tb[],
+			       struct netlink_ext_ack *extack)
+{
+	unsigned int len = nla_len(tb[NHA_GROUP]);
+	struct nexthop_grp *nhg;
+	unsigned int i;
+
+	/* the elements are uapi struct nexthop_grp, not the kernel-internal
+	 * struct nh_group; an empty array would create a group with no
+	 * members
+	 */
+	if (!len || len % sizeof(*nhg)) {
+		NL_SET_ERR_MSG(extack,
+			       "Invalid length for nexthop group attribute");
+		return -EINVAL;
+	}
+
+	/* convert len to number of nexthop ids */
+	len /= sizeof(*nhg);
+
+	nhg = nla_data(tb[NHA_GROUP]);
+	for (i = 0; i < len; ++i) {
+		struct nexthop *nh;
+
+		nh = nexthop_find_by_id(net, nhg[i].id);
+		if (!nh) {
+			NL_SET_ERR_MSG(extack, "Invalid nexthop id");
+			return -EINVAL;
+		}
+		if (!valid_group_nh(nh, extack))
+			return -EINVAL;
+	}
+
+	/* a group is described by NHA_GROUP alone */
+	for (i = NHA_GROUP + 1; i < __NHA_MAX; ++i) {
+		if (!tb[i])
+			continue;
+
+		NL_SET_ERR_MSG(extack,
+			       "No other attributes can be set in nexthop groups");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+
+/* Append NHA_GROUP_TYPE and NHA_GROUP for @nh_grp to @skb.
+ *
+ * NHA_GROUP holds one struct nexthop_grp (id and weight) for each entry
+ * that still has a nexthop; emptied slots are skipped.
+ *
+ * Returns 0 on success or -EMSGSIZE when @skb has no room left.
+ */
+static int nla_put_nh_group(struct sk_buff *skb, struct nh_group *nh_grp)
+{
+	struct nexthop_grp *p;
+	struct nlattr *nla;
+	u16 group_type = 0;
+	size_t len;
+	int i;
+
+	if (nh_grp->mpath)
+		group_type = NEXTHOP_GRP_TYPE_MPATH;
+
+	if (nla_put_u16(skb, NHA_GROUP_TYPE, group_type))
+		goto nla_put_failure;
+
+	/* size the payload by the element type that is written below */
+	len = nh_grp->num_nh_set * sizeof(*p);
+	nla = nla_reserve(skb, NHA_GROUP, len);
+	if (!nla)
+		goto nla_put_failure;
+
+	p = nla_data(nla);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		if (!nh_grp->nh_entries[i].nh)
+			continue;
+
+		p->id = nh_grp->nh_entries[i].nh->id;
+		p->weight = nh_grp->nh_entries[i].weight;
+		p += 1;
+	}
+
+	return 0;
+
+nla_put_failure:
+	return -EMSGSIZE;
+}
+
+
+/* Recompute the hash thresholds of every entry in @nhg.
+ *
+ * Each populated entry gets
+ *	round(cumulative_weight * 2^31 / total_weight) - 1
+ * as its upper bound; emptied slots get -1.
+ *
+ * Weights are u32, so the accumulators are unsigned: summing them into a
+ * signed int would be undefined on overflow and would sign-extend into the
+ * 64-bit shift below.
+ * NOTE(review): the sum of the weights is not bounded here and can still
+ * wrap; limiting the weights belongs in attribute validation.
+ */
+static void nh_group_rebalance(struct nh_group *nhg)
+{
+	struct nh_grp_entry *nhge;
+	unsigned int total = 0;
+	unsigned int w = 0;
+	int i;
+
+	for (i = 0; i < nhg->num_nh; ++i) {
+		nhge = &nhg->nh_entries[i];
+
+		if (!nhge->nh)
+			continue;
+
+		total += nhge->weight;
+	}
+
+	for (i = 0; i < nhg->num_nh; ++i) {
+		int upper_bound;
+
+		nhge = &nhg->nh_entries[i];
+		if (!nhge->nh) {
+			/* below any non-negative hash value */
+			upper_bound = -1;
+		} else {
+			w += nhge->weight;
+			upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
+							    total) - 1;
+		}
+
+		atomic_set(&nhge->upper_bound, upper_bound);
+	}
+}
+
+
static const struct net_device *nh_info_dev(const struct nh_info *nhi)
{
switch (nhi->family) {
@@ -219,8 +407,25 @@ bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev)
const struct nh_info *nhi;
bool dev_match = false;
- nhi = rcu_dereference(nh->nh_info);
- dev_match = nh_info_uses_dev(nhi, dev);
+ if (nh->is_group) {
+ const struct nh_group *nh_grp;
+ int i;
+
+ nh_grp = rcu_dereference(nh->nh_grp);
+ for (i = 0; i < nh_grp->num_nh; ++i) {
+ const struct nh_grp_entry *nhge;
+
+ nhge = &nh_grp->nh_entries[i];
+ nhi = rcu_dereference(nhge->nh->nh_info);
+ dev_match = nh_info_uses_dev(nhi, dev);
+ if (dev_match)
+ break;
+ }
+
+ } else {
+ nhi = rcu_dereference(nh->nh_info);
+ dev_match = nh_info_uses_dev(nhi, dev);
+ }
return dev_match;
}
@@ -249,6 +454,14 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
if (nla_put_u32(skb, NHA_ID, nh->id))
goto nla_put_failure;
+ if (nh->is_group) {
+ struct nh_group *nh_grp = rtnl_dereference(nh->nh_grp);
+
+ if (nla_put_nh_group(skb, nh_grp))
+ goto nla_put_failure;
+ goto end;
+ }
+
nhi = rtnl_dereference(nh->nh_info);
if (nhi->reject_nh && nla_put_flag(skb, NHA_BLACKHOLE))
goto nla_put_failure;
@@ -281,6 +494,7 @@ static int nh_fill_node(struct sk_buff *skb, struct nexthop *nh,
break;
}
+end:
nlmsg_end(skb, nlh);
return 0;
@@ -315,6 +529,50 @@ static void nexthop_notify(int event, struct nexthop *nh, struct nl_info *info)
rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
+/* Detach one member slot from its group.
+ *
+ * The slot is unlinked from the member's grp_list, the reference on the
+ * member is dropped and the slot is marked empty. The parent group's count
+ * of populated slots is decremented and, when @rebalance is true, the
+ * thresholds of the remaining entries are recomputed.
+ */
+static void remove_nh_grp_entry(struct nh_grp_entry *nhge, bool rebalance)
+{
+	struct nh_group *grp = rtnl_dereference(nhge->nh_parent->nh_grp);
+	struct nexthop *member = nhge->nh;
+
+	list_del(&nhge->nh_list);
+	nexthop_put(member);
+	nhge->nh = NULL;
+
+	grp->num_nh_set--;
+	if (!rebalance)
+		return;
+
+	nh_group_rebalance(grp);
+}
+
+
+/* Drop @nh from every group that lists it as a member.
+ *
+ * Each affected group is rebalanced; a group left without any populated
+ * entry is removed as well, with @skip_fib and @nlinfo passed through.
+ */
+static void remove_nexthop_from_groups(struct net *net, struct nexthop *nh,
+				       bool skip_fib, struct nl_info *nlinfo)
+{
+	struct nh_grp_entry *nhge, *tmp;
+
+	list_for_each_entry_safe(nhge, tmp, &nh->grp_list, nh_list) {
+		struct nexthop *parent = nhge->nh_parent;
+		struct nh_group *grp;
+
+		remove_nh_grp_entry(nhge, true);
+
+		grp = rtnl_dereference(parent->nh_grp);
+		if (grp->num_nh_set)
+			continue;
+
+		/* that was the last member: the group goes away too */
+		remove_nexthop(net, parent, skip_fib, nlinfo);
+	}
+}
+
+
+/* Release every populated member slot of group nexthop @nh.
+ * No rebalancing is requested while the slots are emptied.
+ */
+static void remove_nexthop_group(struct nexthop *nh)
+{
+	struct nh_group *grp = rtnl_dereference(nh->nh_grp);
+	struct nh_grp_entry *nhge;
+	int i;
+
+	for (i = 0; i < grp->num_nh; ++i) {
+		nhge = &grp->nh_entries[i];
+		if (!nhge->nh)
+			continue;
+
+		remove_nh_grp_entry(nhge, false);
+	}
+}
+
+
static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
{
struct fib6_info *f6i, *tmp;
@@ -339,13 +597,19 @@ static void __remove_nexthop_fib(struct net *net, struct nexthop *nh)
static void __remove_nexthop(struct net *net, struct nexthop *nh,
bool skip_fib, struct nl_info *nlinfo)
{
- const struct net_device *dev;
- struct nh_info *nhi;
+ if (nh->is_group) {
+ remove_nexthop_group(nh);
+ } else {
+ const struct net_device *dev;
+ struct nh_info *nhi;
- nhi = rtnl_dereference(nh->nh_info);
- dev = nh_info_dev(nhi);
- if (dev)
- hlist_del(&nhi->dev_hash);
+ nhi = rtnl_dereference(nh->nh_info);
+ dev = nh_info_dev(nhi);
+ if (dev)
+ hlist_del(&nhi->dev_hash);
+
+ remove_nexthop_from_groups(net, nh, skip_fib, nlinfo);
+ }
if (!skip_fib)
__remove_nexthop_fib(net, nh);
}
@@ -362,21 +626,46 @@ static void remove_nexthop(struct net *net, struct nexthop *nh,
nexthop_put(nh);
- nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
+ if (nlinfo)
+ nexthop_notify(RTM_DELNEXTHOP, nh, nlinfo);
}
static int replace_nexthop(struct net *net, struct nexthop *old,
struct nexthop *new, struct netlink_ext_ack *extack)
{
- struct nh_info *oldi, *newi;
+ if (old->is_group) {
+ struct nh_group *oldg, *newg;
+ int i;
- oldi = rtnl_dereference(old->nh_info);
- newi = rtnl_dereference(new->nh_info);
- rcu_assign_pointer(old->nh_info, newi);
- rcu_assign_pointer(new->nh_info, oldi);
+ if (!new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop group with a nexthop.");
+ return -EINVAL;
+ }
+ oldg = rtnl_dereference(old->nh_grp);
+ newg = rtnl_dereference(new->nh_grp);
+ rcu_assign_pointer(old->nh_grp, newg);
+ rcu_assign_pointer(new->nh_grp, oldg);
+
+ /* update parents - used by nexthop code for cleanup */
+ for (i = 0; i < newg->num_nh; ++i)
+ newg->nh_entries[i].nh_parent = old;
+ for (i = 0; i < oldg->num_nh; ++i)
+ oldg->nh_entries[i].nh_parent = new;
+ } else {
+ struct nh_info *oldi, *newi;
- newi->nh_parent = old;
- oldi->nh_parent = new;
+ if (new->is_group) {
+ NL_SET_ERR_MSG(extack, "Can not replace a nexthop with a nexthop group.");
+ return -EINVAL;
+ }
+ oldi = rtnl_dereference(old->nh_info);
+ newi = rtnl_dereference(new->nh_info);
+ rcu_assign_pointer(old->nh_info, newi);
+ rcu_assign_pointer(new->nh_info, oldi);
+
+ newi->nh_parent = old;
+ oldi->nh_parent = new;
+ }
old->protocol = new->protocol;
old->nh_flags = new->nh_flags;
@@ -491,10 +780,16 @@ int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
struct netlink_ext_ack *extack)
{
struct nexthop *nh = fi->nh;
- struct nh_info *nhi;
- nhi = rtnl_dereference(nh->nh_info);
- if (nhi->family != AF_UNSPEC) {
+ if (nh->is_group) {
+ if (cfg->fc_scope == RT_SCOPE_HOST) {
+ NL_SET_ERR_MSG(extack, "Route with host scope can not have multiple nexthops");
+ return -EINVAL;
+ }
+ return 0;
+ }
+
+ if (nh->nh_info->family != AF_UNSPEC) {
if (nh->nh_flags & RTNH_F_ONLINK &&
cfg->fc_scope >= RT_SCOPE_LINK) {
NL_SET_ERR_MSG(extack, "Scope mismatch with nexthop");
@@ -505,6 +800,57 @@ int fib_check_nexthop(struct fib_info *fi, struct fib_config *cfg,
return 0;
}
+/* Pick the path of a group nexthop for a flow hash.
+ *
+ * Entries are scanned in order; those whose upper bound lies below @hash
+ * are skipped. The first remaining entry that fib_good_nh() accepts is
+ * stored in res->nh. If none is accepted, res->nh is left pointing at the
+ * first entry that passed the hash test. When no entry passes the hash
+ * test, res->nh is not modified.
+ *
+ * Dereferences RCU-protected pointers with rcu_dereference(), so the
+ * caller is expected to be inside an RCU read-side critical section.
+ * NOTE(review): always hands out the member's IPv4 fib_nh - confirm that
+ * only IPv4 / device-only members can reach this path.
+ */
+void nexthop_select_path(struct net *net, struct fib_result *res, int hash)
+{
+	struct fib_info *fi = res->fi;
+	struct nexthop *nh = fi->nh;
+	struct nh_group *nh_grp;
+	bool first = false;
+	int i;
+
+	/* nh_grp shares a union with nh_info: never read a plain nexthop
+	 * as a group
+	 */
+	if (WARN_ON(!nh->is_group))
+		return;
+
+	nh_grp = rcu_dereference(nh->nh_grp);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		struct nh_grp_entry *nhge = &nh_grp->nh_entries[i];
+		struct fib_nh *fib_nh;
+		struct nh_info *nhi;
+
+		if (hash > atomic_read(&nhge->upper_bound))
+			continue;
+
+		/* nh_info is __rcu annotated; read it the same way
+		 * nexthop_uses_dev() does
+		 */
+		nhi = rcu_dereference(nhge->nh->nh_info);
+		fib_nh = &nhi->fib_nh;
+
+		/* nexthops always check if it is good and does
+		 * not rely on a sysctl for this behavior
+		 */
+		if (fib_good_nh(fib_nh)) {
+			res->nh = fib_nh;
+			return;
+		}
+		if (!first) {
+			res->nh = fib_nh;
+			first = true;
+		}
+	}
+}
+
+
+/* Return the @nhsel'th populated member (counting from 0) of group @nh,
+ * skipping emptied slots, or NULL when the group has no such member.
+ */
+struct nexthop *nexthop_mpath_select(struct nexthop *nh, int nhsel)
+{
+	struct nh_group *grp = rcu_dereference(nh->nh_grp);
+	int populated = 0;
+	int slot;
+
+	for (slot = 0; slot < grp->num_nh; ++slot) {
+		struct nexthop *member = grp->nh_entries[slot].nh;
+
+		if (!member)
+			continue;
+
+		if (populated == nhsel)
+			return member;
+		++populated;
+	}
+
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(nexthop_mpath_select);
+
static int nh_check_attr(struct nhmsg *nhm, struct nlattr *tb[],
struct net *net, struct netlink_ext_ack *extack)
{
@@ -557,6 +903,19 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
if (tb[NHA_ID])
cfg->nh_id = nla_get_u32(tb[NHA_ID]);
+ if (tb[NHA_GROUP]) {
+ cfg->nh_grp = tb[NHA_GROUP];
+
+ cfg->nh_grp_type = NEXTHOP_GRP_TYPE_MPATH;
+ if (tb[NHA_GROUP_TYPE])
+ cfg->nh_grp_type = nla_get_u16(tb[NHA_GROUP_TYPE]);
+
+ if (cfg->nh_grp_type > NEXTHOP_GRP_TYPE_MAX) {
+ NL_SET_ERR_MSG(extack, "Invalid group type");
+ goto out;
+ }
+ }
+
if (tb[NHA_OIF]) {
cfg->nh_ifindex = nla_get_u32(tb[NHA_OIF]);
@@ -644,6 +1003,14 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb,
goto out;
}
+ if (tb[NHA_GROUP]) {
+ err = nh_check_attr_group(net, tb, extack);
+ if (err)
+ goto out;
+
+ return 0;
+ }
+
err = 0;
out:
return err;
@@ -791,7 +1158,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh,
return err;
}
-static int nh_create_ipv6(struct net *net, struct nexthop *nh,
+static int nh_create_ipv6(struct net *net, struct nexthop *nh,
struct nh_info *nhi, struct nh_config *cfg,
struct netlink_ext_ack *extack)
{
@@ -856,10 +1223,47 @@ static int nh_create_unspec(struct net *net, struct nexthop *nh,
static void nexthop_init_common(struct nexthop *nh)
{
+ INIT_LIST_HEAD(&nh->grp_list);
INIT_LIST_HEAD(&nh->fi_list);
INIT_LIST_HEAD(&nh->f6i_list);
}
+/* Build a group nexthop from the NHA_GROUP attribute stored in @cfg.
+ *
+ * One entry is created per struct nexthop_grp element: the member is
+ * looked up by id, a reference is taken on it and the entry is linked
+ * into the member's grp_list. A weight of 0 is stored as 1. Multipath
+ * groups get their hash thresholds computed before returning.
+ *
+ * Returns the new nexthop or an ERR_PTR() on allocation failure.
+ * NOTE(review): the lookup result is used unchecked; this presumes the ids
+ * were already validated by nh_check_attr_group() under the same rtnl
+ * hold - confirm against the caller.
+ */
+static struct nexthop *nexthop_create_group(struct net *net,
+					    struct nh_config *cfg)
+{
+	struct nlattr *grps_attr = cfg->nh_grp;
+	struct nexthop_grp *entry = nla_data(grps_attr);
+	struct nh_group *nh_grp;
+	struct nexthop *nh;
+	int i;
+
+	/* the allocator reports failure with ERR_PTR(), never with NULL */
+	nh = nexthop_grp_alloc(nla_len(grps_attr) / sizeof(*entry));
+	if (IS_ERR(nh))
+		return nh;
+
+	nexthop_init_common(nh);
+
+	nh_grp = rtnl_dereference(nh->nh_grp);
+	for (i = 0; i < nh_grp->num_nh; ++i) {
+		struct nexthop *nhe;
+
+		nhe = nexthop_find_by_id(net, entry[i].id);
+		nexthop_get(nhe);
+
+		nh_grp->nh_entries[i].nh = nhe;
+		nh_grp->nh_entries[i].weight = entry[i].weight ? : 1;
+		list_add(&nh_grp->nh_entries[i].nh_list, &nhe->grp_list);
+		nh_grp->nh_entries[i].nh_parent = nh;
+	}
+
+	if (cfg->nh_grp_type == NEXTHOP_GRP_TYPE_MPATH) {
+		nh_grp->mpath = 1;
+		nh_group_rebalance(nh_grp);
+	}
+
+	return nh;
+}
+
+
static struct nexthop *nexthop_create(struct net *net, struct nh_config *cfg,
struct netlink_ext_ack *extack)
{
@@ -929,7 +1333,11 @@ static struct nexthop *nexthop_add(struct net *net, struct nh_config *cfg,
}
}
- nh = nexthop_create(net, cfg, extack);
+ if (cfg->nh_grp)
+ nh = nexthop_create_group(net, cfg);
+ else
+ nh = nexthop_create(net, cfg, extack);
+
if (IS_ERR(nh))
return nh;
@@ -968,19 +1376,25 @@ static int rtm_new_nexthop(struct sk_buff *skb, struct nlmsghdr *nlh,
return err;
}
-static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
+static bool nh_dump_filtered(struct nexthop *nh, int dev_idx, int group_filter,
int master_idx, u8 family)
{
const struct net_device *dev;
const struct nh_info *nhi;
- if (dev_idx || master_idx || family)
+ if (group_filter && !nh->is_group)
+ return true;
+
+ if ((dev_idx || master_idx || family) && nh->is_group)
return true;
nhi = rtnl_dereference(nh->nh_info);
- if (family && nhi->family != family)
+ if (family && !nh->is_group && nhi->family != family)
return true;
+ if (nh->is_group)
+ return false;
+
dev = nh_info_dev(nhi);
if (dev_idx && (!dev || dev->ifindex != dev_idx))
return true;
@@ -998,7 +1412,7 @@ static bool nh_dump_filtered(struct nexthop *nh, int dev_idx,
/* rtnl */
static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
{
- int dev_filter_idx = 0, master_idx = 0;
+ int group_filter = 0, dev_filter_idx = 0, master_idx = 0;
struct net *net = sock_net(skb->sk);
struct rb_root *root = &net->nexthop.root;
struct nlattr *tb[NHA_MAX + 1];
@@ -1010,6 +1424,9 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
if (nlmsg_parse(cb->nlh, sizeof(*nhm), tb, NHA_MAX,
rtm_nh_policy, NULL) >= 0) {
+ if (tb[NHA_GROUPS])
+ group_filter = 1;
+
if (tb[NHA_OIF])
dev_filter_idx = nla_get_u32(tb[NHA_OIF]);
@@ -1027,8 +1444,8 @@ static int rtm_dump_nexthop(struct sk_buff *skb, struct netlink_callback *cb)
goto cont;
nh = rb_entry(node, struct nexthop, rb_node);
- if (nh_dump_filtered(nh, dev_filter_idx, master_idx,
- nhm->nh_family))
+ if (nh_dump_filtered(nh, dev_filter_idx, group_filter,
+ master_idx, nhm->nh_family))
goto cont;
err = nh_fill_node(skb, nh, RTM_NEWNEXTHOP,
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1297c7c934a8..4c16715607e0 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,6 +112,7 @@
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
+#include <net/nexthop.h>
#include "fib_lookup.h"
@@ -1887,10 +1888,17 @@ static int ip_mkroute_input(struct sk_buff *skb,
struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
- if (res->fi && res->fi->fib_nhs > 1) {
- int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
-
- fib_select_multipath(res, h);
+ if (res->fi) {
+ struct net *net = res->fi->fib_net;
+ int h;
+
+ if (res->fi->nh && nexthop_is_multipath(res->fi->nh)) {
+ h = fib_multipath_hash(net, NULL, skb, hkeys);
+ nexthop_select_path(net, res, h);
+ } else if (res->fi->fib_nhs > 1) {
+ h = fib_multipath_hash(net, NULL, skb, hkeys);
+ fib_select_multipath(res, h);
+ }
}
#endif
--
2.11.0
next prev parent reply other threads:[~2018-09-01 5:20 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-09-01 0:49 [PATCH RFC net-next 00/18] net: Improve route scalability via support for nexthop objects dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 01/18] net: Rename net/nexthop.h net/rtnh.h dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 02/18] net: ipv4: export fib_good_nh and fib_flush dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 03/18] net/ipv4: export fib_info_update_nh_saddr dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 04/18] net/ipv4: export fib_check_nh dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 05/18] net/ipv4: Define fib_get_nhs when CONFIG_IP_ROUTE_MULTIPATH is disabled dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 06/18] net/ipv4: Create init and release helpers for fib_nh dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 07/18] net: ipv4: Add fib_nh to fib_result dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 08/18] net/ipv4: Move device validation to helper dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 09/18] net/ipv6: Create init and release helpers for fib6_nh dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 10/18] net/ipv6: Make fib6_nh optional at the end of fib6_info dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 11/18] net: Initial nexthop code dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 12/18] net/ipv4: Add nexthop helpers for ipv4 integration dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 13/18] net/ipv4: Convert existing use of fib_info to new helpers dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 14/18] net/ipv4: Allow routes to use nexthop objects dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 15/18] net/ipv6: Use helpers to access fib6_nh data dsahern
2018-09-01 0:49 ` [PATCH RFC net-next 16/18] net/ipv6: Allow routes to use nexthop objects dsahern
2018-09-01 0:49 ` dsahern [this message]
2018-09-01 0:49 ` [PATCH RFC net-next 18/18] net/ipv4: Optimization for fib_info lookup dsahern
2018-09-01 20:43 ` Stephen Hemminger
2018-09-04 15:27 ` David Ahern
2018-09-01 0:49 ` [PATCH iproute2-next] ip: Add support for nexthop objects dsahern
2018-09-01 20:37 ` Stephen Hemminger
2018-09-04 15:30 ` David Ahern
2018-09-02 17:34 ` [PATCH RFC net-next 00/18] net: Improve route scalability via " David Miller
2018-09-04 15:57 ` David Ahern
2018-12-11 12:52 ` Jan Maria Matejka
2018-12-12 20:27 ` David Ahern
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20180901004954.7145-18-dsahern@kernel.org \
--to=dsahern@kernel.org \
--cc=davem@davemloft.net \
--cc=dsahern@gmail.com \
--cc=idosch@mellanox.com \
--cc=netdev@vger.kernel.org \
--cc=roopa@cumulusnetworks.com \
--cc=sharpd@cumulusnetworks.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.