From: Robert Shearman <rshearma@brocade.com>
To: Roopa Prabhu <roopa@cumulusnetworks.com>, <ebiederm@xmission.com>
Cc: <davem@davemloft.net>, <netdev@vger.kernel.org>
Subject: Re: [PATCH net-next v2] mpls: support for dead routes
Date: Tue, 3 Nov 2015 15:08:20 +0000 [thread overview]
Message-ID: <5638CDE4.2080501@brocade.com> (raw)
In-Reply-To: <1446527581-64787-1-git-send-email-roopa@cumulusnetworks.com>
On 03/11/15 05:13, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>
> Adds support for RTNH_F_DEAD and RTNH_F_LINKDOWN flags on mpls
> routes due to link events. Also adds code to ignore dead
> routes during route selection
>
> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
> ---
> RFC to v1:
> Addressed a few comments from Eric and Robert:
> - remove support for weighted nexthops
> - Use rt_nhn_alive in the rt structure to keep count of alive
> routes.
> What i have not done is: sort nexthops on link events.
> I am not comfortable recreating or sorting nexthops on
> every carrier change. This leaves scope for optimizing in the future
>
> v1 to v2:
> Fix dead nexthop checks as suggested by dave
>
> net/mpls/af_mpls.c | 191 ++++++++++++++++++++++++++++++++++++++++++++--------
> net/mpls/internal.h | 3 +
> 2 files changed, 166 insertions(+), 28 deletions(-)
>
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index c70d750..5e88118 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -96,22 +96,15 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
> }
> EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
>
> -static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
> - struct sk_buff *skb, bool bos)
> +static u32 mpls_multipath_hash(struct mpls_route *rt,
> + struct sk_buff *skb, bool bos)
> {
> struct mpls_entry_decoded dec;
> struct mpls_shim_hdr *hdr;
> bool eli_seen = false;
> int label_index;
> - int nh_index = 0;
> u32 hash = 0;
>
> - /* No need to look further into packet if there's only
> - * one path
> - */
> - if (rt->rt_nhn == 1)
> - goto out;
> -
> for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
> label_index++) {
> if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
> @@ -165,9 +158,37 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
> }
> }
>
> - nh_index = hash % rt->rt_nhn;
> + return hash;
> +}
> +
> +static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
> + struct sk_buff *skb, bool bos)
> +{
> + u32 hash = 0;
> + int nh_index;
> + int n = 0;
> +
> + /* No need to look further into packet if there's only
> + * one path
> + */
> + if (rt->rt_nhn == 1)
> + goto out;
> +
> + if (rt->rt_nhn_alive <= 0)
> + return NULL;
> +
> + hash = mpls_multipath_hash(rt, skb, bos);
> + nh_index = hash % rt->rt_nhn_alive;
> + for_nexthops(rt) {
This should be optimised in the common rt->rt_nhn_alive == rt->rt_nhn
case by doing the direct array index and avoid the nh walk.
> + if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
> + continue;
> + if (n == nh_index)
> + return nh;
> + n++;
> + } endfor_nexthops(rt);
> +
> out:
> - return &rt->rt_nh[nh_index];
> + return &rt->rt_nh[0];
> }
>
> static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
> @@ -365,6 +386,7 @@ static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen)
> GFP_KERNEL);
> if (rt) {
> rt->rt_nhn = num_nh;
> + rt->rt_nhn_alive = num_nh;
> rt->rt_max_alen = max_alen_aligned;
> }
>
> @@ -536,6 +558,15 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt,
>
> RCU_INIT_POINTER(nh->nh_dev, dev);
>
> + if (!netif_carrier_ok(dev))
> + nh->nh_flags |= RTNH_F_LINKDOWN;
> +
> + if (!(dev->flags & IFF_UP))
> + nh->nh_flags |= RTNH_F_DEAD;
Below you're testing IFF_RUNNING | IFF_LOWER_UP. Is the difference
intended here?
> +
> + if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
> + rt->rt_nhn_alive--;
You don't update your new rt->rt_flags field here. This highlights the
issue with duplicating data, which you're doing with the rt_flags field.
> +
> return 0;
>
> errout:
> @@ -577,7 +608,7 @@ errout:
> }
>
> static int mpls_nh_build(struct net *net, struct mpls_route *rt,
> - struct mpls_nh *nh, int oif,
> + struct mpls_nh *nh, int oif, int hops,
This change isn't required.
> struct nlattr *via, struct nlattr *newdst)
> {
> int err = -ENOMEM;
> @@ -666,7 +697,7 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
> /* neither weighted multipath nor any flags
> * are supported
> */
> - if (rtnh->rtnh_hops || rtnh->rtnh_flags)
> + if (rtnh->rtnh_flags || rtnh->rtnh_flags)
As the build bot has pointed out, this change is not entirely correct.
> goto errout;
>
> attrlen = rtnh_attrlen(rtnh);
> @@ -681,8 +712,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
> goto errout;
>
> err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh,
> - rtnh->rtnh_ifindex, nla_via,
> - nla_newdst);
> + rtnh->rtnh_ifindex, rtnh->rtnh_hops,
> + nla_via, nla_newdst);
This change isn't required.
> if (err)
> goto errout;
>
> @@ -875,34 +906,100 @@ free:
> return ERR_PTR(err);
> }
>
> -static void mpls_ifdown(struct net_device *dev)
> +static void mpls_ifdown(struct net_device *dev, int event)
> {
> struct mpls_route __rcu **platform_label;
> struct net *net = dev_net(dev);
> - struct mpls_dev *mdev;
> unsigned index;
> + int dead;
>
> platform_label = rtnl_dereference(net->mpls.platform_label);
> for (index = 0; index < net->mpls.platform_labels; index++) {
> struct mpls_route *rt = rtnl_dereference(platform_label[index]);
> +
> if (!rt)
> continue;
> + dead = 0;
> for_nexthops(rt) {
> + if ((event == NETDEV_DOWN &&
> + (nh->nh_flags & RTNH_F_DEAD)) ||
> + (event == NETDEV_CHANGE &&
> + (nh->nh_flags & RTNH_F_LINKDOWN))) {
> + dead++;
> + continue;
> + }
> +
> if (rtnl_dereference(nh->nh_dev) != dev)
> continue;
> - nh->nh_dev = NULL;
> + switch (event) {
> + case NETDEV_DOWN:
> + case NETDEV_UNREGISTER:
> + nh->nh_flags |= RTNH_F_DEAD;
> + /* fall through */
> + case NETDEV_CHANGE:
> + nh->nh_flags |= RTNH_F_LINKDOWN;
> + rt->rt_nhn_alive--;
Are these operations atomic on all architectures?
Even if they are, I'm a bit worried about the RCU-correctness of this. I
think the fallthrough case in mpls_select_multipath saves you, causing
traffic to get via nh0 instead of the one selected by the hash (which is
fine transiently).
> + break;
> + }
> + if (event == NETDEV_UNREGISTER) {
> + nh->nh_dev = NULL;
I realise this was just moved from the original code, but I think this
should be at least RCU_INIT_POINTER(nh->nh_dev, NULL), if not
rcu_assign_pointer.
> + dead = rt->rt_nhn;
> + break;
> + }
> + dead++;
> } endfor_nexthops(rt);
> +
> + if (dead == rt->rt_nhn) {
> + switch (event) {
> + case NETDEV_DOWN:
> + case NETDEV_UNREGISTER:
> + rt->rt_flags |= RTNH_F_DEAD;
> + /* fall through */
> + case NETDEV_CHANGE:
> + rt->rt_flags |= RTNH_F_LINKDOWN;
> + rt->rt_nhn_alive = 0;
Won't rt_nhn_alive be zero already at this point?
There's also the problem that depending on the type of the last event,
rt->rt_flags will get set differently.
E.g.
- NETDEV_DOWN for nh0 followed by NETDEV_CHANGE[down] for nh1 then
rt->rt_flags will be RTNH_F_LINKDOWN.
- NETDEV_CHANGE[down] for nh1 followed by NETDEV_DOWN for nh0 then
rt->rt_flags will be RTNH_F_LINKDOWN|RTNH_F_DEAD.
That doesn't seem like desirable behaviour. What are expected semantics?
> + break;
> + }
> + }
> }
>
> - mdev = mpls_dev_get(dev);
> - if (!mdev)
> - return;
> + return;
> +}
> +
> +static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
> +{
> + struct mpls_route __rcu **platform_label;
> + struct net *net = dev_net(dev);
> + unsigned index;
> + int alive;
> +
> + platform_label = rtnl_dereference(net->mpls.platform_label);
> + for (index = 0; index < net->mpls.platform_labels; index++) {
> + struct mpls_route *rt = rtnl_dereference(platform_label[index]);
> +
> + if (!rt)
> + continue;
> + alive = 0;
> + for_nexthops(rt) {
> + struct net_device *nh_dev =
> + rtnl_dereference(nh->nh_dev);
>
> - mpls_dev_sysctl_unregister(mdev);
> + if (!(nh->nh_flags & nh_flags)) {
> + alive++;
> + continue;
> + }
> + if (nh_dev != dev)
> + continue;
> + alive++;
> + nh->nh_flags &= ~nh_flags;
> + } endfor_nexthops(rt);
>
> - RCU_INIT_POINTER(dev->mpls_ptr, NULL);
> + if (alive > 0)
> + rt->rt_flags &= ~nh_flags;
Again, this creates a dependence on the ordering of events.
> + rt->rt_nhn_alive = alive;
> + }
>
> - kfree_rcu(mdev, rcu);
> + return;
> }
>
> static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
> @@ -910,9 +1007,9 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
> {
> struct net_device *dev = netdev_notifier_info_to_dev(ptr);
> struct mpls_dev *mdev;
> + unsigned int flags;
>
> - switch(event) {
> - case NETDEV_REGISTER:
> + if (event == NETDEV_REGISTER) {
> /* For now just support ethernet devices */
> if ((dev->type == ARPHRD_ETHER) ||
> (dev->type == ARPHRD_LOOPBACK)) {
> @@ -920,10 +1017,39 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
> if (IS_ERR(mdev))
> return notifier_from_errno(PTR_ERR(mdev));
> }
> - break;
> + return NOTIFY_OK;
> + }
>
> + mdev = mpls_dev_get(dev);
> + if (!mdev)
> + return NOTIFY_OK;
> +
> + switch (event) {
> + case NETDEV_DOWN:
> + mpls_ifdown(dev, event);
> + break;
> + case NETDEV_UP:
> + flags = dev_get_flags(dev);
> + if (flags & (IFF_RUNNING | IFF_LOWER_UP))
> + mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
> + else
> + mpls_ifup(dev, RTNH_F_DEAD);
> + break;
> + case NETDEV_CHANGE:
> + flags = dev_get_flags(dev);
> + if (flags & (IFF_RUNNING | IFF_LOWER_UP))
> + mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
> + else
> + mpls_ifdown(dev, event);
> + break;
> case NETDEV_UNREGISTER:
> - mpls_ifdown(dev);
> + mpls_ifdown(dev, event);
> + mdev = mpls_dev_get(dev);
> + if (mdev) {
> + mpls_dev_sysctl_unregister(mdev);
> + RCU_INIT_POINTER(dev->mpls_ptr, NULL);
> + kfree_rcu(mdev, rcu);
> + }
> break;
> case NETDEV_CHANGENAME:
> mdev = mpls_dev_get(dev);
> @@ -1237,6 +1363,10 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
> dev = rtnl_dereference(nh->nh_dev);
> if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
> goto nla_put_failure;
> + if (nh->nh_flags & RTNH_F_LINKDOWN)
> + rtm->rtm_flags |= RTNH_F_LINKDOWN;
> + if (nh->nh_flags & RTNH_F_DEAD)
> + rtm->rtm_flags |= RTNH_F_DEAD;
> } else {
> struct rtnexthop *rtnh;
> struct nlattr *mp;
> @@ -1253,6 +1383,11 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
> dev = rtnl_dereference(nh->nh_dev);
> if (dev)
> rtnh->rtnh_ifindex = dev->ifindex;
> + if (nh->nh_flags & RTNH_F_LINKDOWN)
> + rtnh->rtnh_flags |= RTNH_F_LINKDOWN;
> + if (nh->nh_flags & RTNH_F_DEAD)
> + rtnh->rtnh_flags |= RTNH_F_DEAD;
> +
You never read from rt->rt_flags. Was your intention to set
rtm->rtm_flags using that field in this function?
> if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST,
> nh->nh_labels,
> nh->nh_label))
> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
> index bde52ce..4f9bf2b 100644
> --- a/net/mpls/internal.h
> +++ b/net/mpls/internal.h
> @@ -41,6 +41,7 @@ enum mpls_payload_type {
>
> struct mpls_nh { /* next hop label forwarding entry */
> struct net_device __rcu *nh_dev;
> + unsigned int nh_flags;
This could be implemented as a u8 (or even two 1-bit fields) after
nh_via_table (making use of the 1-byte hole already there) without
increasing the size of the structure by a fifth.
> u32 nh_label[MAX_NEW_LABELS];
> u8 nh_labels;
> u8 nh_via_alen;
> @@ -70,10 +71,12 @@ struct mpls_nh { /* next hop label forwarding entry */
> */
> struct mpls_route { /* next hop label forwarding entry */
> struct rcu_head rt_rcu;
> + unsigned int rt_flags;
Storing this is unnecessary - it can be derived from rt_nhn_alive == 0
and looking at the nexthop flags, which also avoids the event ordering
problems described above.
Thanks,
Rob
> u8 rt_protocol;
> u8 rt_payload_type;
> u8 rt_max_alen;
> unsigned int rt_nhn;
> + unsigned int rt_nhn_alive;
> struct mpls_nh rt_nh[0];
> };
>
>
next prev parent reply other threads:[~2015-11-03 15:08 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
2015-11-03 5:13 [PATCH net-next v2] mpls: support for dead routes Roopa Prabhu
2015-11-03 15:08 ` Robert Shearman [this message]
2015-11-03 17:21 ` roopa
2015-11-03 18:54 ` Eric W. Biederman
2015-11-03 23:08 ` roopa
2015-11-03 15:28 ` David Miller
2015-11-03 16:00 ` roopa
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=5638CDE4.2080501@brocade.com \
--to=rshearma@brocade.com \
--cc=davem@davemloft.net \
--cc=ebiederm@xmission.com \
--cc=netdev@vger.kernel.org \
--cc=roopa@cumulusnetworks.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.