Re: [PATCH net-next v2] mpls: support for dead routes

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Robert Shearman <rshearma@brocade.com>
To: Roopa Prabhu <roopa@cumulusnetworks.com>, <ebiederm@xmission.com>
Cc: <davem@davemloft.net>, <netdev@vger.kernel.org>
Subject: Re: [PATCH net-next v2] mpls: support for dead routes
Date: Tue, 3 Nov 2015 15:08:20 +0000	[thread overview]
Message-ID: <5638CDE4.2080501@brocade.com> (raw)
In-Reply-To: <1446527581-64787-1-git-send-email-roopa@cumulusnetworks.com>

On 03/11/15 05:13, Roopa Prabhu wrote:
> From: Roopa Prabhu <roopa@cumulusnetworks.com>
>
> Adds support for RTNH_F_DEAD and RTNH_F_LINKDOWN flags on mpls
> routes due to link events. Also adds code to ignore dead
> routes during route selection
>
> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
> ---
> RFC to v1:
>          Addressed a few comments from Eric and Robert:
>          - remove support for weighted nexthops
>          - Use rt_nhn_alive in the rt structure to keep count of alive
>          routes.
>          What i have not done is: sort nexthops on link events.
>          I am not comfortable recreating or sorting nexthops on
>          every carrier change. This leaves scope for optimizing in the future
>
> v1 to v2:
> 	Fix dead nexthop checks as suggested by dave
>
>   net/mpls/af_mpls.c  | 191 ++++++++++++++++++++++++++++++++++++++++++++--------
>   net/mpls/internal.h |   3 +
>   2 files changed, 166 insertions(+), 28 deletions(-)
>
> diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
> index c70d750..5e88118 100644
> --- a/net/mpls/af_mpls.c
> +++ b/net/mpls/af_mpls.c
> @@ -96,22 +96,15 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
>   }
>   EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
>
> -static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
> -					     struct sk_buff *skb, bool bos)
> +static u32 mpls_multipath_hash(struct mpls_route *rt,
> +			       struct sk_buff *skb, bool bos)
>   {
>   	struct mpls_entry_decoded dec;
>   	struct mpls_shim_hdr *hdr;
>   	bool eli_seen = false;
>   	int label_index;
> -	int nh_index = 0;
>   	u32 hash = 0;
>
> -	/* No need to look further into packet if there's only
> -	 * one path
> -	 */
> -	if (rt->rt_nhn == 1)
> -		goto out;
> -
>   	for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
>   	     label_index++) {
>   		if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
> @@ -165,9 +158,37 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
>   		}
>   	}
>
> -	nh_index = hash % rt->rt_nhn;
> +	return hash;
> +}
> +
> +static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
> +					     struct sk_buff *skb, bool bos)
> +{
> +	u32 hash = 0;
> +	int nh_index;
> +	int n = 0;
> +
> +	/* No need to look further into packet if there's only
> +	 * one path
> +	 */
> +	if (rt->rt_nhn == 1)
> +		goto out;
> +
> +	if (rt->rt_nhn_alive <= 0)
> +		return NULL;
> +
> +	hash = mpls_multipath_hash(rt, skb, bos);
> +	nh_index = hash % rt->rt_nhn_alive;
> +	for_nexthops(rt) {

This should be optimised in the common rt->rt_nhn_alive == rt->rt_nhn 
case by doing the direct array index and avoid the nh walk.

> +		if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
> +			continue;
> +		if (n == nh_index)
> +			return nh;
> +		n++;
> +	} endfor_nexthops(rt);
> +
>   out:
> -	return &rt->rt_nh[nh_index];
> +	return &rt->rt_nh[0];
>   }
>
>   static bool mpls_egress(struct mpls_route *rt, struct sk_buff *skb,
> @@ -365,6 +386,7 @@ static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen)
>   		     GFP_KERNEL);
>   	if (rt) {
>   		rt->rt_nhn = num_nh;
> +		rt->rt_nhn_alive = num_nh;
>   		rt->rt_max_alen = max_alen_aligned;
>   	}
>
> @@ -536,6 +558,15 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt,
>
>   	RCU_INIT_POINTER(nh->nh_dev, dev);
>
> +	if (!netif_carrier_ok(dev))
> +		nh->nh_flags |= RTNH_F_LINKDOWN;
> +
> +	if (!(dev->flags & IFF_UP))
> +		nh->nh_flags |= RTNH_F_DEAD;

Below you're testing IFF_RUNNING | IFF_LOWER_UP. Is the difference 
intended here?

> +
> +	if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
> +		rt->rt_nhn_alive--;

You don't update your new rt->rt_flags field here. This highlights the 
issue with duplicating data, which you're doing with the rt_flags field.

> +
>   	return 0;
>
>   errout:
> @@ -577,7 +608,7 @@ errout:
>   }
>
>   static int mpls_nh_build(struct net *net, struct mpls_route *rt,
> -			 struct mpls_nh *nh, int oif,
> +			 struct mpls_nh *nh, int oif, int hops,

This change isn't required.

>   			 struct nlattr *via, struct nlattr *newdst)
>   {
>   	int err = -ENOMEM;
> @@ -666,7 +697,7 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
>   		/* neither weighted multipath nor any flags
>   		 * are supported
>   		 */
> -		if (rtnh->rtnh_hops || rtnh->rtnh_flags)
> +		if (rtnh->rtnh_flags || rtnh->rtnh_flags)

As the build bot has pointed out, this change is not entirely correct.

>   			goto errout;
>
>   		attrlen = rtnh_attrlen(rtnh);
> @@ -681,8 +712,8 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
>   			goto errout;
>
>   		err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh,
> -				    rtnh->rtnh_ifindex, nla_via,
> -				    nla_newdst);
> +				    rtnh->rtnh_ifindex, rtnh->rtnh_hops,
> +				    nla_via, nla_newdst);

This change isn't required.

>   		if (err)
>   			goto errout;
>
> @@ -875,34 +906,100 @@ free:
>   	return ERR_PTR(err);
>   }
>
> -static void mpls_ifdown(struct net_device *dev)
> +static void mpls_ifdown(struct net_device *dev, int event)
>   {
>   	struct mpls_route __rcu **platform_label;
>   	struct net *net = dev_net(dev);
> -	struct mpls_dev *mdev;
>   	unsigned index;
> +	int dead;
>
>   	platform_label = rtnl_dereference(net->mpls.platform_label);
>   	for (index = 0; index < net->mpls.platform_labels; index++) {
>   		struct mpls_route *rt = rtnl_dereference(platform_label[index]);
> +
>   		if (!rt)
>   			continue;
> +		dead = 0;
>   		for_nexthops(rt) {
> +			if ((event == NETDEV_DOWN &&
> +			     (nh->nh_flags & RTNH_F_DEAD)) ||
> +			     (event == NETDEV_CHANGE &&
> +			     (nh->nh_flags & RTNH_F_LINKDOWN))) {
> +				dead++;
> +				continue;
> +			}
> +
>   			if (rtnl_dereference(nh->nh_dev) != dev)
>   				continue;
> -			nh->nh_dev = NULL;
> +			switch (event) {
> +			case NETDEV_DOWN:
> +			case NETDEV_UNREGISTER:
> +				nh->nh_flags |= RTNH_F_DEAD;
> +				/* fall through */
> +			case NETDEV_CHANGE:
> +				nh->nh_flags |= RTNH_F_LINKDOWN;
> +				rt->rt_nhn_alive--;

Are these operations atomic on all architectures?

Even if they are, I'm a bit worried about the RCU-correctness of this. I 
think the fallthrough case in mpls_select_multipath saves you, causing 
traffic to get via nh0 instead of the one selected by the hash (which is 
fine transiently).

> +				break;
> +			}
> +			if (event == NETDEV_UNREGISTER) {
> +				nh->nh_dev = NULL;

I realise this was just moved from the original code, but I think this 
should be at least RCU_INIT_POINTER(nh->nh_dev, NULL), if not 
rcu_assign_pointer.

> +				dead = rt->rt_nhn;
> +				break;
> +			}
> +			dead++;
>   		} endfor_nexthops(rt);
> +
> +		if (dead == rt->rt_nhn) {
> +			switch (event) {
> +			case NETDEV_DOWN:
> +			case NETDEV_UNREGISTER:
> +				rt->rt_flags |= RTNH_F_DEAD;
> +				/* fall through */
> +			case NETDEV_CHANGE:
> +				rt->rt_flags |= RTNH_F_LINKDOWN;
> +				rt->rt_nhn_alive = 0;

Won't rt_nhn_alive be zero already at this point?

There's also the problem that depending on the type of the last event, 
rt->rt_flags will get set differently.

E.g.

- NETDEV_DOWN for nh0 followed by NETDEV_CHANGE[down] for nh1 then 
rt->rt_flags will be RTNH_F_LINKDOWN.
- NETDEV_CHANGE[down] for nh1 followed by NETDEV_DOWN for nh0 then 
rt->rt_flags will be RTNH_F_LINKDOWN|RTNH_F_DEAD.

That doesn't seem like desirable behaviour. What are expected semantics?

> +				break;
> +			}
> +		}
>   	}
>
> -	mdev = mpls_dev_get(dev);
> -	if (!mdev)
> -		return;
> +	return;
> +}
> +
> +static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
> +{
> +	struct mpls_route __rcu **platform_label;
> +	struct net *net = dev_net(dev);
> +	unsigned index;
> +	int alive;
> +
> +	platform_label = rtnl_dereference(net->mpls.platform_label);
> +	for (index = 0; index < net->mpls.platform_labels; index++) {
> +		struct mpls_route *rt = rtnl_dereference(platform_label[index]);
> +
> +		if (!rt)
> +			continue;
> +		alive = 0;
> +		for_nexthops(rt) {
> +			struct net_device *nh_dev =
> +				rtnl_dereference(nh->nh_dev);
>
> -	mpls_dev_sysctl_unregister(mdev);
> +			if (!(nh->nh_flags & nh_flags)) {
> +				alive++;
> +				continue;
> +			}
> +			if (nh_dev != dev)
> +				continue;
> +			alive++;
> +			nh->nh_flags &= ~nh_flags;
> +		} endfor_nexthops(rt);
>
> -	RCU_INIT_POINTER(dev->mpls_ptr, NULL);
> +		if (alive > 0)
> +			rt->rt_flags &= ~nh_flags;

Again, this creates a dependence on the ordering of events.

> +		rt->rt_nhn_alive = alive;
> +	}
>
> -	kfree_rcu(mdev, rcu);
> +	return;
>   }
>
>   static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
> @@ -910,9 +1007,9 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
>   {
>   	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
>   	struct mpls_dev *mdev;
> +	unsigned int flags;
>
> -	switch(event) {
> -	case NETDEV_REGISTER:
> +	if (event == NETDEV_REGISTER) {
>   		/* For now just support ethernet devices */
>   		if ((dev->type == ARPHRD_ETHER) ||
>   		    (dev->type == ARPHRD_LOOPBACK)) {
> @@ -920,10 +1017,39 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
>   			if (IS_ERR(mdev))
>   				return notifier_from_errno(PTR_ERR(mdev));
>   		}
> -		break;
> +		return NOTIFY_OK;
> +	}
>
> +	mdev = mpls_dev_get(dev);
> +	if (!mdev)
> +		return NOTIFY_OK;
> +
> +	switch (event) {
> +	case NETDEV_DOWN:
> +		mpls_ifdown(dev, event);
> +		break;
> +	case NETDEV_UP:
> +		flags = dev_get_flags(dev);
> +		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
> +			mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
> +		else
> +			mpls_ifup(dev, RTNH_F_DEAD);
> +		break;
> +	case NETDEV_CHANGE:
> +		flags = dev_get_flags(dev);
> +		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
> +			mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
> +		else
> +			mpls_ifdown(dev, event);
> +		break;
>   	case NETDEV_UNREGISTER:
> -		mpls_ifdown(dev);
> +		mpls_ifdown(dev, event);
> +		mdev = mpls_dev_get(dev);
> +		if (mdev) {
> +			mpls_dev_sysctl_unregister(mdev);
> +			RCU_INIT_POINTER(dev->mpls_ptr, NULL);
> +			kfree_rcu(mdev, rcu);
> +		}
>   		break;
>   	case NETDEV_CHANGENAME:
>   		mdev = mpls_dev_get(dev);
> @@ -1237,6 +1363,10 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
>   		dev = rtnl_dereference(nh->nh_dev);
>   		if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
>   			goto nla_put_failure;
> +		if (nh->nh_flags & RTNH_F_LINKDOWN)
> +			rtm->rtm_flags |= RTNH_F_LINKDOWN;
> +		if (nh->nh_flags & RTNH_F_DEAD)
> +			rtm->rtm_flags |= RTNH_F_DEAD;
>   	} else {
>   		struct rtnexthop *rtnh;
>   		struct nlattr *mp;
> @@ -1253,6 +1383,11 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
>   			dev = rtnl_dereference(nh->nh_dev);
>   			if (dev)
>   				rtnh->rtnh_ifindex = dev->ifindex;
> +			if (nh->nh_flags & RTNH_F_LINKDOWN)
> +				rtnh->rtnh_flags |= RTNH_F_LINKDOWN;
> +			if (nh->nh_flags & RTNH_F_DEAD)
> +				rtnh->rtnh_flags |= RTNH_F_DEAD;
> +

You never read from rt->rt_flags. Was your intention to set 
rtm->rtm_flags using that field in this function?

>   			if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST,
>   							    nh->nh_labels,
>   							    nh->nh_label))
> diff --git a/net/mpls/internal.h b/net/mpls/internal.h
> index bde52ce..4f9bf2b 100644
> --- a/net/mpls/internal.h
> +++ b/net/mpls/internal.h
> @@ -41,6 +41,7 @@ enum mpls_payload_type {
>
>   struct mpls_nh { /* next hop label forwarding entry */
>   	struct net_device __rcu *nh_dev;
> +	unsigned int		nh_flags;

This could be implemented as a u8 (or even two 1-bit fields) after 
nh_via_table (making use of the 1-byte hole already there) without 
increasing the size of the structure by a fifth.

>   	u32			nh_label[MAX_NEW_LABELS];
>   	u8			nh_labels;
>   	u8			nh_via_alen;
> @@ -70,10 +71,12 @@ struct mpls_nh { /* next hop label forwarding entry */
>    */
>   struct mpls_route { /* next hop label forwarding entry */
>   	struct rcu_head		rt_rcu;
> +	unsigned int		rt_flags;

Storing this is unnecessary - it can be derived from rt_nhn_alive == 0 
and looking at the nexthop flags, which also avoids the event ordering 
problems described above.

Thanks,
Rob

>   	u8			rt_protocol;
>   	u8			rt_payload_type;
>   	u8			rt_max_alen;
>   	unsigned int		rt_nhn;
> +	unsigned int		rt_nhn_alive;
>   	struct mpls_nh		rt_nh[0];
>   };
>
>

next prev parent reply	other threads:[~2015-11-03 15:08 UTC|newest]

Thread overview: 7+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2015-11-03  5:13 [PATCH net-next v2] mpls: support for dead routes Roopa Prabhu
2015-11-03 15:08 ` Robert Shearman [this message]
2015-11-03 17:21   ` roopa
2015-11-03 18:54     ` Eric W. Biederman
2015-11-03 23:08       ` roopa
2015-11-03 15:28 ` David Miller
2015-11-03 16:00   ` roopa

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5638CDE4.2080501@brocade.com \
    --to=rshearma@brocade.com \
    --cc=davem@davemloft.net \
    --cc=ebiederm@xmission.com \
    --cc=netdev@vger.kernel.org \
    --cc=roopa@cumulusnetworks.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.