netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Vlad Yasevich <vyasevich@gmail.com>
To: Hannes Frederic Sowa <hannes@stressinduktion.org>,
	David Miller <davem@davemloft.net>
Cc: netdev@vger.kernel.org, eric.dumazet@gmail.com,
	nicolas.dichtel@6wind.com
Subject: Re: [PATCH net-next] ipv6: implement rt_genid_bump_ipv6 with fn_sernum and remove rt6i_genid
Date: Thu, 11 Sep 2014 10:19:30 -0400	[thread overview]
Message-ID: <5411AF72.2070407@gmail.com> (raw)
In-Reply-To: <1410437146.18873.2.camel@localhost>

On 09/11/2014 08:05 AM, Hannes Frederic Sowa wrote:
> On Mi, 2014-09-10 at 13:09 -0700, David Miller wrote:
>> From: Hannes Frederic Sowa <hannes@stressinduktion.org>
>> Date: Wed, 10 Sep 2014 11:31:28 +0200
>>
>>> In case we need to force the sockets to relookup the routes we now
>>> increase the fn_sernum on all fibnodes in the routing tree. This is a
>>> costly operation but should only happen if we have major routing/policy
>>> changes in the kernel (e.g. manual route adding/removal, xfrm policy
>>> changes).
>>
>> Core routers can process thousands of route updates per second, and they
>> do this via what you refer to as "manual route adding/removal".
>>
>> I don't think we want to put such a scalability problem into the tree.
>>
>> There has to be a lightweight way to address this.
> 
> Here is an alternative approach that does not traverse the routing table,
> but each newly inserted route (even a cached-only one) might bump all
> other routes out of the per-socket caches:
> 
> diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
> index 9bcb220..a7e45b9 100644
> --- a/include/net/ip6_fib.h
> +++ b/include/net/ip6_fib.h
> @@ -119,8 +119,6 @@ struct rt6_info {
>  	struct inet6_dev		*rt6i_idev;
>  	unsigned long			_rt6i_peer;
>  
> -	u32				rt6i_genid;
> -
>  	/* more non-fragment space at head required */
>  	unsigned short			rt6i_nfheader_len;
>  
> diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
> index 361d260..428fdcb 100644
> --- a/include/net/net_namespace.h
> +++ b/include/net/net_namespace.h
> @@ -358,18 +358,28 @@ static inline int rt_genid_ipv6(struct net *net)
>  	return atomic_read(&net->ipv6.rt_genid);
>  }
>  
> -static inline void rt_genid_bump_ipv6(struct net *net)
> +static inline int rt_genid_bump_ipv6(struct net *net)
>  {
> -	atomic_inc(&net->ipv6.rt_genid);
> +	int new, old;
> +
> +	do {
> +		old = atomic_read(&net->ipv6.rt_genid);
> +		new = old + 1;
> +		if (new <= 0)
> +			new = 1;
> +	} while (atomic_cmpxchg(&net->ipv6.rt_genid, old, new) != old);
> +	return new;
> +
>  }
>  #else
>  static inline int rt_genid_ipv6(struct net *net)
>  {
> -	return 0;
> +	return 1;
>  }
>  
> -static inline void rt_genid_bump_ipv6(struct net *net)
> +static inline int rt_genid_bump_ipv6(struct net *net)
>  {
> +	return 1;
>  }
>  #endif
>  
> diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
> index 76b7f5e..4a2f130 100644
> --- a/net/ipv6/ip6_fib.c
> +++ b/net/ipv6/ip6_fib.c
> @@ -84,7 +84,10 @@ static int fib6_walk_continue(struct fib6_walker_t *w);
>   *	result of redirects, path MTU changes, etc.
>   */
>  
> -static __u32 rt_sernum;
> +static int fib6_new_sernum(struct net *net)
> +{
> +	return rt_genid_bump_ipv6(net);
> +}
>  
>  static void fib6_gc_timer_cb(unsigned long arg);
>  
> @@ -104,13 +107,6 @@ static inline void fib6_walker_unlink(struct fib6_walker_t *w)
>  	list_del(&w->lh);
>  	write_unlock_bh(&fib6_walker_lock);
>  }
> -static __inline__ u32 fib6_new_sernum(void)
> -{
> -	u32 n = ++rt_sernum;
> -	if ((__s32)n <= 0)
> -		rt_sernum = n = 1;
> -	return n;
> -}
>  
>  /*
>   *	Auxiliary address test functions for the radix tree.
> @@ -421,16 +417,15 @@ out:
>   */
>  
>  static struct fib6_node *fib6_add_1(struct fib6_node *root,
> -				     struct in6_addr *addr, int plen,
> -				     int offset, int allow_create,
> -				     int replace_required)
> +				    struct in6_addr *addr, int plen,
> +				    int offset, int allow_create,
> +				    int replace_required, int sernum)
>  {
>  	struct fib6_node *fn, *in, *ln;
>  	struct fib6_node *pn = NULL;
>  	struct rt6key *key;
>  	int	bit;
>  	__be32	dir = 0;
> -	__u32	sernum = fib6_new_sernum();
>  
>  	RT6_TRACE("fib6_add_1\n");
>  
> @@ -844,6 +839,7 @@ void fib6_force_start_gc(struct net *net)
>  int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
>  	     struct nlattr *mx, int mx_len)
>  {
> +	struct net *net = dev_net(rt->dst.dev);
>  	struct fib6_node *fn, *pn = NULL;
>  	int err = -ENOMEM;
>  	int allow_create = 1;
> @@ -860,7 +856,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
>  
>  	fn = fib6_add_1(root, &rt->rt6i_dst.addr, rt->rt6i_dst.plen,
>  			offsetof(struct rt6_info, rt6i_dst), allow_create,
> -			replace_required);
> +			replace_required, fib6_new_sernum(net));
>  	if (IS_ERR(fn)) {
>  		err = PTR_ERR(fn);
>  		fn = NULL;
> @@ -894,14 +890,15 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
>  			sfn->leaf = info->nl_net->ipv6.ip6_null_entry;
>  			atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref);
>  			sfn->fn_flags = RTN_ROOT;
> -			sfn->fn_sernum = fib6_new_sernum();
> +			sfn->fn_sernum = fib6_new_sernum(net);
>  
>  			/* Now add the first leaf node to new subtree */
>  
>  			sn = fib6_add_1(sfn, &rt->rt6i_src.addr,
>  					rt->rt6i_src.plen,
>  					offsetof(struct rt6_info, rt6i_src),
> -					allow_create, replace_required);
> +					allow_create, replace_required,
> +					fib6_new_sernum(net));
>  
>  			if (IS_ERR(sn)) {
>  				/* If it is failed, discard just allocated
> @@ -920,7 +917,8 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, struct nl_info *info,
>  			sn = fib6_add_1(fn->subtree, &rt->rt6i_src.addr,
>  					rt->rt6i_src.plen,
>  					offsetof(struct rt6_info, rt6i_src),
> -					allow_create, replace_required);
> +					allow_create, replace_required,
> +					fib6_new_sernum(net));
>  
>  			if (IS_ERR(sn)) {
>  				err = PTR_ERR(sn);
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index f74b041..54b7d81 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -314,7 +314,6 @@ static inline struct rt6_info *ip6_dst_alloc(struct net *net,
>  
>  		memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
>  		rt6_init_peer(rt, table ? &table->tb6_peers : net->ipv6.peers);
> -		rt->rt6i_genid = rt_genid_ipv6(net);
>  		INIT_LIST_HEAD(&rt->rt6i_siblings);
>  	}
>  	return rt;
> @@ -1096,10 +1095,7 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
>  	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
>  	 * into this function always.
>  	 */
> -	if (rt->rt6i_genid != rt_genid_ipv6(dev_net(rt->dst.dev)))
> -		return NULL;
> -
> -	if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie))
> +	if (!rt->rt6i_node || rt_genid_ipv6(dev_net(rt->dst.dev)) != cookie)
>  		return NULL;
>  
>  	if (rt6_check_expired(rt))
> 

Ok, so now we bump the gen_id every time we add a route and use that as fn_sernum for
that route.  But this doesn't solve the problem we are seeing: a re-lookup
of the route still gives us an older route with an older gen_id.

Or am I missing something?

-vlad

  reply	other threads:[~2014-09-11 14:19 UTC|newest]

Thread overview: 57+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-08-14 18:19 Performance regression on kernels 3.10 and newer Alexander Duyck
2014-08-14 18:46 ` Eric Dumazet
2014-08-14 19:50   ` Eric Dumazet
2014-08-14 19:59   ` Rick Jones
2014-08-14 20:31     ` Alexander Duyck
2014-08-14 20:51       ` Eric Dumazet
2014-08-14 20:46     ` Eric Dumazet
2014-08-14 23:16   ` Alexander Duyck
2014-08-14 23:20     ` David Miller
2014-08-14 23:25       ` Tom Herbert
2014-08-21 23:24         ` David Miller
2014-09-06 14:45           ` Eric Dumazet
2014-09-06 15:27             ` Eric Dumazet
2014-09-06 15:46               ` Eric Dumazet
2014-09-06 16:38                 ` Eric Dumazet
2014-09-06 18:21                   ` Eric Dumazet
2014-09-07 19:05                     ` [PATCH net] ipv6: refresh rt6i_genid in ip6_pol_route() Eric Dumazet
2014-09-07 22:54                       ` David Miller
2014-09-08  4:18                         ` Eric Dumazet
2014-09-08  4:27                           ` David Miller
2014-09-08  4:43                             ` Eric Dumazet
2014-09-08  4:59                               ` David Miller
2014-09-08  5:07                                 ` Eric Dumazet
2014-09-08  8:11                                   ` Nicolas Dichtel
2014-09-08 10:28                                     ` Eric Dumazet
2014-09-08 12:16                                       ` Nicolas Dichtel
2014-09-08 18:48                                   ` Vlad Yasevich
2014-09-09 12:58                                   ` Hannes Frederic Sowa
2014-09-10  9:31                                     ` [PATCH net-next] ipv6: implement rt_genid_bump_ipv6 with fn_sernum and remove rt6i_genid Hannes Frederic Sowa
2014-09-10 13:26                                       ` Vlad Yasevich
2014-09-10 13:42                                         ` Hannes Frederic Sowa
2014-09-10 20:09                                       ` David Miller
2014-09-11  8:30                                         ` Hannes Frederic Sowa
2014-09-11 12:22                                           ` Vlad Yasevich
2014-09-11 12:40                                             ` Hannes Frederic Sowa
2014-09-11 12:05                                         ` Hannes Frederic Sowa
2014-09-11 14:19                                           ` Vlad Yasevich [this message]
2014-09-11 14:32                                             ` Hannes Frederic Sowa
2014-09-11 14:44                                               ` Vlad Yasevich
2014-09-11 14:47                                                 ` Hannes Frederic Sowa
2014-09-08 15:06               ` [PATCH v2 net-next] tcp: remove dst refcount false sharing for prequeue mode Eric Dumazet
2014-09-08 21:21                 ` David Miller
2014-09-08 21:30                   ` Eric Dumazet
2014-09-08 22:41                     ` David Miller
2014-09-09 23:56                     ` David Miller
2014-08-15 17:15       ` Performance regression on kernels 3.10 and newer Alexander Duyck
2014-08-15 17:59         ` Eric Dumazet
2014-08-15 18:49         ` Tom Herbert
2014-08-15 19:10           ` Alexander Duyck
2014-08-15 22:16             ` Tom Herbert
2014-08-15 23:23               ` Alexander Duyck
2014-08-18  9:03                 ` David Laight
2014-08-18 15:22                   ` Alexander Duyck
2014-08-18 15:29                     ` Rick Jones
2014-08-21 23:51         ` David Miller
2014-08-14 23:48     ` Eric Dumazet
2014-08-15  0:33       ` Rick Jones

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=5411AF72.2070407@gmail.com \
    --to=vyasevich@gmail.com \
    --cc=davem@davemloft.net \
    --cc=eric.dumazet@gmail.com \
    --cc=hannes@stressinduktion.org \
    --cc=netdev@vger.kernel.org \
    --cc=nicolas.dichtel@6wind.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).