netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] net: compute a more reasonable default ip6_rt_max_size
@ 2012-05-25 20:15 Arun Sharma
  2012-05-25 20:26 ` David Miller
  2012-05-25 20:47 ` Eric Dumazet
  0 siblings, 2 replies; 14+ messages in thread
From: Arun Sharma @ 2012-05-25 20:15 UTC (permalink / raw)
  To: netdev; +Cc: Arun Sharma, linux-kernel, David Miller

The algorithm is based on ipv4 and alloc_large_system_hash().

The following data is from an x86_64 box I tested:

128MB
$ cat /proc/sys/net/ipv{4,6}/route/max_size
16384
22444

512MB
$ cat /proc/sys/net/ipv{4,6}/route/max_size
65536
99856

1GB
$ cat /proc/sys/net/ipv{4,6}/route/max_size
524288
203068

2GB
$ cat /proc/sys/net/ipv{4,6}/route/max_size
1048576
524288

4GB
$ cat /proc/sys/net/ipv{4,6}/route/max_size
2097152
524288

Signed-off-by: Arun Sharma <asharma@fb.com>
Cc: netdev@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: David Miller <davem@davemloft.net>
---
 net/ipv6/route.c |   21 ++++++++++++++++++++-
 1 files changed, 20 insertions(+), 1 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 49d6ce1..c89ebbb 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2827,6 +2827,16 @@ struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
 }
 #endif
 
+static __initdata unsigned long ip6_rt_entries;
+static int __init set_rt_entries(char *str)
+{
+	if (!str)
+		return 0;
+	ip6_rt_entries = simple_strtoul(str, &str, 0);
+	return 1;
+}
+__setup("ip6_rt_entries=", set_rt_entries);
+
 static int __net_init ip6_route_net_init(struct net *net)
 {
 	int ret = -ENOMEM;
@@ -2872,8 +2882,17 @@ static int __net_init ip6_route_net_init(struct net *net)
 			 ip6_template_metrics, true);
 #endif
 
+	/* Compute a reasonable default based on what we do for ipv4
+	 * total size = 1/16th of total RAM
+	 * No more than 512k entries unless overridden on kernel cmdline */
+        if (ip6_rt_entries == 0) {
+		ip6_rt_entries = (totalram_pages << PAGE_SHIFT) >> 4;
+		ip6_rt_entries /= sizeof(struct rt6_info);
+		ip6_rt_entries = min(512 * 1024UL, ip6_rt_entries);
+        }
+
 	net->ipv6.sysctl.flush_delay = 0;
-	net->ipv6.sysctl.ip6_rt_max_size = 4096;
+	net->ipv6.sysctl.ip6_rt_max_size = ip6_rt_entries;
 	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
 	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
 	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
-- 
1.7.8.4

^ permalink raw reply related	[flat|nested] 14+ messages in thread
* Re: [PATCH] net: compute a more reasonable default ip6_rt_max_size
@ 2012-05-30 23:50 Lubashev, Igor
  2012-06-04 19:04 ` Lubashev, Igor
  0 siblings, 1 reply; 14+ messages in thread
From: Lubashev, Igor @ 2012-05-30 23:50 UTC (permalink / raw)
  To: David Miller, Arun Sharma
  Cc: eric.dumazet@gmail.com, netdev@vger.kernel.org,
	linux-kernel@vger.kernel.org

>It's possible that there is a bug somewhere - we didn't get a chance to 
>dig deeper. What we observed is that as we got close to the 4096 limit, 
>some hosts were becoming unreachable. A modest increase in the routing 
>table size made things better.

First of all, we have observed the same thing.

While I am not an expert in this area of the routing code, the function fib6_age in net/ipv6/ip6_fib.c puzzles me.


In kernel version 2.7.2.0.3, we have net/ipv6/ip6_fib.c:
static int fib6_age(struct rt6_info *rt, void *arg)
{
	unsigned long now = jiffies;

	if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) {
		if (time_after(now, rt->rt6i_expires)) {
			RT6_TRACE("expiring %p\n", rt);
			return -1;
		}
		gc_args.more++;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (atomic_read(&rt->dst.__refcnt) == 0 &&
		    time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			return -1;
		} else if ((rt->rt6i_flags & RTF_GATEWAY) &&
			   (!(rt->rt6i_nexthop->flags & NTF_ROUTER))) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			return -1;
		}
		gc_args.more++;
	}

	return 0;
}


In kernel 3.0.32, we have net/ipv6/ip6_fib.c:
static int fib6_age(struct rt6_info *rt, void *arg)
{
	unsigned long now = jiffies;

	if (rt->rt6i_flags&RTF_EXPIRES && rt->rt6i_expires) {
		if (time_after(now, rt->rt6i_expires)) {
			RT6_TRACE("expiring %p\n", rt);
			return -1;
		}
		gc_args.more++;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (atomic_read(&rt->dst.__refcnt) == 0 &&
		    time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			return -1;
		} else if ((rt->rt6i_flags & RTF_GATEWAY) &&
			   (!(dst_get_neighbour_raw(&rt->dst)->flags & NTF_ROUTER))) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			return -1;
		}
		gc_args.more++;
	}

	return 0;
}


In kernel 3.4, we have net/ipv6/ip6_fib.c:
static int fib6_age(struct rt6_info *rt, void *arg)
{
	unsigned long now = jiffies;

	if (rt->rt6i_flags & RTF_EXPIRES && rt->dst.expires) {
		if (time_after(now, rt->dst.expires)) {
			RT6_TRACE("expiring %p\n", rt);
			return -1;
		}
		gc_args.more++;
	} else if (rt->rt6i_flags & RTF_CACHE) {
		if (atomic_read(&rt->dst.__refcnt) == 0 &&
		    time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			return -1;
		} else if (rt->rt6i_flags & RTF_GATEWAY) {
			struct neighbour *neigh;
			__u8 neigh_flags = 0;

			neigh = dst_neigh_lookup(&rt->dst, &rt->rt6i_gateway);
			if (neigh) {
				neigh_flags = neigh->flags;
				neigh_release(neigh);
			}
			if (neigh_flags & NTF_ROUTER) {
				RT6_TRACE("purging route %p via non-router but gateway\n",
					  rt);
				return -1;
			}
		}
		gc_args.more++;
	}

	return 0;
}


Do we have the meaning of the NTF_ROUTER flag reversed in kernel 3.4?  Or is the opposite use of that flag a fix for the bug in the previous releases? Or is this a bug in kernel 3.4?

Also, could this remove a Gateway entry if there is no neighbor entry for it (in any of the versions of the code)?  Could this try to dereference a null pointer in the 3.0.32 version of the code (and any version prior to 3.4)?  In general, is this the right place to remove a gateway route that has __refcnt > 0?

I wish I had more expertise in this area of the code to answer questions and not only to pose them.

Thank you,

- Igor

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2012-06-04 19:04 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-05-25 20:15 [PATCH] net: compute a more reasonable default ip6_rt_max_size Arun Sharma
2012-05-25 20:26 ` David Miller
2012-05-25 20:47 ` Eric Dumazet
2012-05-25 22:22   ` Arun Sharma
2012-05-25 22:51     ` David Miller
2012-05-26  0:08       ` Arun Sharma
2012-05-26  0:11         ` David Miller
2012-05-26  0:44           ` Arun Sharma
2012-05-26  3:39             ` Eric Dumazet
2012-05-26  4:17               ` Eric Dumazet
2012-05-27  3:54               ` Arun Sharma
2012-05-27 13:18                 ` Eric Dumazet
  -- strict thread matches above, loose matches on Subject: below --
2012-05-30 23:50 Lubashev, Igor
2012-06-04 19:04 ` Lubashev, Igor

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).