public inbox for netdev@vger.kernel.org
 help / color / mirror / Atom feed
From: Eric Dumazet <eric.dumazet@gmail.com>
To: David Miller <davem@davemloft.net>
Cc: chris2553@googlemail.com, netdev@vger.kernel.org, gpiez@web.de,
	Dave Jones <davej@redhat.com>
Subject: Re: Possible networking regression in 3.6.0
Date: Tue, 02 Oct 2012 17:48:39 +0200	[thread overview]
Message-ID: <1349192919.12401.778.camel@edumazet-glaptop> (raw)
In-Reply-To: <1349192133.12401.768.camel@edumazet-glaptop>

From: Eric Dumazet <edumazet@google.com>

On Tue, 2012-10-02 at 17:35 +0200, Eric Dumazet wrote:
> On Mon, 2012-10-01 at 22:04 +0200, Eric Dumazet wrote:
> > On Mon, 2012-10-01 at 16:01 -0400, David Miller wrote:
> 
> > > If you can find a way to more reliably trigger the case, that would
> > > help us immensely.
> > 
> > I am building a KMEMCHECK kernel, as a last try before my night ;)
> 
> This was a total disaster. KMEMCHECK dies horribly on my machine
> 
> David, shouldnt we use a nh_rth_forward instead of a nh_rth_input in
> __mkroute_input() ?
> 
> (And change rt_cache_route() as well ?)
> 
> I am testing a patch right now.

Yeah, this patch seems to fix the bug for me.

[PATCH] ipv4: properly cache forward routes

commit d2d68ba9fe8 (ipv4: Cache input routes in fib_info nexthops.)
introduced a regression for forwarding.

This was hard to reproduce but the symptom was that packets were
delivered to local host instead of being forwarded.

Add a separate cache (nh_rth_forward) to solve the problem.

Many thanks to Chris Clayton for his patience and help.

Reported-by: Chris Clayton <chris2553@googlemail.com>
Bisected-by: Chris Clayton <chris2553@googlemail.com>
Reported-by: Dave Jones <davej@redhat.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
---
 include/net/ip_fib.h     |    1 +
 net/ipv4/fib_semantics.c |    1 +
 net/ipv4/route.c         |   16 ++++++++--------
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 926142e..ce7ffe9 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -85,6 +85,7 @@ struct fib_nh {
 	int			nh_saddr_genid;
 	struct rtable __rcu * __percpu *nh_pcpu_rth_output;
 	struct rtable __rcu	*nh_rth_input;
+	struct rtable __rcu	*nh_rth_forward;
 	struct fnhe_hash_bucket	*nh_exceptions;
 };
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3509065..45b5d1d 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -208,6 +208,7 @@ static void free_fib_info_rcu(struct rcu_head *head)
 			free_nh_exceptions(nexthop_nh);
 		rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
 		rt_fibinfo_free(&nexthop_nh->nh_rth_input);
+		rt_fibinfo_free(&nexthop_nh->nh_rth_forward);
 	} endfor_nexthops(fi);
 
 	release_net(fi->fib_net);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ff62206..50898d6 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1193,14 +1193,12 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
 	return ret;
 }
 
-static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
+static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt, struct rtable **p)
 {
-	struct rtable *orig, *prev, **p;
+	struct rtable *orig, *prev;
 	bool ret = true;
 
-	if (rt_is_input_route(rt)) {
-		p = (struct rtable **)&nh->nh_rth_input;
-	} else {
+	if (!p) {
 		if (!nh->nh_pcpu_rth_output)
 			goto nocache;
 		p = (struct rtable **)__this_cpu_ptr(nh->nh_pcpu_rth_output);
@@ -1290,7 +1288,7 @@ static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
 		if (unlikely(fnhe))
 			cached = rt_bind_exception(rt, fnhe, daddr);
 		else if (!(rt->dst.flags & DST_NOCACHE))
-			cached = rt_cache_route(nh, rt);
+			cached = rt_cache_route(nh, rt, NULL);
 	}
 	if (unlikely(!cached))
 		rt_add_uncached_list(rt);
@@ -1462,7 +1460,7 @@ static int __mkroute_input(struct sk_buff *skb,
 	do_cache = false;
 	if (res->fi) {
 		if (!itag) {
-			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_forward);
 			if (rt_cache_valid(rth)) {
 				skb_dst_set_noref(skb, &rth->dst);
 				goto out;
@@ -1493,6 +1491,8 @@ static int __mkroute_input(struct sk_buff *skb,
 
 	rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
 	skb_dst_set(skb, &rth->dst);
+	if (do_cache)
+		rt_cache_route(&FIB_RES_NH(*res), rth, &FIB_RES_NH(*res).nh_rth_forward);
 out:
 	err = 0;
  cleanup:
@@ -1663,7 +1663,7 @@ local_input:
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
 	if (do_cache)
-		rt_cache_route(&FIB_RES_NH(res), rth);
+		rt_cache_route(&FIB_RES_NH(res), rth, &FIB_RES_NH(res).nh_rth_input);
 	skb_dst_set(skb, &rth->dst);
 	err = 0;
 	goto out;

  reply	other threads:[~2012-10-02 15:48 UTC|newest]

Thread overview: 59+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2012-09-17 15:44 Possible networking regression in 3.6.0 Chris Clayton
2012-09-18 14:21 ` Chris Clayton
2012-09-18 14:31   ` Chris Clayton
2012-09-18 14:40     ` Eric Dumazet
2012-09-18 15:51       ` Chris Clayton
2012-09-19 15:26       ` Chris Clayton
2012-09-22  6:26         ` Chris Clayton
2012-09-27 11:50           ` Chris Clayton
2012-09-27 12:14             ` Eric Dumazet
2012-09-27 18:05               ` Chris Clayton
2012-09-27 21:03                 ` Eric Dumazet
2012-09-27 21:17                   ` Eric Dumazet
2012-09-28  6:53                     ` David Miller
2012-09-28  9:14                       ` Chris Clayton
2012-09-28  9:22                     ` Chris Clayton
2012-09-28 11:26                       ` Eric Dumazet
2012-09-28 14:28                         ` Chris Clayton
2012-09-30 15:26                         ` Chris Clayton
2012-09-30 19:45                           ` Eric Dumazet
2012-10-01  8:36                             ` Chris Clayton
2012-10-01  9:15                               ` Eric Dumazet
2012-10-01 15:13                                 ` Chris Clayton
2012-10-01 15:31                                   ` Eric Dumazet
2012-10-01 16:19                                     ` Chris Clayton
2012-10-01 16:37                                       ` Eric Dumazet
2012-10-01 18:28                                         ` Chris Clayton
2012-10-01 18:34                                     ` Captain Obvious
2012-10-01 19:21                                       ` Eric Dumazet
2012-10-01 19:55                                         ` Chris Clayton
2012-10-01 19:22                                       ` Chris Clayton
2012-10-01 19:34                                 ` Dave Jones
2012-10-01 20:01                                   ` David Miller
2012-10-01 20:04                                     ` Eric Dumazet
2012-10-02 15:27                                       ` Edivaldo de Araújo Pereira
2012-10-02 15:35                                       ` Eric Dumazet
2012-10-02 15:48                                         ` Eric Dumazet [this message]
2012-10-02 15:57                                           ` Dave Jones
2012-10-02 16:06                                             ` Eric Dumazet
2012-10-02 18:25                                           ` David Miller
2012-10-02 21:14                                             ` Alexander Duyck
2012-10-02 21:35                                               ` Eric Dumazet
2012-10-02 23:24                                           ` Julian Anastasov
2012-10-03  3:10                                             ` David Miller
2012-10-03 15:01                                               ` Chris Clayton
2012-10-03 20:57                                               ` Julian Anastasov
2012-10-03  7:28                                             ` [PATCH] udp: increment UDP_MIB_NOPORTS in mcast receive Eric Dumazet
2012-10-03 12:45                                               ` David Stevens
2012-10-03 13:15                                                 ` Eric Dumazet
2012-10-03 14:09                                                   ` David Stevens
2012-10-03 15:29                                                     ` Eric Dumazet
2012-10-03 17:31                                                       ` David Stevens
2012-10-03 19:30                                                         ` David Miller
2012-10-03 17:39                                                     ` Rick Jones
2012-10-03  2:55                                           ` Possible networking regression in 3.6.0 David Miller
2012-10-04 11:25                                           ` [PATCH] ipv4: add a fib_type to fib_info Eric Dumazet
2012-10-04 13:08                                             ` Chris Clayton
2012-10-04 13:32                                               ` Eric Dumazet
2012-10-04 18:14                                                 ` David Miller
2012-09-18 14:44     ` Possible networking regression in 3.6.0 Chris Clayton

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1349192919.12401.778.camel@edumazet-glaptop \
    --to=eric.dumazet@gmail.com \
    --cc=chris2553@googlemail.com \
    --cc=davej@redhat.com \
    --cc=davem@davemloft.net \
    --cc=gpiez@web.de \
    --cc=netdev@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox