From: "Timo Teräs" <timo.teras@iki.fi>
To: netdev@vger.kernel.org
Cc: "Timo Teräs" <timo.teras@iki.fi>
Subject: [PATCH RFC net-next 6/6] ipv4: use next hop exceptions also for input routes
Date: Mon, 27 May 2013 14:16:16 +0300 [thread overview]
Message-ID: <1369653376-4731-7-git-send-email-timo.teras@iki.fi> (raw)
In-Reply-To: <1369653376-4731-1-git-send-email-timo.teras@iki.fi>
Commit d2d68ba9 (ipv4: Cache input routes in fib_info nexthops)
assmued that "locally destined, and routed packets, never trigger
PMTU events or redirects that will be processed by us".
However, it seems that tunnel devices do trigger PMTU events in certain
cases. At least ip_gre, ip6_gre, sit, and ipip do use the inner flow's
skb_dst(skb)->ops->update_pmtu to propage mtu information from the
outer flows. These can cause the inner flow mtu to be decreased. If
next hop exceptions are not consulted for pmtu, IP fragmentation will
not be done properly for these routes.
It also seems that we really need to have the PMTU information always
for netfilter TCPMSS clamp-to-pmtu feature to work properly.
So for the time being, cache separate copies of input routes for
each next hop exception.
Signed-off-by: Timo Teräs <timo.teras@iki.fi>
---
include/net/ip_fib.h | 3 ++-
net/ipv4/fib_semantics.c | 3 ++-
net/ipv4/route.c | 65 +++++++++++++++++++++++++++++++++++++-----------
3 files changed, 54 insertions(+), 17 deletions(-)
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index 44424e9..aac8553 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -56,7 +56,8 @@ struct fib_nh_exception {
u32 fnhe_pmtu;
__be32 fnhe_gw;
unsigned long fnhe_expires;
- struct rtable __rcu *fnhe_rth;
+ struct rtable __rcu *fnhe_rth_input;
+ struct rtable __rcu *fnhe_rth_output;
unsigned long fnhe_stamp;
};
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 8f6cb7a..d5dbca5 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -169,7 +169,8 @@ static void free_nh_exceptions(struct fib_nh *nh)
next = rcu_dereference_protected(fnhe->fnhe_next, 1);
- rt_fibinfo_free(&fnhe->fnhe_rth);
+ rt_fibinfo_free(&fnhe->fnhe_rth_input);
+ rt_fibinfo_free(&fnhe->fnhe_rth_output);
kfree(fnhe);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 403e283..82f2074 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -565,10 +565,25 @@ static inline void rt_free(struct rtable *rt)
static DEFINE_SPINLOCK(fnhe_lock);
+static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
+{
+ struct rtable *rt;
+
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
+ rt_free(rt);
+ }
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
+ if (rt) {
+ RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
+ rt_free(rt);
+ }
+}
+
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
struct fib_nh_exception *fnhe, *oldest;
- struct rtable *orig;
oldest = rcu_dereference(hash->chain);
for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
@@ -576,11 +591,7 @@ static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
oldest = fnhe;
}
- orig = rcu_dereference(oldest->fnhe_rth);
- if (orig) {
- RCU_INIT_POINTER(oldest->fnhe_rth, NULL);
- rt_free(orig);
- }
+ fnhe_flush_routes(oldest);
return oldest;
}
@@ -644,7 +655,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
fnhe->fnhe_expires = max(1UL, expires);
}
/* Update all cached dsts too */
- rt = rcu_dereference(fnhe->fnhe_rth);
+ rt = rcu_dereference(fnhe->fnhe_rth_input);
+ if (rt)
+ fill_route_from_fnhe(rt, fnhe);
+ rt = rcu_dereference(fnhe->fnhe_rth_output);
if (rt)
fill_route_from_fnhe(rt, fnhe);
} else {
@@ -668,6 +682,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
* stale, so anyone caching it rechecks if this exception
* applies to them.
*/
+ rt = rcu_dereference(nh->nh_rth_input);
+ if (rt)
+ rt->dst.obsolete = DST_OBSOLETE_KILL;
+
for_each_possible_cpu(i) {
struct rtable __rcu **prt;
prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
@@ -1237,25 +1255,36 @@ static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
spin_lock_bh(&fnhe_lock);
if (daddr == fnhe->fnhe_daddr) {
+ struct rtable __rcu **porig;
+ struct rtable *orig;
int genid = fnhe_genid(dev_net(rt->dst.dev));
- struct rtable *orig = rcu_dereference(fnhe->fnhe_rth);
+
+ if (rt_is_input_route(rt))
+ porig = &fnhe->fnhe_rth_input;
+ else
+ porig = &fnhe->fnhe_rth_output;
+ orig = rcu_dereference(*porig);
if (fnhe->fnhe_genid != genid) {
fnhe->fnhe_genid = genid;
fnhe->fnhe_gw = 0;
fnhe->fnhe_pmtu = 0;
fnhe->fnhe_expires = 0;
+ fnhe_flush_routes(fnhe);
+ orig = NULL;
}
fill_route_from_fnhe(rt, fnhe);
if (!rt->rt_gateway)
rt->rt_gateway = daddr;
- rcu_assign_pointer(fnhe->fnhe_rth, rt);
- if (orig)
- rt_free(orig);
+ if (!(rt->dst.flags & DST_NOCACHE)) {
+ rcu_assign_pointer(*porig, rt);
+ if (orig)
+ rt_free(orig);
+ ret = true;
+ }
fnhe->fnhe_stamp = jiffies;
- ret = true;
}
spin_unlock_bh(&fnhe_lock);
@@ -1487,6 +1516,7 @@ static int __mkroute_input(struct sk_buff *skb,
struct in_device *in_dev,
__be32 daddr, __be32 saddr, u32 tos)
{
+ struct fib_nh_exception *fnhe;
struct rtable *rth;
int err;
struct in_device *out_dev;
@@ -1533,8 +1563,13 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
+ fnhe = find_exception(&FIB_RES_NH(*res), daddr);
if (do_cache) {
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+ if (fnhe != NULL)
+ rth = rcu_dereference(fnhe->fnhe_rth_input);
+ else
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
@@ -1562,7 +1597,7 @@ static int __mkroute_input(struct sk_buff *skb,
rth->dst.input = ip_forward;
rth->dst.output = ip_output;
- rt_set_nexthop(rth, daddr, res, NULL, res->fi, res->type, itag);
+ rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
skb_dst_set(skb, &rth->dst);
out:
err = 0;
@@ -1877,7 +1912,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
fnhe = find_exception(nh, fl4->daddr);
if (fnhe)
- prth = &fnhe->fnhe_rth;
+ prth = &fnhe->fnhe_rth_output;
else {
if (unlikely(fl4->flowi4_flags &
FLOWI_FLAG_KNOWN_NH &&
--
1.8.2.3
next prev parent reply other threads:[~2013-05-27 11:14 UTC|newest]
Thread overview: 24+ messages / expand[flat|nested] mbox.gz Atom feed top
2013-05-27 11:16 [PATCH net-next 0/6] ipv4 fixes for dmvpn Timo Teräs
2013-05-27 11:16 ` [PATCH net-next 1/6] net: inform NETDEV_CHANGE callbacks which flags were changed Timo Teräs
2013-05-27 11:18 ` Jiri Pirko
2013-05-27 11:16 ` [PATCH net-next 2/6] arp: flush arp cache on IFF_NOARP change Timo Teräs
2013-05-27 11:16 ` [PATCH net-next 3/6] ipv4: properly refresh rtable entries on pmtu/redirect events Timo Teräs
2013-05-27 11:16 ` [PATCH net-next 4/6] ipv4: rate limit updating of next hop exceptions with same pmtu Timo Teräs
2013-05-27 11:16 ` [PATCH net-next 5/6] ipv4: use separate genid for next hop exceptions Timo Teräs
2013-05-27 11:16 ` Timo Teräs [this message]
2013-05-28 6:33 ` [PATCH net-next 0/6] ipv4 fixes for dmvpn Timo Teras
2013-05-28 6:38 ` David Miller
2013-05-28 6:46 ` [PATCH net-next 1/3] ipv4: properly refresh rtable entries on pmtu/redirect events Timo Teräs
2013-05-28 6:46 ` [PATCH net-next 2/3] ipv4: rate limit updating of next hop exceptions with same pmtu Timo Teräs
2013-05-28 8:45 ` Julian Anastasov
2013-05-28 10:07 ` Timo Teras
2013-05-28 21:04 ` Julian Anastasov
2013-05-29 5:07 ` Timo Teras
2013-05-29 22:49 ` Julian Anastasov
2013-06-03 7:10 ` David Miller
2013-05-28 6:46 ` [PATCH net-next 3/3] ipv4: use separate genid for next hop exceptions Timo Teräs
2013-06-03 7:10 ` David Miller
2013-05-28 8:25 ` [PATCH net-next 1/3] ipv4: properly refresh rtable entries on pmtu/redirect events Julian Anastasov
2013-05-28 8:53 ` Timo Teras
2013-05-28 19:44 ` Julian Anastasov
2013-06-03 7:09 ` David Miller
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1369653376-4731-7-git-send-email-timo.teras@iki.fi \
--to=timo.teras@iki.fi \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).