netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH net] ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu
@ 2018-03-09 16:43 Sabrina Dubroca
  2018-03-09 21:06 ` David Miller
  0 siblings, 1 reply; 4+ messages in thread
From: Sabrina Dubroca @ 2018-03-09 16:43 UTC (permalink / raw)
  To: netdev; +Cc: sbrivio, Sabrina Dubroca

Prior to the rework of PMTU information storage in commit
2c8cec5c10bc ("ipv4: Cache learned PMTU information in inetpeer."),
when a PMTU event advertising a PMTU smaller than
net.ipv4.route.min_pmtu was received, we would disable setting the DF
flag on packets by locking the MTU metric, and set the PMTU to
net.ipv4.route.min_pmtu.

Since then, we don't disable DF, and set the PMTU to
net.ipv4.route.min_pmtu, so the intermediate router that has this link
with a small MTU will have to drop the packets.

This patch reestablishes pre-2.6.39 behavior by adding rt_mtu_locked
to rtable, to record that we shouldn't set the DF bit on that path, and
use this in ip_dont_fragment().

One possible workaround is to set net.ipv4.route.min_pmtu to a value low
enough to accommodate the lowest MTU encountered.

Fixes: 2c8cec5c10bc ("ipv4: Cache learned PMTU information in inetpeer.")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Reviewed-by: Stefano Brivio <sbrivio@redhat.com>
---
 include/net/ip.h     | 11 +++++++++--
 include/net/ip_fib.h |  1 +
 include/net/route.h  |  1 +
 net/ipv4/route.c     | 25 ++++++++++++++++++-------
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 746abff9ce51..f49b3a576bec 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -328,6 +328,13 @@ int ip_decrease_ttl(struct iphdr *iph)
 	return --iph->ttl;
 }
 
+static inline int ip_mtu_locked(const struct dst_entry *dst)
+{
+	const struct rtable *rt = (const struct rtable *)dst;
+
+	return rt->rt_mtu_locked || dst_metric_locked(dst, RTAX_MTU);
+}
+
 static inline
 int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
 {
@@ -335,7 +342,7 @@ int ip_dont_fragment(const struct sock *sk, const struct dst_entry *dst)
 
 	return  pmtudisc == IP_PMTUDISC_DO ||
 		(pmtudisc == IP_PMTUDISC_WANT &&
-		 !(dst_metric_locked(dst, RTAX_MTU)));
+		 !ip_mtu_locked(dst));
 }
 
 static inline bool ip_sk_accept_pmtu(const struct sock *sk)
@@ -361,7 +368,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst,
 	struct net *net = dev_net(dst->dev);
 
 	if (net->ipv4.sysctl_ip_fwd_use_pmtu ||
-	    dst_metric_locked(dst, RTAX_MTU) ||
+	    ip_mtu_locked(dst) ||
 	    !forwarding)
 		return dst_mtu(dst);
 
diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
index f80524396c06..77d0a78cf7d2 100644
--- a/include/net/ip_fib.h
+++ b/include/net/ip_fib.h
@@ -59,6 +59,7 @@ struct fib_nh_exception {
 	int				fnhe_genid;
 	__be32				fnhe_daddr;
 	u32				fnhe_pmtu;
+	bool				fnhe_mtu_locked;
 	__be32				fnhe_gw;
 	unsigned long			fnhe_expires;
 	struct rtable __rcu		*fnhe_rth_input;
diff --git a/include/net/route.h b/include/net/route.h
index 1eb9ce470e25..729bb5e61e9a 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -64,6 +64,7 @@ struct rtable {
 
 	/* Miscellaneous cached information */
 	u32			rt_pmtu;
+	bool			rt_mtu_locked;
 
 	u32			rt_table_id;
 
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 860b3fd2f54b..c53a33af953a 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -634,6 +634,7 @@ static inline u32 fnhe_hashfun(__be32 daddr)
 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 {
 	rt->rt_pmtu = fnhe->fnhe_pmtu;
+	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 	rt->dst.expires = fnhe->fnhe_expires;
 
 	if (fnhe->fnhe_gw) {
@@ -644,7 +645,7 @@ static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnh
 }
 
 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
-				  u32 pmtu, unsigned long expires)
+				  u32 pmtu, bool lock, unsigned long expires)
 {
 	struct fnhe_hash_bucket *hash;
 	struct fib_nh_exception *fnhe;
@@ -681,8 +682,10 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 			fnhe->fnhe_genid = genid;
 		if (gw)
 			fnhe->fnhe_gw = gw;
-		if (pmtu)
+		if (pmtu) {
 			fnhe->fnhe_pmtu = pmtu;
+			fnhe->fnhe_mtu_locked = lock;
+		}
 		fnhe->fnhe_expires = max(1UL, expires);
 		/* Update all cached dsts too */
 		rt = rcu_dereference(fnhe->fnhe_rth_input);
@@ -706,6 +709,7 @@ static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 		fnhe->fnhe_daddr = daddr;
 		fnhe->fnhe_gw = gw;
 		fnhe->fnhe_pmtu = pmtu;
+		fnhe->fnhe_mtu_locked = lock;
 		fnhe->fnhe_expires = expires;
 
 		/* Exception created; mark the cached routes for the nexthop
@@ -787,7 +791,8 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
 				struct fib_nh *nh = &FIB_RES_NH(res);
 
 				update_or_create_fnhe(nh, fl4->daddr, new_gw,
-						0, jiffies + ip_rt_gc_timeout);
+						0, false,
+						jiffies + ip_rt_gc_timeout);
 			}
 			if (kill_route)
 				rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -1009,15 +1014,18 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
 	struct dst_entry *dst = &rt->dst;
 	struct fib_result res;
+	bool lock = false;
 
-	if (dst_metric_locked(dst, RTAX_MTU))
+	if (ip_mtu_locked(dst))
 		return;
 
 	if (ipv4_mtu(dst) < mtu)
 		return;
 
-	if (mtu < ip_rt_min_pmtu)
+	if (mtu < ip_rt_min_pmtu) {
+		lock = true;
 		mtu = ip_rt_min_pmtu;
+	}
 
 	if (rt->rt_pmtu == mtu &&
 	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
@@ -1027,7 +1035,7 @@ static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
 		struct fib_nh *nh = &FIB_RES_NH(res);
 
-		update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
+		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
 				      jiffies + ip_rt_mtu_expires);
 	}
 	rcu_read_unlock();
@@ -1280,7 +1288,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst)
 
 	mtu = READ_ONCE(dst->dev->mtu);
 
-	if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
+	if (unlikely(ip_mtu_locked(dst))) {
 		if (rt->rt_uses_gateway && mtu > 576)
 			mtu = 576;
 	}
@@ -1516,6 +1524,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
 		rt->rt_is_input = 0;
 		rt->rt_iif = 0;
 		rt->rt_pmtu = 0;
+		rt->rt_mtu_locked = false;
 		rt->rt_gateway = 0;
 		rt->rt_uses_gateway = 0;
 		rt->rt_table_id = 0;
@@ -2643,6 +2652,8 @@ static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
 	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
 	if (rt->rt_pmtu && expires)
 		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
+	if (rt->rt_mtu_locked && expires)
+		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
 	if (rtnetlink_put_metrics(skb, metrics) < 0)
 		goto nla_put_failure;
 
-- 
2.16.2

^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH net] ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu
  2018-03-09 16:43 [PATCH net] ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu Sabrina Dubroca
@ 2018-03-09 21:06 ` David Miller
  2018-03-12 16:05   ` Sabrina Dubroca
  0 siblings, 1 reply; 4+ messages in thread
From: David Miller @ 2018-03-09 21:06 UTC (permalink / raw)
  To: sd; +Cc: netdev, sbrivio

From: Sabrina Dubroca <sd@queasysnail.net>
Date: Fri,  9 Mar 2018 17:43:21 +0100

> diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> index f80524396c06..77d0a78cf7d2 100644
> --- a/include/net/ip_fib.h
> +++ b/include/net/ip_fib.h
> @@ -59,6 +59,7 @@ struct fib_nh_exception {
>  	int				fnhe_genid;
>  	__be32				fnhe_daddr;
>  	u32				fnhe_pmtu;
> +	bool				fnhe_mtu_locked;
>  	__be32				fnhe_gw;
>  	unsigned long			fnhe_expires;
>  	struct rtable __rcu		*fnhe_rth_input;
> diff --git a/include/net/route.h b/include/net/route.h
> index 1eb9ce470e25..729bb5e61e9a 100644
> --- a/include/net/route.h
> +++ b/include/net/route.h
> @@ -64,6 +64,7 @@ struct rtable {
>  
>  	/* Miscellaneous cached information */
>  	u32			rt_pmtu;
> +	bool			rt_mtu_locked;
>  
>  	u32			rt_table_id;
>  

Please use a flag bit for this, we've worked hard to shrink these
datastructures as much as possible.

I think if you just choose an unused RTCF_* bit (f.e. 0x02000000) for
the state, you can use that because values propagate into the
rtable->rt_flags, and do not propagate out.  So you should be able to
use it in this way privately inside the kernel.

Thanks.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH net] ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu
  2018-03-09 21:06 ` David Miller
@ 2018-03-12 16:05   ` Sabrina Dubroca
  2018-03-12 16:27     ` David Miller
  0 siblings, 1 reply; 4+ messages in thread
From: Sabrina Dubroca @ 2018-03-12 16:05 UTC (permalink / raw)
  To: David Miller; +Cc: netdev, sbrivio

2018-03-09, 16:06:19 -0500, David Miller wrote:
> From: Sabrina Dubroca <sd@queasysnail.net>
> Date: Fri,  9 Mar 2018 17:43:21 +0100
> 
> > diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h
> > index f80524396c06..77d0a78cf7d2 100644
> > --- a/include/net/ip_fib.h
> > +++ b/include/net/ip_fib.h
> > @@ -59,6 +59,7 @@ struct fib_nh_exception {
> >  	int				fnhe_genid;
> >  	__be32				fnhe_daddr;
> >  	u32				fnhe_pmtu;
> > +	bool				fnhe_mtu_locked;
> >  	__be32				fnhe_gw;
> >  	unsigned long			fnhe_expires;
> >  	struct rtable __rcu		*fnhe_rth_input;
> > diff --git a/include/net/route.h b/include/net/route.h
> > index 1eb9ce470e25..729bb5e61e9a 100644
> > --- a/include/net/route.h
> > +++ b/include/net/route.h
> > @@ -64,6 +64,7 @@ struct rtable {
> >  
> >  	/* Miscellaneous cached information */
> >  	u32			rt_pmtu;
> > +	bool			rt_mtu_locked;
> >  
> >  	u32			rt_table_id;
> >  
> 
> Please use a flag bit for this, we've worked hard to shrink these
> datastructures as much as possible.

Oops, sorry.

> I think if you just choose an unused RTCF_* bit (f.e. 0x02000000) for
> the state, you can use that because values propagate into the
> rtable->rt_flags, and do not propagate out.  So you should be able to
> use it in this way privately inside the kernel.

What about a bitfield?

-	u32			rt_pmtu;
+	u32			rt_mtu_locked:1,
+				rt_pmtu:31;

Since it's going to be private to the kernel, I'd rather not use a
value that's in uapi, especially considering that they're almost all
used (unless we start recycling).

-- 
Sabrina

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH net] ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu
  2018-03-12 16:05   ` Sabrina Dubroca
@ 2018-03-12 16:27     ` David Miller
  0 siblings, 0 replies; 4+ messages in thread
From: David Miller @ 2018-03-12 16:27 UTC (permalink / raw)
  To: sd; +Cc: netdev, sbrivio

From: Sabrina Dubroca <sd@queasysnail.net>
Date: Mon, 12 Mar 2018 17:05:28 +0100

> 2018-03-09, 16:06:19 -0500, David Miller wrote:
>> From: Sabrina Dubroca <sd@queasysnail.net>
>> Date: Fri,  9 Mar 2018 17:43:21 +0100
>> 
>> I think if you just choose an unused RTCF_* bit (f.e. 0x02000000) for
>> the state, you can use that because values propagate into the
>> rtable->rt_flags, and do not propagate out.  So you should be able to
>> use it in this way privately inside the kernel.
> 
> What about a bitfield?
> 
> -	u32			rt_pmtu;
> +	u32			rt_mtu_locked:1,
> +				rt_pmtu:31;

I guess that's fine.

^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2018-03-12 16:27 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2018-03-09 16:43 [PATCH net] ipv4: lock mtu in fnhe when received PMTU < net.ipv4.route.min_pmtu Sabrina Dubroca
2018-03-09 21:06 ` David Miller
2018-03-12 16:05   ` Sabrina Dubroca
2018-03-12 16:27     ` David Miller

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).