From mboxrd@z Thu Jan 1 00:00:00 1970 From: David Miller Subject: Re: [PATCH 2/5] ipv4: Kill ip_rt_frag_needed(). Date: Mon, 11 Jun 2012 16:02:58 -0700 (PDT) Message-ID: <20120611.160258.866525532025442350.davem@davemloft.net> References: <20120611.042024.1022194952800114410.davem@davemloft.net> <20120611.042813.401909584318598192.davem@davemloft.net> <20120611114256.GL27795@secunet.com> Mime-Version: 1.0 Content-Type: Text/Plain; charset=us-ascii Content-Transfer-Encoding: 7bit Cc: netdev@vger.kernel.org To: steffen.klassert@secunet.com Return-path: Received: from shards.monkeyblade.net ([149.20.54.216]:46775 "EHLO shards.monkeyblade.net" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1751495Ab2FKXC7 (ORCPT ); Mon, 11 Jun 2012 19:02:59 -0400 In-Reply-To: <20120611114256.GL27795@secunet.com> Sender: netdev-owner@vger.kernel.org List-ID: From: Steffen Klassert Date: Mon, 11 Jun 2012 13:42:56 +0200 > On Mon, Jun 11, 2012 at 04:28:13AM -0700, David Miller wrote: >> From: David Miller >> Date: Mon, 11 Jun 2012 04:20:24 -0700 (PDT) >> >> > We need to find a way to implement this then, in such a way >> > that we have the route context used to send the ping packet >> > out. >> >> The problem is RAW sockets right? If so, then this is where the >> fix belongs. >> > > Hm, I've just tried with tracepath (udp) and I also don't see the > pmtu informations cached. > > I still had no time to look deeper into the new inetpeer code, > I've just gave it a quick try. I'll try to find out what's going on. Here below is the kind of patch I was suggesting we make. I did a simple test to make sure the update MTU code path is taken in raw_err(). But I'm having second thoughts about whether any of this is a good idea. UDP works by notifying userspace of PMTU events. And this is mandatory, if we're setting DF we have to get the user to decrease the size of it's datagram writes below the reported PMTU value. As a consequence I believe RAW sockets should also work via notifications. And therefore it can be argued that in neither case should we update the routing cache PMTU information. This is in line with the fact that TCP goes to great lengths to validate that the PMTU it's getting is really legitimate and not forged, and UDP and RAW have no way of making such strict checks. And it also shows that those TCP strict checks were for nothing beforehand. That PMTU update guard done by TCP was (before my changes this past weekend) pointless because we were doing the PMTU update unconditionally earlier in the ICMP handling. diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 4032b81..3e2507b 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -242,6 +242,17 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) err = icmp_err_convert[code].errno; harderr = icmp_err_convert[code].fatal; if (code == ICMP_FRAG_NEEDED) { + struct flowi4 *fl4 = &inet->cork.fl.u.ip4; + struct rtable *rt; + + bh_lock_sock(sk); + rt = ip_route_output_flow(sock_net(sk), fl4, sk); + if (!IS_ERR(rt)) { + rt->dst.ops->update_pmtu(&rt->dst, info); + ip_rt_put(rt); + } + bh_unlock_sock(sk); + harderr = inet->pmtudisc != IP_PMTUDISC_DONT; err = EMSGSIZE; } @@ -316,9 +327,8 @@ int raw_rcv(struct sock *sk, struct sk_buff *skb) return 0; } -static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, - void *from, size_t length, - struct rtable **rtp, +static int raw_send_hdrinc(struct sock *sk, __be32 saddr, __be32 daddr, + void *from, size_t length, struct rtable **rtp, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); @@ -331,7 +341,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, int hlen, tlen; if (length > rt->dst.dev->mtu) { - ip_local_error(sk, EMSGSIZE, fl4->daddr, inet->inet_dport, + ip_local_error(sk, EMSGSIZE, daddr, inet->inet_dport, rt->dst.dev->mtu); return -EMSGSIZE; } @@ -378,7 +388,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, if (iphlen >= sizeof(*iph)) { if (!iph->saddr) - iph->saddr = fl4->saddr; + iph->saddr = saddr; iph->check = 0; iph->tot_len = htons(length); if (!iph->id) @@ -461,7 +471,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct inet_sock *inet = inet_sk(sk); struct ipcm_cookie ipc; struct rtable *rt = NULL; - struct flowi4 fl4; + struct flowi4 *fl4; int free = 0; __be32 daddr; __be32 saddr; @@ -563,54 +573,66 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, } else if (!ipc.oif) ipc.oif = inet->uc_index; - flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos, + lock_sock(sk); + fl4 = &inet->cork.fl.u.ip4; + flowi4_init_output(fl4, ipc.oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE, inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, inet_sk_flowi_flags(sk) | FLOWI_FLAG_CAN_SLEEP, daddr, saddr, 0, 0); if (!inet->hdrincl) { - err = raw_probe_proto_opt(&fl4, msg); + err = raw_probe_proto_opt(fl4, msg); if (err) - goto done; + goto done_unlock; } - security_sk_classify_flow(sk, flowi4_to_flowi(&fl4)); - rt = ip_route_output_flow(sock_net(sk), &fl4, sk); + security_sk_classify_flow(sk, flowi4_to_flowi(fl4)); + rt = ip_route_output_flow(sock_net(sk), fl4, sk); if (IS_ERR(rt)) { err = PTR_ERR(rt); rt = NULL; - goto done; + goto done_unlock; } err = -EACCES; if (rt->rt_flags & RTCF_BROADCAST && !sock_flag(sk, SOCK_BROADCAST)) - goto done; + goto done_unlock; + + if (msg->msg_flags & MSG_CONFIRM) { + dst_confirm(&rt->dst); + if ((msg->msg_flags & MSG_PROBE) && !len) { + err = 0; + goto done_unlock; + } + } - if (msg->msg_flags & MSG_CONFIRM) - goto do_confirm; -back_from_confirm: + if (inet->hdrincl) { + __be32 saddr = fl4->saddr; + __be32 daddr = fl4->daddr; - if (inet->hdrincl) - err = raw_send_hdrinc(sk, &fl4, msg->msg_iov, len, + release_sock(sk); + err = raw_send_hdrinc(sk, saddr, daddr, + msg->msg_iov, len, &rt, msg->msg_flags); - else { + goto done; + } else { if (!ipc.addr) - ipc.addr = fl4.daddr; - lock_sock(sk); - err = ip_append_data(sk, &fl4, ip_generic_getfrag, + ipc.addr = fl4->daddr; + err = ip_append_data(sk, fl4, ip_generic_getfrag, msg->msg_iov, len, 0, &ipc, &rt, msg->msg_flags); if (err) ip_flush_pending_frames(sk); else if (!(msg->msg_flags & MSG_MORE)) { - err = ip_push_pending_frames(sk, &fl4); + err = ip_push_pending_frames(sk, fl4); if (err == -ENOBUFS && !inet->recverr) err = 0; } - release_sock(sk); } +done_unlock: + release_sock(sk); done: if (free) kfree(ipc.opt); @@ -620,13 +642,6 @@ out: if (err < 0) return err; return len; - -do_confirm: - dst_confirm(&rt->dst); - if (!(msg->msg_flags & MSG_PROBE) || len) - goto back_from_confirm; - err = 0; - goto done; } static void raw_close(struct sock *sk, long timeout)