From: John Heffner <jheffner@psc.edu>
To: David Miller <davem@davemloft.net>
Cc: Patrick McHardy <kaber@trash.net>,
netdev@vger.kernel.org, John Heffner <jheffner@psc.edu>
Subject: [PATCH 4/4] [NET] Add IP(V6)_PMTUDISC_RPOBE
Date: Wed, 18 Apr 2007 21:09:42 -0400 [thread overview]
Message-ID: <11769449832206-git-send-email-jheffner@psc.edu> (raw)
In-Reply-To: <11769449831291-git-send-email-jheffner@psc.edu>
Add IP(V6)_PMTUDISC_PROBE value for IP(V6)_MTU_DISCOVER. This option forces
us not to fragment, but does not make use of the kernel path MTU discovery.
That is, it allows for user-mode MTU probing (or, packetization-layer path
MTU discovery). This is particularly useful for diagnostic utilities, like
traceroute/tracepath.
Signed-off-by: John Heffner <jheffner@psc.edu>
---
include/linux/in.h | 1 +
include/linux/in6.h | 1 +
net/ipv4/ip_output.c | 20 +++++++++++++++-----
net/ipv4/ip_sockglue.c | 2 +-
net/ipv6/ip6_output.c | 15 ++++++++++++---
net/ipv6/ipv6_sockglue.c | 2 +-
6 files changed, 31 insertions(+), 10 deletions(-)
diff --git a/include/linux/in.h b/include/linux/in.h
index 1912e7c..3975cbf 100644
--- a/include/linux/in.h
+++ b/include/linux/in.h
@@ -83,6 +83,7 @@ struct in_addr {
#define IP_PMTUDISC_DONT 0 /* Never send DF frames */
#define IP_PMTUDISC_WANT 1 /* Use per route hints */
#define IP_PMTUDISC_DO 2 /* Always DF */
+#define IP_PMTUDISC_PROBE 3 /* Ignore dst pmtu */
#define IP_MULTICAST_IF 32
#define IP_MULTICAST_TTL 33
diff --git a/include/linux/in6.h b/include/linux/in6.h
index 4e8350a..d559fac 100644
--- a/include/linux/in6.h
+++ b/include/linux/in6.h
@@ -179,6 +179,7 @@ struct in6_flowlabel_req
#define IPV6_PMTUDISC_DONT 0
#define IPV6_PMTUDISC_WANT 1
#define IPV6_PMTUDISC_DO 2
+#define IPV6_PMTUDISC_PROBE 3
/* Flowlabel */
#define IPV6_FLOWLABEL_MGR 32
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 34606ef..66e2c3a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -189,6 +189,14 @@ static inline int ip_finish_output2(struct sk_buff *skb)
return -EINVAL;
}
+static inline int ip_skb_dst_mtu(struct sk_buff *skb)
+{
+ struct inet_sock *inet = skb->sk ? inet_sk(skb->sk) : NULL;
+
+ return (inet && inet->pmtudisc == IP_PMTUDISC_PROBE) ?
+ skb->dst->dev->mtu : dst_mtu(skb->dst);
+}
+
static inline int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
@@ -198,7 +206,7 @@ static inline int ip_finish_output(struct sk_buff *skb)
return dst_output(skb);
}
#endif
- if (skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb))
+ if (skb->len > ip_skb_dst_mtu(skb) && !skb_is_gso(skb))
return ip_fragment(skb, ip_finish_output2);
else
return ip_finish_output2(skb);
@@ -422,7 +430,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
if (unlikely((iph->frag_off & htons(IP_DF)) && !skb->local_df)) {
IP_INC_STATS(IPSTATS_MIB_FRAGFAILS);
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED,
- htonl(dst_mtu(&rt->u.dst)));
+ htonl(ip_skb_dst_mtu(skb)));
kfree_skb(skb);
return -EMSGSIZE;
}
@@ -787,7 +795,9 @@ int ip_append_data(struct sock *sk,
inet->cork.addr = ipc->addr;
}
dst_hold(&rt->u.dst);
- inet->cork.fragsize = mtu = dst_mtu(rt->u.dst.path);
+ inet->cork.fragsize = mtu = inet->pmtudisc == IP_PMTUDISC_PROBE ?
+ rt->u.dst.dev->mtu :
+ dst_mtu(rt->u.dst.path);
inet->cork.rt = rt;
inet->cork.length = 0;
sk->sk_sndmsg_page = NULL;
@@ -1203,13 +1213,13 @@ int ip_push_pending_frames(struct sock *sk)
* to fragment the frame generated here. No matter, what transforms
* how transforms change size of the packet, it will come out.
*/
- if (inet->pmtudisc != IP_PMTUDISC_DO)
+ if (inet->pmtudisc < IP_PMTUDISC_DO)
skb->local_df = 1;
/* DF bit is set when we want to see DF on outgoing frames.
* If local_df is set too, we still allow to fragment this frame
* locally. */
- if (inet->pmtudisc == IP_PMTUDISC_DO ||
+ if (inet->pmtudisc >= IP_PMTUDISC_DO ||
(skb->len <= dst_mtu(&rt->u.dst) &&
ip_dont_fragment(sk, &rt->u.dst)))
df = htons(IP_DF);
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c199d23..4d54457 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -542,7 +542,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
inet->hdrincl = val ? 1 : 0;
break;
case IP_MTU_DISCOVER:
- if (val<0 || val>2)
+ if (val<0 || val>3)
goto e_inval;
inet->pmtudisc = val;
break;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index 5a5b7d4..f508171 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -137,9 +137,17 @@ static int ip6_output2(struct sk_buff *skb)
return NF_HOOK(PF_INET6, NF_IP6_POST_ROUTING, skb,NULL, skb->dev,ip6_output_finish);
}
+static inline int ip6_skb_dst_mtu(struct sk_buff *skb)
+{
+ struct ipv6_pinfo *np = skb->sk ? inet6_sk(skb->sk) : NULL;
+
+ return (np && np->pmtudisc == IPV6_PMTUDISC_PROBE) ?
+ skb->dst->dev->mtu : dst_mtu(skb->dst);
+}
+
int ip6_output(struct sk_buff *skb)
{
- if ((skb->len > dst_mtu(skb->dst) && !skb_is_gso(skb)) ||
+ if ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) ||
dst_allfrag(skb->dst))
return ip6_fragment(skb, ip6_output2);
else
@@ -566,7 +574,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
hlen = ip6_find_1stfragopt(skb, &prevhdr);
nexthdr = *prevhdr;
- mtu = dst_mtu(&rt->u.dst);
+ mtu = ip6_skb_dst_mtu(skb);
/* We must not fragment if the socket is set to force MTU discovery
* or if the skb it not generated by a local socket. (This last
@@ -1063,7 +1071,8 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to,
inet->cork.fl = *fl;
np->cork.hop_limit = hlimit;
np->cork.tclass = tclass;
- mtu = dst_mtu(rt->u.dst.path);
+ mtu = np->pmtudisc == IPV6_PMTUDISC_PROBE ?
+ rt->u.dst.dev->mtu : dst_mtu(rt->u.dst.path);
if (np->frag_size < mtu) {
if (np->frag_size)
mtu = np->frag_size;
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index da930fa..aa3d07c 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -694,7 +694,7 @@ done:
retv = ip6_ra_control(sk, val, NULL);
break;
case IPV6_MTU_DISCOVER:
- if (val<0 || val>2)
+ if (val<0 || val>3)
goto e_inval;
np->pmtudisc = val;
retv = 0;
--
1.5.1.rc3.30.ga8f4-dirty
prev parent reply other threads:[~2007-04-19 1:09 UTC|newest]
Thread overview: 26+ messages / expand[flat|nested] mbox.gz Atom feed top
2007-03-24 0:06 [PATCH 1/3] [NET] Do pmtu check in transport layer John Heffner
2007-03-24 0:06 ` [PATCH 2/3] [NET] Move DF check to ip_forward John Heffner
2007-03-24 0:06 ` [PATCH 3/3] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
2007-03-25 4:23 ` David Miller
2007-03-27 14:18 ` Andi Kleen
[not found] ` <4609640D.7010709@psc.edu>
[not found] ` <20070327193115.GA28138@one.firstfloor.org>
2007-03-27 19:52 ` [PATCH] ip(7) IP_PMTUDISC_PROBE John Heffner
2007-04-08 18:08 ` Michael Kerrisk
2007-03-25 4:17 ` [PATCH 2/3] [NET] Move DF check to ip_forward David Miller
2007-03-25 13:37 ` [NET]: Fix breakage, use ip_hdr() for DF check in ip_forward Thomas Graf
2007-03-25 20:27 ` David Miller
2007-03-25 4:14 ` [PATCH 1/3] [NET] Do pmtu check in transport layer David Miller
2007-04-09 8:40 ` Patrick McHardy
2007-04-09 16:23 ` John Heffner
2007-04-09 16:40 ` Patrick McHardy
2007-04-19 1:07 ` [PATCH 0/0] Re-try changes for PMTUDISC_PROBE John Heffner
2007-04-20 22:55 ` David Miller
2007-04-19 1:07 ` [PATCH] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE" John Heffner
2007-04-19 1:07 ` [PATCH] Revert "[NET] Do pmtu check in transport layer" John Heffner
2007-04-19 1:07 ` [PATCH] [NET] MTU discovery check in ip6_fragment() John Heffner
2007-04-19 1:07 ` [PATCH] [NET] Add IP(V6)_PMTUDISC_RPOBE John Heffner
2007-04-19 1:11 ` John Heffner
2007-04-19 1:25 ` David Miller
2007-04-19 1:09 ` [PATCH 1/4] Revert "[NET] Add IP(V6)_PMTUDISC_RPOBE" John Heffner
2007-04-19 1:09 ` [PATCH 2/4] Revert "[NET] Do pmtu check in transport layer" John Heffner
2007-04-19 1:09 ` [PATCH 3/4] [NET] MTU discovery check in ip6_fragment() John Heffner
2007-04-19 1:09 ` John Heffner [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=11769449832206-git-send-email-jheffner@psc.edu \
--to=jheffner@psc.edu \
--cc=davem@davemloft.net \
--cc=kaber@trash.net \
--cc=netdev@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.