netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Patrick McHardy <kaber@trash.net>
To: "David S. Miller" <davem@redhat.com>
Cc: yoshfuji@linux-ipv6.org, Herbert Xu <herbert@debian.org>,
	netdev@oss.sgi.com
Subject: [PATCH 2.6]: Fix suboptimal fragment sizing for last fragment
Date: Thu, 02 Sep 2004 20:36:13 +0200	[thread overview]
Message-ID: <4137681D.3000902@trash.net> (raw)

[-- Attachment #1: Type: text/plain, Size: 810 bytes --]

Yoshifuji's recent fragment patch prevents unnecessary fragmentation
when the data fits in a single packet, but only for the first packet.
When fragmenting, all fragments, including the last one, are still
truncated to a multiple of 8 bytes, so we might end up creating an
unnecessary extra fragment.

This dump shows the problem (MTU 1499):

172.16.1.123.32771 > 172.16.195.3.4135: udp 2937 (frag 7066:1472@0+)
172.16.1.123 > 172.16.195.3: udp (frag 7066:1472@1472+)
172.16.1.123 > 172.16.195.3: udp (frag 7066:1@2944)

This patch always builds MTU-sized fragments and truncates the previous
fragment to a multiple of 8 bytes only when a new one is allocated. With
the patch applied, the dump looks like this:


172.16.1.123.32772 > 172.16.195.3.4135: udp 2937 (frag 49641:1472@0+)
172.16.1.123 > 172.16.195.3: udp (frag 49641:1473@1472)

Regards
Patrick


[-- Attachment #2: frag.diff --]
[-- Type: text/x-patch, Size: 7844 bytes --]

# This is a BitKeeper generated diff -Nru style patch.
#
# ChangeSet
#   2004/09/02 17:35:32+02:00 kaber@coreworks.de 
#   [IPV4/IPV6]: Fix suboptimal fragment sizing for last fragment
#   
#   Signed-off-by: Patrick McHardy <kaber@trash.net>
# 
# net/ipv6/ip6_output.c
#   2004/09/02 17:35:14+02:00 kaber@coreworks.de +13 -22
#   [IPV4/IPV6]: Fix suboptimal fragment sizing for last fragment
# 
# net/ipv4/ip_output.c
#   2004/09/02 17:35:14+02:00 kaber@coreworks.de +20 -49
#   [IPV4/IPV6]: Fix suboptimal fragment sizing for last fragment
# 
diff -Nru a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
--- a/net/ipv4/ip_output.c	2004-09-02 17:39:01 +02:00
+++ b/net/ipv4/ip_output.c	2004-09-02 17:39:01 +02:00
@@ -735,10 +735,10 @@
 	int hh_len;
 	int exthdrlen;
 	int mtu;
-	int copy = 0;
+	int copy;
 	int err;
 	int offset = 0;
-	unsigned int maxfraglen, fragheaderlen, fraggap = 0;
+	unsigned int maxfraglen, fragheaderlen;
 	int csummode = CHECKSUM_NONE;
 
 	if (flags&MSG_PROBE)
@@ -781,6 +781,7 @@
 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
 	if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
@@ -788,26 +789,11 @@
 	}
 
 	/*
-	 * Let's try using as much space as possible to avoid generating
-	 * additional unnecessary small fragment of length 
-	 * (mtu-fragheaderlen)%8 if mtu-fragheaderlen is not 0 modulo 8.
-	 * -- yoshfuji
-	 */
-	if (fragheaderlen + inet->cork.length + length <= mtu)
-		maxfraglen = mtu;
-	else
-		maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-
-	if (fragheaderlen + inet->cork.length <= mtu &&
-	    fragheaderlen + inet->cork.length + length > mtu)
-		fraggap = 1;
-
-	/*
 	 * transhdrlen > 0 means that this is the first fragment and we wish
 	 * it won't be fragmented in the future.
 	 */
 	if (transhdrlen &&
-	    length + fragheaderlen <= maxfraglen &&
+	    length + fragheaderlen <= mtu &&
 	    rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
 	    !exthdrlen)
 		csummode = CHECKSUM_HW;
@@ -821,34 +807,33 @@
 	 * adding appropriate IP header.
 	 */
 
-	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
-		fraggap = 0;
+	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 		goto alloc_new_skb;
-	}
 
 	while (length > 0) {
-		if ((copy = maxfraglen - skb->len) <= 0) {
+		if ((copy = mtu - skb->len) <= 0) {
 			char *data;
 			unsigned int datalen;
 			unsigned int fraglen;
+			unsigned int fraggap;
 			unsigned int alloclen;
 			struct sk_buff *skb_prev;
-			BUG_TRAP(fraggap || copy == 0);
+			BUG_TRAP(copy == 0);
 
 alloc_new_skb:
 			skb_prev = skb;
+			fraggap = 0;
+			if (skb_prev)
+				fraggap = mtu - maxfraglen;
 
-			if (fraggap)
-				fraggap = -copy;
-
-			datalen = maxfraglen - fragheaderlen;
+			datalen = mtu - fragheaderlen;
 			if (datalen > length + fraggap)
 				datalen = length + fraggap;
 
 			fraglen = datalen + fragheaderlen;
 			if ((flags & MSG_MORE) && 
 			    !(rt->u.dst.dev->features&NETIF_F_SG))
-				alloclen = maxfraglen;
+				alloclen = mtu;
 			else
 				alloclen = datalen + fragheaderlen;
 
@@ -913,7 +898,6 @@
 			length -= datalen - fraggap;
 			transhdrlen = 0;
 			exthdrlen = 0;
-			fraggap = 0;
 			csummode = CHECKSUM_NONE;
 
 			/*
@@ -1006,7 +990,7 @@
 	int mtu;
 	int len;
 	int err;
-	unsigned int maxfraglen, fragheaderlen, fraggap = 0;
+	unsigned int maxfraglen, fragheaderlen, fraggap;
 
 	if (inet->hdrincl)
 		return -EPERM;
@@ -1028,27 +1012,13 @@
 	mtu = inet->cork.fragsize;
 
 	fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
+	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
 
 	if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
 		ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
 		return -EMSGSIZE;
 	}
 
-	/*
-	 * Let's try using as much space as possible to avoid generating
-	 * additional unnecessary small fragment of length 
-	 * (mtu-fragheaderlen)%8 if mtu-fragheaderlen is not 0 modulo 8.
-	 * -- yoshfuji
-	 */
-	if (fragheaderlen + inet->cork.length + size <= mtu)
-		maxfraglen = mtu;
-	else
-		maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
-
-	if (fragheaderlen + inet->cork.length <= mtu &&
-	    fragheaderlen + inet->cork.length + size > mtu)
-		fraggap = 1;
-
 	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 		return -EINVAL;
 
@@ -1056,17 +1026,18 @@
 
 	while (size > 0) {
 		int i;
-		if ((len = maxfraglen - skb->len) <= 0) {
+		if ((len = mtu - skb->len) <= 0) {
 			struct sk_buff *skb_prev;
 			char *data;
 			struct iphdr *iph;
 			int alloclen;
 
-			BUG_TRAP(fraggap || len == 0);
+			BUG_TRAP(len == 0);
 
 			skb_prev = skb;
-			if (fraggap)
-				fraggap = -len;
+			fraggap = 0;
+			if (skb_prev)
+				fraggap = mtu - maxfraglen;
 
 			alloclen = fragheaderlen + hh_len + fraggap + 15;
 			skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
diff -Nru a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
--- a/net/ipv6/ip6_output.c	2004-09-02 17:39:01 +02:00
+++ b/net/ipv6/ip6_output.c	2004-09-02 17:39:01 +02:00
@@ -814,11 +814,11 @@
 	struct inet_opt *inet = inet_sk(sk);
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct sk_buff *skb;
-	unsigned int maxfraglen, fragheaderlen, fraggap = 0;
+	unsigned int maxfraglen, fragheaderlen;
 	int exthdrlen;
 	int hh_len;
 	int mtu;
-	int copy = 0;
+	int copy;
 	int err;
 	int offset = 0;
 	int csummode = CHECKSUM_NONE;
@@ -867,6 +867,7 @@
 	hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
 
 	fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
+	maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
 
 	if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
 		if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
@@ -883,46 +884,37 @@
 	 *
 	 * Note that we may need to "move" the data from the tail of
 	 * of the buffer to the new fragment when we split 
-	 * the message at the first time.
+	 * the message.
 	 *
 	 * FIXME: It may be fragmented into multiple chunks 
 	 *        at once if non-fragmentable extension headers
 	 *        are too large.
 	 * --yoshfuji 
 	 */
-	if (fragheaderlen + inet->cork.length + length <= mtu)
-		maxfraglen = mtu;
-	else
-		maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen 
-			     - sizeof(struct frag_hdr);
-
-	if (fragheaderlen + inet->cork.length <= mtu &&
-	    fragheaderlen + inet->cork.length + length > mtu)
-		fraggap = 1;
 
 	inet->cork.length += length;
 
-	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) {
-		fraggap = 0;
+	if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
 		goto alloc_new_skb;
-	}
 
 	while (length > 0) {
-		if ((copy = maxfraglen - skb->len) <= 0) {
+		if ((copy = mtu - skb->len) <= 0) {
 			char *data;
 			unsigned int datalen;
 			unsigned int fraglen;
+			unsigned int fraggap;
 			unsigned int alloclen;
 			struct sk_buff *skb_prev;
-			BUG_TRAP(fraggap || copy == 0);
+			BUG_TRAP(copy == 0);
 alloc_new_skb:
 			skb_prev = skb;
 
 			/* There's no room in the current skb */
-			if (fraggap)
-				fraggap = -copy;
+			fraggap = 0;
+			if (skb_prev)
+				fraggap = mtu - maxfraglen;
 
-			datalen = maxfraglen - fragheaderlen;
+			datalen = mtu - fragheaderlen;
 
 			if (datalen > length + fraggap)
 				datalen = length + fraggap;
@@ -930,7 +922,7 @@
 			fraglen = datalen + fragheaderlen;
 			if ((flags & MSG_MORE) &&
 			    !(rt->u.dst.dev->features&NETIF_F_SG))
-				alloclen = maxfraglen;
+				alloclen = mtu;
 			else
 				alloclen = datalen + fragheaderlen;
 
@@ -1005,7 +997,6 @@
 			length -= datalen - fraggap;
 			transhdrlen = 0;
 			exthdrlen = 0;
-			fraggap = 0;
 			csummode = CHECKSUM_NONE;
 
 			/*

             reply	other threads:[~2004-09-02 18:36 UTC|newest]

Thread overview: 11+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2004-09-02 18:36 Patrick McHardy [this message]
2004-09-02 19:48 ` [PATCH 2.6]: Fix suboptimal fragment sizing for last fragment YOSHIFUJI Hideaki / 吉藤英明
2004-09-02 21:44 ` David S. Miller
2004-09-02 22:03   ` Herbert Xu
2004-09-02 22:08     ` David S. Miller
2004-09-03  1:40     ` YOSHIFUJI Hideaki / 吉藤英明
2004-09-07 20:35       ` David S. Miller
2004-09-07 23:15       ` Herbert Xu
2004-09-07 23:26         ` YOSHIFUJI Hideaki / 吉藤英明
2004-09-08  3:21           ` Herbert Xu
2004-09-08 20:38             ` David S. Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4137681D.3000902@trash.net \
    --to=kaber@trash.net \
    --cc=davem@redhat.com \
    --cc=herbert@debian.org \
    --cc=netdev@oss.sgi.com \
    --cc=yoshfuji@linux-ipv6.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).