netdev.vger.kernel.org archive mirror
* Fragment ID wrap workaround (read-only, untested).
@ 2004-07-15  5:57 Rusty Russell (IBM)
  2004-07-15  8:28 ` David Stevens
  0 siblings, 1 reply; 11+ messages in thread
From: Rusty Russell (IBM) @ 2004-07-15  5:57 UTC (permalink / raw)
  To: netdev

Hi all,

	I spoke about this today, so I thought I'd send the code out.  It's
useful only for reading, as it's entirely untested and some of it is
tricky and needs careful thought.

Name: Fragment ID Wrap Workaround
Status: Untested
Signed-off-by: Rusty Russell <rusty@au.ibm.com> (authored)

There's at least one old IBM Bugzilla bug in which fragment IDs
wrapped, causing NFS data corruption during UDP stress testing.

The solution presented here is twofold:

1) Move the offset of the fragments every time the ID wraps (usually
   the packet doesn't fit exactly into the MTU, so we have some
   slack; a sketch of the idea follows this list), and

2) Check that the contents of overlapping fragments match: if not,
   drop the whole fragment queue.
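
A minimal sketch of the boundary-shift idea in (1), assuming a 32-bit ID
counter whose upper 16 bits count wraps of the on-wire 16-bit ID; the
function and parameter names are invented for illustration and are not
part of the patch (the real logic lives in the ip_output.c hunk below):

static unsigned int first_frag_len(unsigned int total_len, unsigned int mtu,
				   unsigned int id32)
{
	unsigned int len = mtu & ~7;	/* usual first-fragment payload length */
	unsigned int slack = (total_len % mtu) & ~7; /* spare room, 8-aligned */
	unsigned int wraps = id32 >> 16; /* how often the 16-bit ID has wrapped */
	unsigned int shift;

	if (total_len <= mtu)
		return total_len;	/* no fragmentation needed */
	if (!slack)
		return len;		/* packet fills the MTU exactly */

	/* Pull the first boundary back 8 bytes per wrap, cycling within
	 * the slack so fragment offsets stay 8-byte aligned. */
	shift = (wraps << 3) % (slack + 8);
	return shift < len ? len - shift : len;
}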

Note that I also implemented skb_iter functions so I could compare the
fragment overlap efficiently; they really should be a separate patch.
DaveM points out (FIXME) that doing the double walk means we need to
guarantee two kmaps for the networking code.
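
As a rough usage sketch of the iterator interface declared in the
skbuff.h hunk below: it walks every data chunk of an skb (linear head,
paged frags, then any frag_list skbs) without linearizing.  The
skb_xor_sum() helper is invented for the example and is not in the patch.

static u8 skb_xor_sum(const struct sk_buff *skb)
{
	struct skb_iter iter;
	unsigned int n;
	u8 sum = 0;

	skb_iter_first(skb, &iter);
	do {
		for (n = 0; n < iter.len; n++)
			sum ^= iter.data[n];	/* iter.data is mapped for us */
	} while (skb_iter_next(skb, &iter));

	/* skb_iter_next() returned 0, so nothing is left mapped; call
	 * skb_iter_abort() only when breaking out of the loop early. */
	return sum;
}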

The same applies to IPv6.  A simpler implementation would just drop all
fragments on any overlap, treating it as a "doesn't happen in real life"
case (it needs someone to duplicate a packet, then send each copy along
a path with a different MTU).

diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .4882-linux-2.6.7-bk20/include/linux/ip.h .4882-linux-2.6.7-bk20.updated/include/linux/ip.h
--- .4882-linux-2.6.7-bk20/include/linux/ip.h	2004-07-08 15:10:10.000000000 +1000
+++ .4882-linux-2.6.7-bk20.updated/include/linux/ip.h	2004-07-09 13:08:42.000000000 +1000
@@ -118,12 +118,12 @@ struct inet_opt {
 	int			tos;		/* TOS */
 	unsigned	   	cmsg_flags;
 	struct ip_options	*opt;
+	__u32			id;		/* ID counter for DF pkts */
 	__u16			sport;		/* Source port */
 	unsigned char		hdrincl;	/* Include headers ? */
 	__u8			mc_ttl;		/* Multicasting TTL */
 	__u8			mc_loop;	/* Loopback */
 	__u8			pmtudisc;
-	__u16			id;		/* ID counter for DF pkts */
 	unsigned		recverr : 1,
 				freebind : 1;
 	int			mc_index;	/* Multicast device index */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .4882-linux-2.6.7-bk20/include/linux/skbuff.h .4882-linux-2.6.7-bk20.updated/include/linux/skbuff.h
--- .4882-linux-2.6.7-bk20/include/linux/skbuff.h	2004-07-08 15:10:11.000000000 +1000
+++ .4882-linux-2.6.7-bk20.updated/include/linux/skbuff.h	2004-07-09 14:31:11.000000000 +1000
@@ -1108,6 +1108,23 @@ extern void	       skb_split(struct sk_b
 extern void skb_init(void);
 extern void skb_add_mtu(int mtu);
 
+struct skb_iter
+{
+	/* Iteration functions set these */
+	unsigned char *data;
+	unsigned int len;
+
+	/* Private to iteration */
+	unsigned int nextfrag;
+	struct sk_buff *fraglist;
+};
+
+/* Keep iterating until skb_iter_next returns false. */
+extern void skb_iter_first(const struct sk_buff *skb, struct skb_iter *i);
+extern int skb_iter_next(const struct sk_buff *skb, struct skb_iter *i);
+/* Call this if aborting loop before !skb_iter_next */
+extern void skb_iter_abort(const struct sk_buff *skb, struct skb_iter *i);
+
 #ifdef CONFIG_NETFILTER
 static inline void nf_conntrack_put(struct nf_ct_info *nfct)
 {
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .4882-linux-2.6.7-bk20/net/core/skbuff.c .4882-linux-2.6.7-bk20.updated/net/core/skbuff.c
--- .4882-linux-2.6.7-bk20/net/core/skbuff.c	2004-07-08 15:10:12.000000000 +1000
+++ .4882-linux-2.6.7-bk20.updated/net/core/skbuff.c	2004-07-09 14:35:28.000000000 +1000
@@ -929,6 +929,70 @@ fault:
 	return -EFAULT;
 }
 
+/* Keep iterating until skb_iter_next returns false. */
+void skb_iter_first(const struct sk_buff *skb, struct skb_iter *i)
+{
+	i->len = skb_headlen(skb);
+	i->data = (unsigned char *)skb->data;
+	i->nextfrag = 0;
+	i->fraglist = NULL;
+}
+
+int skb_iter_next(const struct sk_buff *skb, struct skb_iter *i)
+{
+	/* Unmap previous, if not head fragment. */
+	if (i->nextfrag)
+		kunmap_skb_frag(i->data);
+
+	if (i->fraglist) {
+	fraglist:
+		/* We're iterating through fraglist. */
+		if (i->nextfrag < skb_shinfo(i->fraglist)->nr_frags) {
+			i->data = kmap_skb_frag(&skb_shinfo(i->fraglist)
+						->frags[i->nextfrag]);
+			i->len = skb_shinfo(i->fraglist)->frags[i->nextfrag]
+				.size;
+			i->nextfrag++;
+			return 1;
+		}
+		/* Fragments with fragments?  Too hard! */
+		BUG_ON(skb_shinfo(i->fraglist)->frag_list);
+		i->fraglist = i->fraglist->next;
+		if (!i->fraglist)
+			goto end;
+
+		i->len = skb_headlen(i->fraglist);
+		i->data = i->fraglist->data;
+		i->nextfrag = 0;
+		return 1;
+	}
+
+	if (i->nextfrag < skb_shinfo(skb)->nr_frags) {
+		i->data = kmap_skb_frag(&skb_shinfo(skb)->frags[i->nextfrag]);
+		i->len = skb_shinfo(skb)->frags[i->nextfrag].size;
+		i->nextfrag++;
+		return 1;
+	}
+
+	i->fraglist = skb_shinfo(skb)->frag_list;
+	if (i->fraglist)
+		goto fraglist;
+
+end:
+	/* Bug trap for callers */
+	i->data = NULL;
+	return 0;
+}
+
+void skb_iter_abort(const struct sk_buff *skb, struct skb_iter *i)
+{
+	/* Unmap previous, if not head fragment. */
+	if (i->data && i->nextfrag)
+		kunmap_skb_frag(i->data);
+	/* Bug trap for callers */
+	i->data = NULL;
+}
+
 /* Checksum skb data. */
 
 unsigned int skb_checksum(const struct sk_buff *skb, int offset,
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .4882-linux-2.6.7-bk20/net/ipv4/ip_fragment.c .4882-linux-2.6.7-bk20.updated/net/ipv4/ip_fragment.c
--- .4882-linux-2.6.7-bk20/net/ipv4/ip_fragment.c	2004-06-17 08:49:53.000000000 +1000
+++ .4882-linux-2.6.7-bk20.updated/net/ipv4/ip_fragment.c	2004-07-09 15:28:48.000000000 +1000
@@ -399,8 +399,81 @@ static inline struct ipq *ip_find(struct
 	return ip_frag_create(hash, iph);
 }
 
-/* Add new segment to existing queue. */
-static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+static int skb_data_equal(const struct sk_buff *new, int startnew,
+			  const struct sk_buff *old, int startold,
+			  int len)
+{
+	struct skb_iter newi, oldi;
+	int ret = 1;
+
+	/* Move to first chunk with this offset in both cases */
+	skb_iter_first(new, &newi);
+	while (newi.len < startnew) {
+		startnew -= newi.len;
+		skb_iter_next(new, &newi);
+	}
+
+	skb_iter_first(old, &oldi);
+	while (oldi.len < startold) {
+		startold -= oldi.len;
+		skb_iter_next(old, &oldi);
+	}
+
+	while (len > 0) {
+		int cmplen = len;
+
+		/* How much can we compare? */
+		if (cmplen > oldi.len - startold)
+			cmplen = oldi.len - startold;
+		if (cmplen > newi.len - startnew)
+			cmplen = newi.len - startnew;
+		if (memcmp(oldi.data+startold, newi.data+startnew, cmplen)) {
+			ret = 0;
+			break;
+		}
+		startnew += cmplen;
+		startold += cmplen;
+		if (startold == oldi.len) {
+			skb_iter_next(old, &oldi);
+			startold = 0;
+		}
+		if (startnew == newi.len) {
+			skb_iter_next(new, &newi);
+			startnew = 0;
+		}
+		len -= cmplen;
+	}
+
+	skb_iter_abort(new, &newi);
+	skb_iter_abort(old, &oldi);
+	return ret;
+}
+
+static int frag_overlap_mismatch(const struct sk_buff *new,
+				 int offset,
+				 const struct sk_buff *old)
+{
+	int old_offset = FRAG_CB(old)->offset;
+	int startnew, startold, len;
+
+	if (offset < old_offset) {
+		startnew = old_offset - offset;
+		startold = 0;
+	} else {
+		startnew = 0;
+		startold = offset - old_offset;
+	}
+
+	len = min(old->len - startold, new->len - startnew);
+	if (len < 0)
+		return 0;
+
+	return !skb_data_equal(new, startnew, old, startold, len);
+}
+
+/* Add new segment to existing queue.  Return false if whole queue
+ * must drop. */
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
 	struct sk_buff *prev, *next;
 	int flags, offset;
@@ -471,6 +544,8 @@ static void ip_frag_queue(struct ipq *qp
 			offset += i;
 			if (end <= offset)
 				goto err;
+			if (frag_overlap_mismatch(skb, offset, prev))
+				goto mismatch;
 			if (!pskb_pull(skb, i))
 				goto err;
 			if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -481,6 +556,9 @@ static void ip_frag_queue(struct ipq *qp
 	while (next && FRAG_CB(next)->offset < end) {
 		int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
 
+		if (frag_overlap_mismatch(skb, offset, next))
+			goto mismatch;
+
 		if (i < next->len) {
 			/* Eat head of the next overlapped fragment
 			 * and leave the loop. The next ones cannot overlap.
@@ -532,10 +610,17 @@ static void ip_frag_queue(struct ipq *qp
 	list_move_tail(&qp->lru_list, &ipq_lru_list);
 	write_unlock(&ipfrag_lock);
 
-	return;
+	return 1;
 
 err:
 	kfree_skb(skb);
+	return 1;
+
+mismatch:
+	/* Roughly equiv. to checksum incorrect. */
+	ipq_kill(qp);
+	kfree_skb(skb);
+	return 0;
 }
 
 
@@ -650,12 +735,13 @@ struct sk_buff *ip_defrag(struct sk_buff
 
 		spin_lock(&qp->lock);
 
-		ip_frag_queue(qp, skb);
-
-		if (qp->last_in == (FIRST_IN|LAST_IN) &&
-		    qp->meat == qp->len)
-			ret = ip_frag_reasm(qp, dev);
-
+		if (!ip_frag_queue(qp, skb))
+			ipq_kill(qp);
+		else {
+			if (qp->last_in == (FIRST_IN|LAST_IN) &&
+			    qp->meat == qp->len)
+				ret = ip_frag_reasm(qp, dev);
+		}
 		spin_unlock(&qp->lock);
 		ipq_put(qp);
 		return ret;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .4882-linux-2.6.7-bk20/net/ipv4/ip_output.c .4882-linux-2.6.7-bk20.updated/net/ipv4/ip_output.c
--- .4882-linux-2.6.7-bk20/net/ipv4/ip_output.c	2004-07-08 15:10:12.000000000 +1000
+++ .4882-linux-2.6.7-bk20.updated/net/ipv4/ip_output.c	2004-07-10 09:44:49.000000000 +1000
@@ -582,20 +582,33 @@ slow_path:
 	offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
 	not_last_frag = iph->frag_off & htons(IP_MF);
 
+	len = left;
+	/* IF: it doesn't fit, use 'mtu' - the data space left */
+	if (len > mtu)
+		len = mtu;
+
+	/* IF: we are not sending upto and including the packet end
+	   then align the next start on an eight byte boundary */
+	if (len < left)
+		len &= ~7;
+
+	/* Try to shift initial fragment boundary if we can, to help
+	 * other end detect ID wrap. */
+	if (skb->sk) {
+		unsigned int slack;
+		struct inet_opt *inet = inet_sk(skb->sk);
+
+		slack = (left % mtu);
+		if (slack)
+			/* Shift by 8 bytes per id wrap. */
+			len = mtu - (slack % ((inet->id >> 16) << 3));
+	}
+
 	/*
 	 *	Keep copying data until we run out.
 	 */
 
 	while(left > 0)	{
-		len = left;
-		/* IF: it doesn't fit, use 'mtu' - the data space left */
-		if (len > mtu)
-			len = mtu;
-		/* IF: we are not sending upto and including the packet end
-		   then align the next start on an eight byte boundary */
-		if (len < left)	{
-			len &= ~7;
-		}
 		/*
 		 *	Allocate buffer.
 		 */
@@ -674,6 +687,16 @@ slow_path:
 		err = output(skb2);
 		if (err)
 			goto fail;
+
+		len = left;
+		/* IF: it doesn't fit, use 'mtu' - the data space left */
+		if (len > mtu)
+			len = mtu;
+		/* IF: we are not sending upto and including the packet end
+		   then align the next start on an eight byte boundary */
+		if (len < left)	{
+			len &= ~7;
+		}
 	}
 	kfree_skb(skb);
 	IP_INC_STATS(FragOKs);


* Re: Fragment ID wrap workaround (read-only, untested).
  2004-07-15  5:57 Rusty Russell (IBM)
@ 2004-07-15  8:28 ` David Stevens
  2004-07-15  9:27   ` Andi Kleen
  0 siblings, 1 reply; 11+ messages in thread
From: David Stevens @ 2004-07-15  8:28 UTC (permalink / raw)
  To: Rusty Russell (IBM); +Cc: netdev

Rusty,
        Those ideas should work if both sides are Linux, but not when
mixing with something else. A non-Linux receiver generally won't detect
the wrap and drop the packet, even if the fragments overlap and don't
match, and a non-Linux sender will (typically) send same-sized fragments
that aren't offset. It doesn't hurt anything in those cases, so maybe,
in addition:

My idea for solving this problem is to create an estimator for the
reassembly timer. The fundamental problem, as I see it, is that the
reassembly timer is fixed, and about 6000 times too long on a fast
network (and it gets worse the faster networks get).

In the typical case, you'll receive lots of successfully reassembled
packets from the same destination and, from those, you can build a good
estimate of how long it takes to receive all the fragments when you do
receive them all. The reassembly timeout ought to be a little longer
than that estimate, which might be a fraction of a millisecond on a
local link, and maybe tens of seconds going across the Internet. Then
you can time out and release the frags based on the expected behavior,
rather than waiting so long that the other side has time to wrap while
you still have a valid frag queue.

It also has the advantage that, on a faster link, a higher loss rate
doesn't have you wasting memory holding fragments that aren't
ever going to be reassembled successfully, anyway.

The estimator could be very much like the TCP rtt estimator, and
could go in the routing table (not so good for asymmetric paths),
the fib, or its own little cache w/ timeout maintained by the reassembly
code.
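
Something along these lines, purely as a sketch of the shape of the
idea: the structure, field names and the x10 safety factor are invented
here; IP_FRAG_TIME is the kernel's existing fixed default.

struct reasm_estimator {
	unsigned long srtt;	/* smoothed completion time, in jiffies */
};

/* Feed in the measured time whenever a datagram reassembles successfully. */
static void reasm_est_update(struct reasm_estimator *est, unsigned long sample)
{
	if (!est->srtt)
		est->srtt = sample;
	else			/* EWMA with gain 1/8, like TCP's RTT filter */
		est->srtt += ((long)sample - (long)est->srtt) / 8;
}

/* Timeout: a generous multiple of the estimate; fall back to the fixed
 * default until we have a sample for this destination. */
static unsigned long reasm_timeout(const struct reasm_estimator *est)
{
	return est->srtt ? est->srtt * 10 : IP_FRAG_TIME;
}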

There are some problems with this scheme, too, but I think it fixes
most bad behavior on the receiver side with unmodified senders.

Comments?
                                        +-DLS


* Re: Fragment ID wrap workaround (read-only, untested).
  2004-07-15  8:28 ` David Stevens
@ 2004-07-15  9:27   ` Andi Kleen
  2004-07-15 14:49     ` David Stevens
  0 siblings, 1 reply; 11+ messages in thread
From: Andi Kleen @ 2004-07-15  9:27 UTC (permalink / raw)
  To: David Stevens; +Cc: Rusty Russell (IBM), netdev

On Thu, Jul 15, 2004 at 02:28:05AM -0600, David Stevens wrote:
> My idea for solving this problem is to create an estimator for the
> reassembly timer. The fundamental problem as I see it is that the
> reassembly timer is fixed, and about 6000 times too long on a
> fast network (and worse the faster networks get).

Won't that make the worst case behaviour on a congested link much worse?

e.g. consider a very congested link with variable RTTs. Or a 
link that works relatively smoothly and suddenly the RTT increases.

Yes, running fragmentation over those is not a good idea, but
still it should not be made worse.

Your variable timer, even with a smoothing algorithm in the RTT
estimator, will expire far too early and very likely drop a lot more
fragments in this scenario than before.

In general handling a link where the RTT increases would seem
tricky with your scheme. Unlike TCP there is no retransmit
to save the day.

-Andi


* Re: Fragment ID wrap workaround (read-only, untested).
  2004-07-15  9:27   ` Andi Kleen
@ 2004-07-15 14:49     ` David Stevens
  2004-07-15 16:24       ` John Heffner
                         ` (2 more replies)
  0 siblings, 3 replies; 11+ messages in thread
From: David Stevens @ 2004-07-15 14:49 UTC (permalink / raw)
  To: Andi Kleen; +Cc: netdev, Rusty Russell (IBM)

Andi Kleen <ak@suse.de> wrote on 07/15/2004 02:27:17 AM:

> Won't that make the worst case behaviour on a congested link much worse?

> e.g. consider a very congested link with variable RTTs. Or a
> link that works relatively smoothly and suddenly the RTT increases.

I know what you mean here, but just to be precise, this isn't an RTT
estimator, but an estimator for the time to receive a complete set
of fragments. And, of course, it'd need to be scaled to a (potential)
max-sized packet, since the number of fragments isn't known in advance,
and could be larger. Better to multiply the time-out for a 4K reassembly
by 16, in case you get a 64K datagram next.
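
In code terms the scaling might look something like this (names are
invented for illustration; 65535 is the maximum IP datagram size):

/* Scale a completion-time estimate measured on 'observed_bytes'-sized
 * datagrams up to the worst case, a maximal 64K datagram. */
static unsigned long scale_to_max_datagram(unsigned long est_jiffies,
					   unsigned int observed_bytes)
{
	return est_jiffies * 65535 / observed_bytes;	/* e.g. 4K -> x16 */
}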

> Yes, running fragmentation over those is not a good idea, but
> still it should not be made worse.

Delivery of incorrect data to the user is the problem, and, no, it doesn't
make that worse. :-) The scenario, to make it clear for everyone, is that
a small loss rate on a fast network leads to reassembling fragments that
share an IP ID but are not from the same packet, because the ID wraps
before the frag queue timer has expired. If you're blasting away on a
gigabit network (or faster) and you drop one fragment (or more) from a
packet you've received, that frag queue will still be there 65536 packets
later when you reuse the same ID for a different packet. I think that
works out to be 7 secs or so at full rate-- well within the 1-4 minute
typical frag queue timer on most systems. When the second packet arrives,
if it's big enough that its fragments cover the missing offsets, reassembly
will use them. So, 100% of the time when you send same-sized packets, as
NFS mostly does, and lose one fragment, you'll reassemble garbage when the
IP ID wraps (well before the frag queue expires). And the checksum will
still pass, on average, about 1/64K of the time. If you send at full rate
and drop, say, 100 frags a second, it doesn't take too long to get a
Frankenpacket-- reassembled from parts of others. :-)
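
A quick sanity check of that figure, as a standalone user-space sketch;
the ~13KB datagram size is an assumption (roughly an NFS read/write plus
headers), and the exact numbers shift with it:

#include <stdio.h>

int main(void)
{
	const double link_bps = 1e9;	  /* 1 Gbit/s line rate        */
	const double dgram_bytes = 13000; /* assumed UDP datagram size */
	double secs = 65536.0 * dgram_bytes * 8.0 / link_bps;

	/* Prints just under 7 seconds for these numbers: the 16-bit ID
	 * space is exhausted long before typical reassembly timers
	 * (tens of seconds to minutes) fire. */
	printf("ID space wraps after ~%.1f seconds at line rate\n", secs);
	return 0;
}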

That's the problem the timer idea is trying to solve, and a higher loss
rate here is acceptable-- the checksum only fails to catch the problem
1/64K of the time, so you probably have a relatively high loss rate to
start with when it's occurring.

> Your variable timer even with a smoothing algorithm in the RTT
> estimator will expire far too early and very likely drop a lot more
> fragments in this scenario than before.

Not necessarily, because it doesn't have to be a "near" estimate at all,
the way TCP tries to make it. It can solve the problem by taking a close
estimate of the actual time and then using a frag timeout that's 10 times
bigger. As long as the frag timeout isn't thousands of times too large
(as it is now), IP ID wrap can't happen before you dump the frag queue--
which is the whole point.

> In general handling a link where the RTT increases would seem
> tricky with your scheme. Unlike TCP there is no retransmit
> to save the day.

In the particular case (NFS over UDP), there is both a retransmit (done
by RPC) and a significant loss rate to start with. As long as the time-out
is conservative, I don't think this has to affect other cases
significantly.

                                                +-DLS


* Re: Fragment ID wrap workaround (read-only, untested).
  2004-07-15 14:49     ` David Stevens
@ 2004-07-15 16:24       ` John Heffner
  2004-07-15 16:27       ` Andi Kleen
  2004-07-27 12:38       ` Olaf Kirch
  2 siblings, 0 replies; 11+ messages in thread
From: John Heffner @ 2004-07-15 16:24 UTC (permalink / raw)
  To: David Stevens; +Cc: Andi Kleen, netdev, Rusty Russell (IBM)

FYI, we have an informational Internet-Draft documenting this problem.
It can currently be found at:

<http://www.ietf.org/internet-drafts/draft-mathis-frag-harmful-00.txt>.

  -John


* Re: Fragment ID wrap workaround (read-only, untested).
  2004-07-15 14:49     ` David Stevens
  2004-07-15 16:24       ` John Heffner
@ 2004-07-15 16:27       ` Andi Kleen
  2004-07-15 16:54         ` David Stevens
  2004-07-27 12:38       ` Olaf Kirch
  2 siblings, 1 reply; 11+ messages in thread
From: Andi Kleen @ 2004-07-15 16:27 UTC (permalink / raw)
  To: David Stevens; +Cc: netdev, rusty

On Thu, 15 Jul 2004 07:49:13 -0700
David Stevens <dlstevens@us.ibm.com> wrote:

> 
> > Yes, running fragmentation over those is not a good idea, but
> > still it should not be made worse.
> 
> Delivery to the user of incorrect data is the problem, and, no, it doesn't
> make that worse. :-) The scenario, to make it clear for everyone, is a

The data corruption problem only starts to become a real issue at
gigabit speeds and faster. Normally, such congested links are much slower.

> small loss rate on a fast network leads to reassembling packets with the
> same IP ID that are not the same packet when the ID wraps before the frag
> queue timer has expired. If you're blasting away on a gigabit network (or
> faster) and you drop one fragment (or more) from a packet you've received,
> that frag queue will be there 65536 packets later when you reuse the same ID
> for a different packet. I think that works out to be 7 secs or so at full
> rate-- well within the 1-4 minute typical frag queue timer on most systems.

[...]

I'm well aware of the Gigabit+ NFS problem. My standard suggestion to solve
it is to just get rid of NFS over UDP - it always was a bad idea.

My point was just that you're concentrating only on that case, while
potentially causing more problems for slow links. The stack has to work
well for both slow and fast links, though.


> 
> > In general handling a link where the RTT increases would seem
> > tricky with your scheme. Unlike TCP there is no retransmit
> > to save the day.
> 
> In the particular case (NFS over UDP), there is both a retransmit (done
> by RPC) and significant loss rate to start with. As long as the time-out
> is conservative, I don't think this has to affect other cases
> significantly.

NFS over UDP is just a bad idea. Don't do it. NFS over TCP
works fine these days and should be the preferred choice for everybody.

I don't really see the point of risking problems with slower links
just to fix a fundamentally flawed protocol.

And you cannot rely on all UDP-based protocols retransmitting like that,
either.

-Andi


* Re: Fragment ID wrap workaround (read-only, untested).
  2004-07-15 16:27       ` Andi Kleen
@ 2004-07-15 16:54         ` David Stevens
  2004-07-15 17:02           ` Andi Kleen
  0 siblings, 1 reply; 11+ messages in thread
From: David Stevens @ 2004-07-15 16:54 UTC (permalink / raw)
  To: Andi Kleen; +Cc: netdev, rusty

Andi Kleen wrote on 07/15/2004 09:27:35 AM:

> I'm well aware of the Gigabit+ NFS problem. My standard suggestion to solve
> it is to just get rid of NFS over UDP - it always was a bad idea.

No disagreement here.

> My point was just that you're concentrating on that one only,
> but you're potentially causing more problems for slow links.
> The stack has to work well both for slow and fast links though.

That's the problem it's intended to solve, but of course any
actual solution would have to behave well for all links; that's
in fact the whole reason to have a dynamic frag queue timeout
instead of a fixed one for all links. As long as the timeout is
conservative without being so enormously long that it allows
IP ID wrap, it shouldn't affect any existing use at all.

If the timeout were scaled to be only 10 (or 100!) times any reasonable
expectation of success rather than thousands or tens of thousands
of times too large, the problem wouldn't exist on fast links, and
slow links would behave exactly as they do now.

I agree that NFS over UDP should be dead as soon as possible,
and fragmentation in general not far behind it. They aren't quite
dead yet; until they are, why not make them better behaved? And
if your argument is that it isn't worth fixing because it isn't
used, then of course the argument that it'll break slow links to
change it doesn't fly. Sites obviously have fast links where this
can break, and done correctly, this shouldn't affect slow links
at all. People who don't use NFS over UDP aren't affected by it
either way, right? :-)
                                                        +-DLS


* Re: Fragment ID wrap workaround (read-only, untested).
  2004-07-15 16:54         ` David Stevens
@ 2004-07-15 17:02           ` Andi Kleen
  0 siblings, 0 replies; 11+ messages in thread
From: Andi Kleen @ 2004-07-15 17:02 UTC (permalink / raw)
  To: David Stevens; +Cc: netdev, rusty

On Thu, 15 Jul 2004 09:54:53 -0700
David Stevens <dlstevens@us.ibm.com> wrote:


> I agree that NFS over UDP should be dead as soon as possible,
> and fragmentation in general not far behind it. They aren't quite
> dead yet; until they are, why not make them better behaved? And
> if your argument is that it isn't worth fixing because it isn't

I wouldn't go that far; just make extremely sure that any
solution works on slow links too. The problem I see is that
if you make the delay factor long enough that extremely
variable links don't regress, you risk making the wrap on
very fast links likely again.

-Andi


* Re: Fragment ID wrap workaround (read-only, untested).
  2004-07-15  6:36 Fragment ID wrap workaround (read-only, untested) Rusty Russell (IBM)
@ 2004-07-15 17:34 ` Andi Kleen
  0 siblings, 0 replies; 11+ messages in thread
From: Andi Kleen @ 2004-07-15 17:34 UTC (permalink / raw)
  To: Rusty Russell (IBM); +Cc: netdev

> +	 * other end detect ID wrap. */
> +	if (skb->sk) {
> +		unsigned int slack;
> +		struct inet_opt *inet = inet_sk(skb->sk);
> +
> +		slack = (left % mtu);
> +		if (slack)
> +			/* Shift by 8 bytes per id wrap. */
> +			len = mtu - (slack % ((inet->id >> 16) << 3));

I'm pretty sure inet->id is wrong here. You want the counter from the
inet peer entry. inet_sk(sk)->id is just used for the pseudo counter
in TCP that only works around VJ compression bugs. It's never used
for real fragmentation.

-Andi


* Re: Fragment ID wrap workaround (read-only, untested).
  2004-07-15 14:49     ` David Stevens
  2004-07-15 16:24       ` John Heffner
  2004-07-15 16:27       ` Andi Kleen
@ 2004-07-27 12:38       ` Olaf Kirch
  2 siblings, 0 replies; 11+ messages in thread
From: Olaf Kirch @ 2004-07-27 12:38 UTC (permalink / raw)
  To: David Stevens; +Cc: Andi Kleen, netdev, Rusty Russell (IBM)

> you'll reassemble garbage when the IP ID wraps (well before the frag queue
> expires). And the checksum will pass anyway on average about 1/64K of the
> time. If you send at full rate and drop, say, 100 frags a second, it
> doesn't take too long to get a Frankenpacket-- reassembled from parts of
> others. :-)

In the scenarios we were looking at, packet loss rate was fairly low.
What compounded the problem was that the NFS payload wasn't very varied,
so the UDP checksum distribution was far from even.

When we looked into the problem, we considered implementing a per-route
parameter letting the admin set lower reassembly timeouts. I think this
is a solution that both addresses the problem and does not interfere
with WAN traffic. The user-space tools could even select reasonable
defaults based on the hardware type when setting up the device.

(We did not implement this because we decided to go for NFS over TCP by
default instead).
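
A sketch of how such a per-route override might plug in.
RTAX_REASM_TIMEOUT is hypothetical, invented here purely to illustrate
the idea; sysctl_ipfrag_time is the existing global default in
net/ipv4/ip_fragment.c, and the dst_metric() helper from
include/net/dst.h is assumed.

/* Hypothetical: prefer a per-route reassembly timeout, if the admin
 * configured one, over the global sysctl. */
static unsigned long ipq_timeout(struct dst_entry *dst)
{
	unsigned long t = dst ? dst_metric(dst, RTAX_REASM_TIMEOUT) : 0;

	return t ? t : sysctl_ipfrag_time;
}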

> > In general handling a link where the RTT increases would seem
> > tricky with your scheme. Unlike TCP there is no retransmit
> > to save the day.
> 
> In the particular case (NFS over UDP), there is both a retransmit (done
> by RPC) and significant loss rate to start with. As long as the time-out
> is conservative, I don't think this has to affect other cases
> significantly.

NFS isn't the only application making heavy use of UDP.  Video and
audio do so too, and those don't have retransmits. Granted, they should
choose a packet size that is below the path MTU, but not all applications
always do.

IMO an estimator such as you describe would need to be very sensitive
to jitter in fragment latencies, and it may be fairly hard to find a
solution that works from 802.11 up to 10GE. A per-route reassembly
timeout is probably a lot less of a headache.

Olaf
-- 
Olaf Kirch     |  The Hardware Gods hate me.
okir@suse.de   |
---------------+ 


end of thread

Thread overview: 11+ messages
2004-07-15  6:36 Fragment ID wrap workaround (read-only, untested) Rusty Russell (IBM)
2004-07-15 17:34 ` Andi Kleen
  -- strict thread matches above, loose matches on Subject: below --
2004-07-15  5:57 Rusty Russell (IBM)
2004-07-15  8:28 ` David Stevens
2004-07-15  9:27   ` Andi Kleen
2004-07-15 14:49     ` David Stevens
2004-07-15 16:24       ` John Heffner
2004-07-15 16:27       ` Andi Kleen
2004-07-15 16:54         ` David Stevens
2004-07-15 17:02           ` Andi Kleen
2004-07-27 12:38       ` Olaf Kirch
