From: "Rusty Russell (IBM)" <rusty@au1.ibm.com>
To: netdev@oss.sgi.com
Subject: Fragment ID wrap workaround (read-only, untested).
Date: Thu, 15 Jul 2004 15:57:58 +1000 [thread overview]
Message-ID: <1089871078.3571.56.camel@bach> (raw)
Hi all,
I spoke about this today, thought I'd send the code out. Useful only
for reading, as it's entirely untested and some is tricky and needs
careful thinking.
Name: Fragment ID Wrap Workaround
Status: Untested
Signed-off-by: Rusty Russell <rusty@au.ibm.com> (authored)
There's at least one old IBM Bugzilla bug, in which fragment IDs
wrapped, causing NFS data corruption on UDP stress testing.
Solution presented here is twofold:
1) Move the offset of the fragments every time the ID wraps (usually
the packet doesn't fit exactly into the MTU, so we have some
slack), and
2) Check overlapping fragments that the contents match: if not, drop
the whole thing.
Note that I also implemented skb_iter functions, so I could compare
the fragment overlap efficiently: really should be a separate patch.
DaveM points out (FIXME) that doing the double walk means we need to
guarantee two kmaps for the networking code.
Also applies to IPv6. Simpler implementation would just drop all
fragments on any overlap as a "doesn't happen IRL" case (it needs
someone to duplicate a packet, then send each one by a different MTU
path).
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .4882-linux-2.6.7-bk20/include/linux/ip.h .4882-linux-2.6.7-bk20.updated/include/linux/ip.h
--- .4882-linux-2.6.7-bk20/include/linux/ip.h 2004-07-08 15:10:10.000000000 +1000
+++ .4882-linux-2.6.7-bk20.updated/include/linux/ip.h 2004-07-09 13:08:42.000000000 +1000
@@ -118,12 +118,12 @@ struct inet_opt {
int tos; /* TOS */
unsigned cmsg_flags;
struct ip_options *opt;
+ __u32 id; /* ID counter for DF pkts */
__u16 sport; /* Source port */
unsigned char hdrincl; /* Include headers ? */
__u8 mc_ttl; /* Multicasting TTL */
__u8 mc_loop; /* Loopback */
__u8 pmtudisc;
- __u16 id; /* ID counter for DF pkts */
unsigned recverr : 1,
freebind : 1;
int mc_index; /* Multicast device index */
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .4882-linux-2.6.7-bk20/include/linux/skbuff.h .4882-linux-2.6.7-bk20.updated/include/linux/skbuff.h
--- .4882-linux-2.6.7-bk20/include/linux/skbuff.h 2004-07-08 15:10:11.000000000 +1000
+++ .4882-linux-2.6.7-bk20.updated/include/linux/skbuff.h 2004-07-09 14:31:11.000000000 +1000
@@ -1108,6 +1108,23 @@ extern void skb_split(struct sk_b
extern void skb_init(void);
extern void skb_add_mtu(int mtu);
+struct skb_iter
+{
+ /* Iteration functions set these */
+ unsigned char *data;
+ unsigned int len;
+
+ /* Private to iteration */
+ unsigned int nextfrag;
+ struct sk_buff *fraglist;
+};
+
+/* Keep iterating until skb_iter_next returns false. */
+extern void skb_iter_first(const struct sk_buff *skb, struct skb_iter *i);
+extern int skb_iter_next(const struct sk_buff *skb, struct skb_iter *i);
+/* Call this if aborting loop before !skb_iter_next */
+extern void skb_iter_abort(const struct sk_buff *skb, struct skb_iter *i);
+
#ifdef CONFIG_NETFILTER
static inline void nf_conntrack_put(struct nf_ct_info *nfct)
{
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .4882-linux-2.6.7-bk20/net/core/skbuff.c .4882-linux-2.6.7-bk20.updated/net/core/skbuff.c
--- .4882-linux-2.6.7-bk20/net/core/skbuff.c 2004-07-08 15:10:12.000000000 +1000
+++ .4882-linux-2.6.7-bk20.updated/net/core/skbuff.c 2004-07-09 14:35:28.000000000 +1000
@@ -929,6 +929,70 @@ fault:
return -EFAULT;
}
+/* Keep iterating until skb_iter_next returns false. */
+void skb_iter_first(const struct sk_buff *skb, struct skb_iter *i)
+{
+ i->len = skb_headlen(skb);
+ i->data = (unsigned char *)skb->data;
+ i->nextfrag = 0;
+ i->fraglist = NULL;
+}
+
+int skb_iter_next(const struct sk_buff *skb, struct skb_iter *i)
+{
+ /* Unmap previous, if not head fragment. */
+ if (i->nextfrag)
+ kunmap_skb_frag(i->data);
+
+ if (i->fraglist) {
+ fraglist:
+ /* We're iterating through fraglist. */
+ if (i->nextfrag < skb_shinfo(i->fraglist)->nr_frags) {
+ i->data = kmap_skb_frag(&skb_shinfo(i->fraglist)
+ ->frags[i->nextfrag]);
+ i->len = skb_shinfo(i->fraglist)->frags[i->nextfrag]
+ .size;
+ i->nextfrag++;
+ return 1;
+ }
+ /* Fragments with fragments? Too hard! */
+ BUG_ON(skb_shinfo(i->fraglist)->frag_list);
+ i->fraglist = i->fraglist->next;
+ if (!i->fraglist)
+ goto end;
+
+ i->len = skb_headlen(i->fraglist);
+ i->data = i->fraglist->data;
+ i->nextfrag = 0;
+ return 1;
+ }
+
+ if (i->nextfrag < skb_shinfo(skb)->nr_frags) {
+ i->data = kmap_skb_frag(&skb_shinfo(skb)->frags[i->nextfrag]);
+ i->len = skb_shinfo(skb)->frags[i->nextfrag].size;
+ i->nextfrag++;
+ return 1;
+ }
+
+ i->fraglist = skb_shinfo(skb)->frag_list;
+ if (i->fraglist)
+ goto fraglist;
+
+end:
+ /* Bug trap for callers */
+ i->data = NULL;
+ return 0;
+}
+
+void skb_iter_abort(const struct sk_buff *skb, struct skb_iter *i)
+{
+ /* Unmap previous, if not head fragment. */
+ if (i->data && i->nextfrag)
+ kunmap_skb_frag(i->data);
+ /* Bug trap for callers */
+ i->data = NULL;
+}
+
/* Checksum skb data. */
unsigned int skb_checksum(const struct sk_buff *skb, int offset,
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .4882-linux-2.6.7-bk20/net/ipv4/ip_fragment.c .4882-linux-2.6.7-bk20.updated/net/ipv4/ip_fragment.c
--- .4882-linux-2.6.7-bk20/net/ipv4/ip_fragment.c 2004-06-17 08:49:53.000000000 +1000
+++ .4882-linux-2.6.7-bk20.updated/net/ipv4/ip_fragment.c 2004-07-09 15:28:48.000000000 +1000
@@ -399,8 +399,81 @@ static inline struct ipq *ip_find(struct
return ip_frag_create(hash, iph);
}
-/* Add new segment to existing queue. */
-static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
+static int skb_data_equal(const struct sk_buff *new, int startnew,
+ const struct sk_buff *old, int startold,
+ int len)
+{
+ struct skb_iter newi, oldi;
+ int ret = 1;
+
+ /* Move to first chunk with this offset in both cases */
+ skb_iter_first(new, &newi);
+ while (newi.len < startnew) {
+ startnew -= newi.len;
+ skb_iter_next(new, &newi);
+ }
+
+ skb_iter_first(old, &oldi);
+ while (oldi.len < startold) {
+ startold -= oldi.len;
+ skb_iter_next(old, &oldi);
+ }
+
+ while (len > 0) {
+ int cmplen = len;
+
+ /* How much can we compare? */
+ if (cmplen > oldi.len - startold)
+ cmplen = oldi.len - startold;
+ if (cmplen > newi.len - startnew)
+ cmplen = newi.len - startnew;
+ if (memcmp(oldi.data+startold, newi.data+startnew, cmplen)) {
+ ret = 0;
+ break;
+ }
+ startnew += cmplen;
+ startold += cmplen;
+ if (startold == oldi.len) {
+ skb_iter_next(old, &oldi);
+ startold = 0;
+ }
+ if (startnew == newi.len) {
+ skb_iter_next(new, &newi);
+ startnew = 0;
+ }
+ len -= cmplen;
+ }
+
+ skb_iter_abort(new, &newi);
+ skb_iter_abort(old, &oldi);
+ return ret;
+}
+
+static int frag_overlap_mismatch(const struct sk_buff *new,
+ int offset,
+ const struct sk_buff *old)
+{
+ int old_offset = FRAG_CB(old)->offset;
+ int startnew, startold, len;
+
+ if (offset < old_offset) {
+ startnew = old_offset - offset;
+ startold = 0;
+ } else {
+ startnew = 0;
+ startold = offset - old_offset;
+ }
+
+ len = min(old->len - startold, new->len - startnew);
+ if (len < 0)
+ return 0;
+
+ return !skb_data_equal(new, startnew, old, startold, len);
+}
+
+/* Add new segment to existing queue. Return false if whole queue
+ * must drop. */
+static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{
struct sk_buff *prev, *next;
int flags, offset;
@@ -471,6 +544,8 @@ static void ip_frag_queue(struct ipq *qp
offset += i;
if (end <= offset)
goto err;
+ if (frag_overlap_mismatch(skb, offset, prev))
+ goto mismatch;
if (!pskb_pull(skb, i))
goto err;
if (skb->ip_summed != CHECKSUM_UNNECESSARY)
@@ -481,6 +556,9 @@ static void ip_frag_queue(struct ipq *qp
while (next && FRAG_CB(next)->offset < end) {
int i = end - FRAG_CB(next)->offset; /* overlap is 'i' bytes */
+ if (frag_overlap_mismatch(skb, offset, next))
+ goto mismatch;
+
if (i < next->len) {
/* Eat head of the next overlapped fragment
* and leave the loop. The next ones cannot overlap.
@@ -532,10 +610,17 @@ static void ip_frag_queue(struct ipq *qp
list_move_tail(&qp->lru_list, &ipq_lru_list);
write_unlock(&ipfrag_lock);
- return;
+ return 1;
err:
kfree_skb(skb);
+ return 1;
+
+mismatch:
+ /* Roughly equiv. to checksum incorrect. */
+ ipq_kill(qp);
+ kfree_skb(skb);
+ return 0;
}
@@ -650,12 +735,13 @@ struct sk_buff *ip_defrag(struct sk_buff
spin_lock(&qp->lock);
- ip_frag_queue(qp, skb);
-
- if (qp->last_in == (FIRST_IN|LAST_IN) &&
- qp->meat == qp->len)
- ret = ip_frag_reasm(qp, dev);
-
+ if (!ip_frag_queue(qp, skb))
+ ipq_kill(qp);
+ else {
+ if (qp->last_in == (FIRST_IN|LAST_IN) &&
+ qp->meat == qp->len)
+ ret = ip_frag_reasm(qp, dev);
+ }
spin_unlock(&qp->lock);
ipq_put(qp);
return ret;
diff -urpN --exclude TAGS -X /home/rusty/devel/kernel/kernel-patches/current-dontdiff --minimal .4882-linux-2.6.7-bk20/net/ipv4/ip_output.c .4882-linux-2.6.7-bk20.updated/net/ipv4/ip_output.c
--- .4882-linux-2.6.7-bk20/net/ipv4/ip_output.c 2004-07-08 15:10:12.000000000 +1000
+++ .4882-linux-2.6.7-bk20.updated/net/ipv4/ip_output.c 2004-07-10 09:44:49.000000000 +1000
@@ -582,20 +582,33 @@ slow_path:
offset = (ntohs(iph->frag_off) & IP_OFFSET) << 3;
not_last_frag = iph->frag_off & htons(IP_MF);
+ len = left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > mtu)
+ len = mtu;
+
+ /* IF: we are not sending upto and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < left)
+ len &= ~7;
+
+ /* Try to shift initial fragment boundary if we can, to help
+ * other end detect ID wrap. */
+ if (skb->sk) {
+ unsigned int slack;
+ struct inet_opt *inet = inet_sk(skb->sk);
+
+ slack = (left % mtu);
+ if (slack)
+ /* Shift by 8 bytes per id wrap. */
+ len = mtu - (slack % ((inet->id >> 16) << 3));
+ }
+
/*
* Keep copying data until we run out.
*/
while(left > 0) {
- len = left;
- /* IF: it doesn't fit, use 'mtu' - the data space left */
- if (len > mtu)
- len = mtu;
- /* IF: we are not sending upto and including the packet end
- then align the next start on an eight byte boundary */
- if (len < left) {
- len &= ~7;
- }
/*
* Allocate buffer.
*/
@@ -674,6 +687,16 @@ slow_path:
err = output(skb2);
if (err)
goto fail;
+
+ len = left;
+ /* IF: it doesn't fit, use 'mtu' - the data space left */
+ if (len > mtu)
+ len = mtu;
+ /* IF: we are not sending upto and including the packet end
+ then align the next start on an eight byte boundary */
+ if (len < left) {
+ len &= ~7;
+ }
}
kfree_skb(skb);
IP_INC_STATS(FragOKs);
next reply other threads:[~2004-07-15 5:57 UTC|newest]
Thread overview: 11+ messages / expand[flat|nested] mbox.gz Atom feed top
2004-07-15 5:57 Rusty Russell (IBM) [this message]
2004-07-15 8:28 ` Fragment ID wrap workaround (read-only, untested) David Stevens
2004-07-15 9:27 ` Andi Kleen
2004-07-15 14:49 ` David Stevens
2004-07-15 16:24 ` John Heffner
2004-07-15 16:27 ` Andi Kleen
2004-07-15 16:54 ` David Stevens
2004-07-15 17:02 ` Andi Kleen
2004-07-27 12:38 ` Olaf Kirch
-- strict thread matches above, loose matches on Subject: below --
2004-07-15 6:36 Rusty Russell (IBM)
2004-07-15 17:34 ` Andi Kleen
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1089871078.3571.56.camel@bach \
--to=rusty@au1.ibm.com \
--cc=netdev@oss.sgi.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).