From: Jon Mason <jdmason@us.ibm.com>
To: xen-devel@lists.xensource.com
Subject: [PATCH] Fix NAT for domU checksum offload
Date: Fri, 14 Oct 2005 17:32:33 -0500 [thread overview]
Message-ID: <20051014223233.GC11252@us.ibm.com> (raw)
Below is a fix for the current problem of checksum offload not working
in a NAT'ed network. The cause is the NAT/iptables code incorrectly
modifying the TCP/UDP checksum (for the checksum offload case). The
original code assumes a valid checksum, which is not the case for
checksum offload packets (which has a complimented, partial checksum for
the hardware to use). The fix is to compliment the new address and not
compliment the old address (which is complimented in the partial
checksum), and roll that with the ip_nat_cheat_check function.
There are two "versions" of the patch below. The first version is a
diff to show the actual changes made to the ip_nat_proto_udp.c and
ip_nat_proto_tcp.c file (as it is difficult/impossible to tell from the
second patch). The second version is the one to commit to the tree
(which creates 2 new files in the sparse directory).
Thanks,
Jon
--- ../xen-unstable.hg/linux-2.6.12-xen0/net/ipv4/netfilter/ip_nat_proto_udp.c 2005-06-17 14:48:29.000000000 -0500
+++ linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_udp.c 2005-10-14 15:17:53.000000000 -0500
@@ -112,11 +112,19 @@ udp_manip_pkt(struct sk_buff **pskb,
newport = tuple->dst.u.udp.port;
portptr = &hdr->dest;
}
- if (hdr->check) /* 0 is a special case meaning no checksum */
- hdr->check = ip_nat_cheat_check(~oldip, newip,
+
+ if (hdr->check) { /* 0 is a special case meaning no checksum */
+ if ((*pskb)->proto_csum_blank) {
+ hdr->check = ip_nat_cheat_check(oldip, ~newip,
+ ip_nat_cheat_check(*portptr ^ 0xFFFF,
+ newport, hdr->check));
+ } else {
+ hdr->check = ip_nat_cheat_check(~oldip, newip,
ip_nat_cheat_check(*portptr ^ 0xFFFF,
newport,
hdr->check));
+ }
+ }
*portptr = newport;
return 1;
}
--- ../xen-unstable.hg/linux-2.6.12-xen0/net/ipv4/netfilter/ip_nat_proto_tcp.c 2005-06-17 14:48:29.000000000 -0500
+++ linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_tcp.c 2005-10-14 16:41:20.000000000 -0500
@@ -127,10 +127,16 @@ tcp_manip_pkt(struct sk_buff **pskb,
if (hdrsize < sizeof(*hdr))
return 1;
- hdr->check = ip_nat_cheat_check(~oldip, newip,
+ if ((*pskb)->proto_csum_blank) {
+ hdr->check = ip_nat_cheat_check(oldip, ~newip,
+ ip_nat_cheat_check(oldport ^ 0xFFFF,
+ newport, hdr->check));
+ } else {
+ hdr->check = ip_nat_cheat_check(~oldip, newip,
ip_nat_cheat_check(oldport ^ 0xFFFF,
newport,
hdr->check));
+ }
return 1;
}
Signed-off-by: Jon Mason <jdmason@us.ibm.com>
# HG changeset patch
# User root@pentium4
# Node ID 51129b413cc4dd097b415d89c81965e51bfe3e0e
# Parent b7a71e4db06d0235d67fef66db1e78cd2238c240
Fix NAT for domU checksum offload.
diff -r b7a71e4db06d -r 51129b413cc4 linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_tcp.c
--- /dev/null Tue Oct 11 21:44:55 2005
+++ b/linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_tcp.c Fri Oct 14 21:52:41 2005
@@ -0,0 +1,184 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <linux/if.h>
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+
+static int
+tcp_in_range(const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype,
+ const union ip_conntrack_manip_proto *min,
+ const union ip_conntrack_manip_proto *max)
+{
+ u_int16_t port;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ port = tuple->src.u.tcp.port;
+ else
+ port = tuple->dst.u.tcp.port;
+
+ return ntohs(port) >= ntohs(min->tcp.port)
+ && ntohs(port) <= ntohs(max->tcp.port);
+}
+
+static int
+tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ enum ip_nat_manip_type maniptype,
+ const struct ip_conntrack *conntrack)
+{
+ static u_int16_t port, *portptr;
+ unsigned int range_size, min, i;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ portptr = &tuple->src.u.tcp.port;
+ else
+ portptr = &tuple->dst.u.tcp.port;
+
+ /* If no range specified... */
+ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+ /* If it's dst rewrite, can't change port */
+ if (maniptype == IP_NAT_MANIP_DST)
+ return 0;
+
+ /* Map privileged onto privileged. */
+ if (ntohs(*portptr) < 1024) {
+ /* Loose convention: >> 512 is credential passing */
+ if (ntohs(*portptr)<512) {
+ min = 1;
+ range_size = 511 - min + 1;
+ } else {
+ min = 600;
+ range_size = 1023 - min + 1;
+ }
+ } else {
+ min = 1024;
+ range_size = 65535 - 1024 + 1;
+ }
+ } else {
+ min = ntohs(range->min.tcp.port);
+ range_size = ntohs(range->max.tcp.port) - min + 1;
+ }
+
+ for (i = 0; i < range_size; i++, port++) {
+ *portptr = htons(min + port % range_size);
+ if (!ip_nat_used_tuple(tuple, conntrack)) {
+ return 1;
+ }
+ }
+ return 0;
+}
+
+static int
+tcp_manip_pkt(struct sk_buff **pskb,
+ unsigned int iphdroff,
+ const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype)
+{
+ struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
+ struct tcphdr *hdr;
+ unsigned int hdroff = iphdroff + iph->ihl*4;
+ u32 oldip, newip;
+ u16 *portptr, newport, oldport;
+ int hdrsize = 8; /* TCP connection tracking guarantees this much */
+
+ /* this could be a inner header returned in icmp packet; in such
+ cases we cannot update the checksum field since it is outside of
+ the 8 bytes of transport layer headers we are guaranteed */
+ if ((*pskb)->len >= hdroff + sizeof(struct tcphdr))
+ hdrsize = sizeof(struct tcphdr);
+
+ if (!skb_ip_make_writable(pskb, hdroff + hdrsize))
+ return 0;
+
+ iph = (struct iphdr *)((*pskb)->data + iphdroff);
+ hdr = (struct tcphdr *)((*pskb)->data + hdroff);
+
+ if (maniptype == IP_NAT_MANIP_SRC) {
+ /* Get rid of src ip and src pt */
+ oldip = iph->saddr;
+ newip = tuple->src.ip;
+ newport = tuple->src.u.tcp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst ip and dst pt */
+ oldip = iph->daddr;
+ newip = tuple->dst.ip;
+ newport = tuple->dst.u.tcp.port;
+ portptr = &hdr->dest;
+ }
+
+ oldport = *portptr;
+ *portptr = newport;
+
+ if (hdrsize < sizeof(*hdr))
+ return 1;
+
+ if ((*pskb)->proto_csum_blank) {
+ hdr->check = ip_nat_cheat_check(oldip, ~newip,
+ ip_nat_cheat_check(oldport ^ 0xFFFF,
+ newport, hdr->check));
+ } else {
+ hdr->check = ip_nat_cheat_check(~oldip, newip,
+ ip_nat_cheat_check(oldport ^ 0xFFFF,
+ newport,
+ hdr->check));
+ }
+ return 1;
+}
+
+static unsigned int
+tcp_print(char *buffer,
+ const struct ip_conntrack_tuple *match,
+ const struct ip_conntrack_tuple *mask)
+{
+ unsigned int len = 0;
+
+ if (mask->src.u.tcp.port)
+ len += sprintf(buffer + len, "srcpt=%u ",
+ ntohs(match->src.u.tcp.port));
+
+
+ if (mask->dst.u.tcp.port)
+ len += sprintf(buffer + len, "dstpt=%u ",
+ ntohs(match->dst.u.tcp.port));
+
+ return len;
+}
+
+static unsigned int
+tcp_print_range(char *buffer, const struct ip_nat_range *range)
+{
+ if (range->min.tcp.port != 0 || range->max.tcp.port != 0xFFFF) {
+ if (range->min.tcp.port == range->max.tcp.port)
+ return sprintf(buffer, "port %u ",
+ ntohs(range->min.tcp.port));
+ else
+ return sprintf(buffer, "ports %u-%u ",
+ ntohs(range->min.tcp.port),
+ ntohs(range->max.tcp.port));
+ }
+ else return 0;
+}
+
+struct ip_nat_protocol ip_nat_protocol_tcp
+= { "TCP", IPPROTO_TCP,
+ tcp_manip_pkt,
+ tcp_in_range,
+ tcp_unique_tuple,
+ tcp_print,
+ tcp_print_range
+};
diff -r b7a71e4db06d -r 51129b413cc4 linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_udp.c
--- /dev/null Tue Oct 11 21:44:55 2005
+++ b/linux-2.6-xen-sparse/net/ipv4/netfilter/ip_nat_proto_udp.c Fri Oct 14 21:52:41 2005
@@ -0,0 +1,173 @@
+/* (C) 1999-2001 Paul `Rusty' Russell
+ * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/netfilter.h>
+#include <linux/ip.h>
+#include <linux/udp.h>
+#include <linux/if.h>
+
+#include <linux/netfilter_ipv4/ip_nat.h>
+#include <linux/netfilter_ipv4/ip_nat_core.h>
+#include <linux/netfilter_ipv4/ip_nat_rule.h>
+#include <linux/netfilter_ipv4/ip_nat_protocol.h>
+
+static int
+udp_in_range(const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype,
+ const union ip_conntrack_manip_proto *min,
+ const union ip_conntrack_manip_proto *max)
+{
+ u_int16_t port;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ port = tuple->src.u.udp.port;
+ else
+ port = tuple->dst.u.udp.port;
+
+ return ntohs(port) >= ntohs(min->udp.port)
+ && ntohs(port) <= ntohs(max->udp.port);
+}
+
+static int
+udp_unique_tuple(struct ip_conntrack_tuple *tuple,
+ const struct ip_nat_range *range,
+ enum ip_nat_manip_type maniptype,
+ const struct ip_conntrack *conntrack)
+{
+ static u_int16_t port, *portptr;
+ unsigned int range_size, min, i;
+
+ if (maniptype == IP_NAT_MANIP_SRC)
+ portptr = &tuple->src.u.udp.port;
+ else
+ portptr = &tuple->dst.u.udp.port;
+
+ /* If no range specified... */
+ if (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED)) {
+ /* If it's dst rewrite, can't change port */
+ if (maniptype == IP_NAT_MANIP_DST)
+ return 0;
+
+ if (ntohs(*portptr) < 1024) {
+ /* Loose convention: >> 512 is credential passing */
+ if (ntohs(*portptr)<512) {
+ min = 1;
+ range_size = 511 - min + 1;
+ } else {
+ min = 600;
+ range_size = 1023 - min + 1;
+ }
+ } else {
+ min = 1024;
+ range_size = 65535 - 1024 + 1;
+ }
+ } else {
+ min = ntohs(range->min.udp.port);
+ range_size = ntohs(range->max.udp.port) - min + 1;
+ }
+
+ for (i = 0; i < range_size; i++, port++) {
+ *portptr = htons(min + port % range_size);
+ if (!ip_nat_used_tuple(tuple, conntrack))
+ return 1;
+ }
+ return 0;
+}
+
+static int
+udp_manip_pkt(struct sk_buff **pskb,
+ unsigned int iphdroff,
+ const struct ip_conntrack_tuple *tuple,
+ enum ip_nat_manip_type maniptype)
+{
+ struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
+ struct udphdr *hdr;
+ unsigned int hdroff = iphdroff + iph->ihl*4;
+ u32 oldip, newip;
+ u16 *portptr, newport;
+
+ if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
+ return 0;
+
+ iph = (struct iphdr *)((*pskb)->data + iphdroff);
+ hdr = (struct udphdr *)((*pskb)->data + hdroff);
+
+ if (maniptype == IP_NAT_MANIP_SRC) {
+ /* Get rid of src ip and src pt */
+ oldip = iph->saddr;
+ newip = tuple->src.ip;
+ newport = tuple->src.u.udp.port;
+ portptr = &hdr->source;
+ } else {
+ /* Get rid of dst ip and dst pt */
+ oldip = iph->daddr;
+ newip = tuple->dst.ip;
+ newport = tuple->dst.u.udp.port;
+ portptr = &hdr->dest;
+ }
+
+ if (hdr->check) { /* 0 is a special case meaning no checksum */
+ if ((*pskb)->proto_csum_blank) {
+ hdr->check = ip_nat_cheat_check(oldip, ~newip,
+ ip_nat_cheat_check(*portptr ^ 0xFFFF,
+ newport, hdr->check));
+ } else {
+ hdr->check = ip_nat_cheat_check(~oldip, newip,
+ ip_nat_cheat_check(*portptr ^ 0xFFFF,
+ newport,
+ hdr->check));
+ }
+ }
+ *portptr = newport;
+ return 1;
+}
+
+static unsigned int
+udp_print(char *buffer,
+ const struct ip_conntrack_tuple *match,
+ const struct ip_conntrack_tuple *mask)
+{
+ unsigned int len = 0;
+
+ if (mask->src.u.udp.port)
+ len += sprintf(buffer + len, "srcpt=%u ",
+ ntohs(match->src.u.udp.port));
+
+
+ if (mask->dst.u.udp.port)
+ len += sprintf(buffer + len, "dstpt=%u ",
+ ntohs(match->dst.u.udp.port));
+
+ return len;
+}
+
+static unsigned int
+udp_print_range(char *buffer, const struct ip_nat_range *range)
+{
+ if (range->min.udp.port != 0 || range->max.udp.port != 0xFFFF) {
+ if (range->min.udp.port == range->max.udp.port)
+ return sprintf(buffer, "port %u ",
+ ntohs(range->min.udp.port));
+ else
+ return sprintf(buffer, "ports %u-%u ",
+ ntohs(range->min.udp.port),
+ ntohs(range->max.udp.port));
+ }
+ else return 0;
+}
+
+struct ip_nat_protocol ip_nat_protocol_udp
+= { "UDP", IPPROTO_UDP,
+ udp_manip_pkt,
+ udp_in_range,
+ udp_unique_tuple,
+ udp_print,
+ udp_print_range
+};
next reply other threads:[~2005-10-14 22:32 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-10-14 22:32 Jon Mason [this message]
-- strict thread matches above, loose matches on Subject: below --
2005-10-14 23:09 [PATCH] Fix NAT for domU checksum offload Ian Pratt
2005-10-14 23:16 ` Jon Mason
2005-10-14 23:49 ` Jon Mason
2005-10-15 7:24 ` Keir Fraser
2005-10-15 9:31 Ian Pratt
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20051014223233.GC11252@us.ibm.com \
--to=jdmason@us.ibm.com \
--cc=xen-devel@lists.xensource.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.