[PATCH 1/1] bridge: forward IPv6 fragmented packets when passing netfilter

netfilter-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

* [PATCH 1/1] bridge: forward IPv6 fragmented packets when passing netfilter
@ 2015-01-19  0:43 Bernhard Thaler
  2015-01-20 17:28 ` Pablo Neira Ayuso
  0 siblings, 1 reply; 10+ messages in thread
From: Bernhard Thaler @ 2015-01-19  0:43 UTC (permalink / raw)
  To: pablo, kadlec; +Cc: netfilter-devel, coreteam, Bernhard Thaler

Currently IPv6 fragmented packets are not forwarded on an ethernet bridge
with netfilter ip6_tables loaded. e.g. steps to reproduce

1) create a simple bridge like this

	modprobe br_netfilter
	brctl addbr br0
	brctl addif br0 eth0
	brctl addif br0 eth2
	ifconfig eth0 up
	ifconfig eth2 up
	ifconfig br0 up

2) place a host with an IPv6 address on each side of the bridge

	set IPv6 address on host A:
	ip -6 addr add fd01:2345:6789:1::1/64 dev eth0

	set IPv6 address on host B:
	ip -6 addr add fd01:2345:6789:1::2/64 dev eth0

3) run a simple ping command on host A with packets > MTU

	ping6 -s 4000 fd01:2345:6789:1::2

4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge

IPv6 fragmented packets traverse the bridge cleanly until "iptables -t nat -nvL"
is run. As soon as it is run (and netfilter modules are loaded) IPv6 fragmented
packets do not traverse the bridge any more (you see no more responses in ping's
output).

Patch exports ip6_fragment() in include/net/ipv6.h and net/ipv6/ip6_output.c
to use it in net/bridge/br_netfilter.c's br_nf_dev_queue_xmit() for IPv6 packets
that need to be fragmented.

In net/bridge/br_netfilter.c br_nf_pre_routing_finish_ipv6() is changed to keep
track of fragment size and br_nf_dev_queue_xmit() checks for IPv6 packets that
need to be fragmented. br_parse_ip6_options() was introduced to do some validity
checks on the IPv6 packet before calling ip6_fragment() and was closely aligned
to IPv4 code as an example.
br_nf_dev_queue_xmit() is changed to contain the relevant code depending on 
CONFIG_NF_DEFRAG_IPV4 or CONFIG_NF_DEFRAG_IPV6 being used both or each for
their own.

ip6_fragment() in net/ipv6/ip6_output.c was changed due to a NULL pointer de-
reference happening when handling packets coming from br_nf_dev_queue_xmit().
When calling IP6_INC_STATS(), ip6_dst_idev(skb_dst(skb)) did crash the kernel
like this:

BUG: unable to handle kernel NULL pointer dereference at 000000000000037a
IP: [<ffffffff814cba2a>] ip6_fragment+0x99a/0x1290
PGD 3bc3f067 PUD 3bc12067 PMD 0 
Oops: 0000 [#1] SMP  
...

So in6_dev_get(skb->dev) is used to set a variable "idev" which is used to call
IP6_INC_STATS() later on. It is assumed that this also solves other occasions
where ip6_fragment() will be called that may cause the same crash. However,
a better fix would be to check for the missing element causing the NULL pointer
dereference and only setting it when it is missing.

ip6_fragment() is further changed to use nf_bridge_mtu_reduction(skb) as it is
done in the IPv4 code.

After applying this patch, in the same setup as stated above fragmented IPv6
packets traverse the bridge even after running "ip6tables -t nat -nvL" / net-
filter modules loaded. This was tested using overlong IPv6 ICMP ping packets
as described above.
Some tests were performed crafting invalid headers (e.g. ipv6 version field
set to 5) to test checks in br_parse_ip6_options(), however these packets do
not seem to reach changed code parts and seem to be dropped at an earlier
stage (more testing needed with invalid / fuzzed packets).

Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
---
 include/net/ipv6.h        |    1 +
 net/bridge/br_netfilter.c |   87 ++++++++++++++++++++++++++++++++++++++++-----
 net/ipv6/ip6_output.c     |   30 ++++++++--------
 3 files changed, 94 insertions(+), 24 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4292929..aecbead 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -495,6 +495,7 @@ struct ip6_create_arg {
 
 void ip6_frag_init(struct inet_frag_queue *q, const void *a);
 bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 /*
  *	Equivalent of ipv4 struct ip
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index c190d22..fac485e 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -18,6 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/if_arp.h>
@@ -34,6 +35,7 @@
 
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/addrconf.h>
 #include <net/route.h>
 #include <net/netfilter/br_netfilter.h>
 
@@ -239,6 +241,59 @@ drop:
 	return -1;
 }
 
+/* Equivalent to br_parse_ip_options for IPv6 */
+
+static int br_parse_ip6_options(struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h;
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = in6_dev_get(skb->dev);
+	u32 len;
+	u8 ip6h_len = sizeof(struct ipv6hdr);
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	ip6h = ipv6_hdr(skb);
+
+	/* Basic sanity checks
+	 * check version
+	 * check minimum header length (40 bytes)
+	 * check total length
+	 */
+	if (ip6h->version != 6)
+		goto inhdr_error;
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	len = ntohs(ip6h->payload_len) + ip6h_len;
+
+	if (skb->len < len) {
+		IP6_INC_STATS_BH(dev_net(dev), idev,
+				 IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	} else if (len < ip6h_len) {
+		goto inhdr_error;
+	}
+
+	if (pskb_trim_rcsum(skb, len)) {
+		IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+	/* No IP options in IPv6 header; however it should be
+	 * checked if some next headers need special treatment
+	 */
+	return 0;
+
+inhdr_error:
+	IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
 /* PF_BRIDGE/PRE_ROUTING *********************************************/
 /* Undo the changes made for ip6tables PREROUTING and continue the
  * bridge PRE_ROUTING hook. */
@@ -246,6 +301,10 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 	struct rtable *rt;
+	int frag_max_size;
+
+	frag_max_size = IP6CB(skb)->frag_max_size;
+	BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
 
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
@@ -763,7 +822,6 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
 	return NF_STOLEN;
 }
 
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
 	int ret;
@@ -772,6 +830,7 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 	/* This is wrong! We should preserve the original fragment
 	 * boundaries by preserving frag_list rather than refragmenting.
 	 */
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	if (skb->protocol == htons(ETH_P_IP) &&
 	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
 	    !skb_is_gso(skb)) {
@@ -781,17 +840,27 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 			return NF_DROP;
 		IPCB(skb)->frag_max_size = frag_max_size;
 		ret = ip_fragment(skb, br_dev_queue_push_xmit);
-	} else
-		ret = br_dev_queue_push_xmit(skb);
+		return ret;
+	}
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6) &&
+	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
+	    !skb_is_gso(skb)) {
+		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+		if (br_parse_ip6_options(skb))
+			/* Drop invalid packet */
+			return NF_DROP;
+		IP6CB(skb)->frag_max_size = frag_max_size;
+		ret = ip6_fragment(skb, br_dev_queue_push_xmit);
+		return ret;
+	}
+#endif
+
+	ret = br_dev_queue_push_xmit(skb);
 
 	return ret;
 }
-#else
-static int br_nf_dev_queue_xmit(struct sk_buff *skb)
-{
-        return br_dev_queue_push_xmit(skb);
-}
-#endif
 
 /* PF_BRIDGE/POST_ROUTING ********************************************/
 static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ce69a12..f34cbb4 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -41,6 +41,7 @@
 
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_bridge.h>
 
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -564,6 +565,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 	int ptr, offset = 0, err = 0;
 	u8 *prevhdr, nexthdr = 0;
 	struct net *net = dev_net(skb_dst(skb)->dev);
+	struct inet6_dev *idev = in6_dev_get(skb->dev);
 
 	hlen = ip6_find_1stfragopt(skb, &prevhdr);
 	nexthdr = *prevhdr;
@@ -581,8 +583,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 		skb->dev = skb_dst(skb)->dev;
 		icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
-		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-			      IPSTATS_MIB_FRAGFAILS);
+		IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 		kfree_skb(skb);
 		return -EMSGSIZE;
 	}
@@ -592,6 +593,10 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 			mtu = np->frag_size;
 	}
 	mtu -= hlen + sizeof(struct frag_hdr);
+#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+	if (skb->nf_bridge)
+		mtu -= nf_bridge_mtu_reduction(skb);
+#endif
 
 	if (skb_has_frag_list(skb)) {
 		int first_len = skb_pagelen(skb);
@@ -630,8 +635,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		*prevhdr = NEXTHDR_FRAGMENT;
 		tmp_hdr = kmemdup(skb_network_header(skb), hlen, GFP_ATOMIC);
 		if (!tmp_hdr) {
-			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-				      IPSTATS_MIB_FRAGFAILS);
+			IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 			return -ENOMEM;
 		}
 
@@ -681,7 +685,7 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 
 			err = output(skb);
 			if (!err)
-				IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
+				IP6_INC_STATS(net, idev,
 					      IPSTATS_MIB_FRAGCREATES);
 
 			if (err || !frag)
@@ -695,16 +699,14 @@ int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *))
 		kfree(tmp_hdr);
 
 		if (err == 0) {
-			IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
-				      IPSTATS_MIB_FRAGOKS);
+			IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGOKS);
 			ip6_rt_put(rt);
 			return 0;
 		}
 
 		kfree_skb_list(frag);
 
-		IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
-			      IPSTATS_MIB_FRAGFAILS);
+		IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 		ip6_rt_put(rt);
 		return err;
 
@@ -752,8 +754,7 @@ slow_path:
 		frag = alloc_skb(len + hlen + sizeof(struct frag_hdr) +
 				 hroom + troom, GFP_ATOMIC);
 		if (!frag) {
-			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-				      IPSTATS_MIB_FRAGFAILS);
+			IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 			err = -ENOMEM;
 			goto fail;
 		}
@@ -819,17 +820,16 @@ slow_path:
 		IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
 			      IPSTATS_MIB_FRAGCREATES);
 	}
-	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-		      IPSTATS_MIB_FRAGOKS);
+	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGOKS);
 	consume_skb(skb);
 	return err;
 
 fail:
-	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-		      IPSTATS_MIB_FRAGFAILS);
+	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 	kfree_skb(skb);
 	return err;
 }
+EXPORT_SYMBOL(ip6_fragment);
 
 static inline int ip6_rt_check(const struct rt6key *rt_key,
 			       const struct in6_addr *fl_addr,
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCH 1/1] bridge: forward IPv6 fragmented packets when passing netfilter
  2015-01-19  0:43 [PATCH 1/1] bridge: forward IPv6 fragmented packets when passing netfilter Bernhard Thaler
@ 2015-01-20 17:28 ` Pablo Neira Ayuso
  2015-01-22 23:27   ` [PATCHv2 " Bernhard Thaler
  0 siblings, 1 reply; 10+ messages in thread
From: Pablo Neira Ayuso @ 2015-01-20 17:28 UTC (permalink / raw)
  To: Bernhard Thaler; +Cc: kadlec, netfilter-devel, coreteam

On Mon, Jan 19, 2015 at 01:43:29AM +0100, Bernhard Thaler wrote:
> ip6_fragment() in net/ipv6/ip6_output.c was changed due to a NULL pointer de-
> reference happening when handling packets coming from br_nf_dev_queue_xmit().
> When calling IP6_INC_STATS(), ip6_dst_idev(skb_dst(skb)) did crash the kernel
> like this:
> 
> BUG: unable to handle kernel NULL pointer dereference at 000000000000037a
> IP: [<ffffffff814cba2a>] ip6_fragment+0x99a/0x1290
> PGD 3bc3f067 PUD 3bc12067 PMD 0 
> Oops: 0000 [#1] SMP  
> ...
> 
> So in6_dev_get(skb->dev) is used to set a variable "idev" which is used to call
> IP6_INC_STATS() later on. It is assumed that this also solves other occasions
> where ip6_fragment() will be called that may cause the same crash. However,
> a better fix would be to check for the missing element causing the NULL pointer
> dereference and only setting it when it is missing.

IP6_INC_STATS() handles null idev pointers. I suspect the struct
fake_rtable in struct net_bridge (see net/bridge/br_private.h) needs
to be converted to something like:

        union {
                struct rtable   fake_rtable;
                struct rt6_info fake_rt6_info;
        };

just to allocate enough room for it.

> ip6_fragment() is further changed to use nf_bridge_mtu_reduction(skb) as it is
> done in the IPv4 code.

This specific change looks the same to what we have in IPv4, so no
objections.

Thanks.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCHv2 1/1] bridge: forward IPv6 fragmented packets when passing netfilter
  2015-01-20 17:28 ` Pablo Neira Ayuso
@ 2015-01-22 23:27   ` Bernhard Thaler
  2015-01-22 23:49     ` Florian Westphal
  0 siblings, 1 reply; 10+ messages in thread
From: Bernhard Thaler @ 2015-01-22 23:27 UTC (permalink / raw)
  To: pablo, kadlec; +Cc: netfilter-devel, Bernhard Thaler

Currently IPv6 fragmented packets are not forwarded on an ethernet bridge
with netfilter ip6_tables loaded. e.g. steps to reproduce

1) create a simple bridge like this

        modprobe br_netfilter
        brctl addbr br0
        brctl addif br0 eth0
        brctl addif br0 eth2
        ifconfig eth0 up
        ifconfig eth2 up
        ifconfig br0 up

2) place a host with an IPv6 address on each side of the bridge

        set IPv6 address on host A:
        ip -6 addr add fd01:2345:6789:1::1/64 dev eth0

        set IPv6 address on host B:
        ip -6 addr add fd01:2345:6789:1::2/64 dev eth0

3) run a simple ping command on host A with packets > MTU

        ping6 -s 4000 fd01:2345:6789:1::2

4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge

IPv6 fragmented packets traverse the bridge cleanly until "ip6tables -t nat -nvL"
is run. As soon as it is run (and netfilter modules are loaded) IPv6 fragmented
packets do not traverse the bridge any more (you see no more responses in ping's
output).

Patch exports ip6_fragment() in include/net/ipv6.h and net/ipv6/ip6_output.c
to use it in net/bridge/br_netfilter.c's br_nf_dev_queue_xmit() for IPv6 packets
that need to be fragmented.

In net/bridge/br_netfilter.c br_nf_pre_routing_finish_ipv6() is changed to keep
track of fragment size and br_nf_dev_queue_xmit() checks for IPv6 packets that
need to be fragmented. br_parse_ip6_options() is introduced to do some validity
checks on the IPv6 packet before calling ip6_fragment() and is closely aligned
to IPv4 code as an example.
br_nf_dev_queue_xmit() is changed to contain the relevant code depending on
CONFIG_NF_DEFRAG_IPV4 or CONFIG_NF_DEFRAG_IPV6 being used both or each for
their own.

ip6_fragment() in net/ipv6/ip6_output.c yields a NULL pointer dereference
happening when handling packets coming from br_nf_dev_queue_xmit().  When
calling IP6_INC_STATS(), ip6_dst_idev(skb_dst(skb)) did crash the kernel
like this:

BUG: unable to handle kernel NULL pointer dereference at 000000000000037a
IP: [<ffffffff814cba2a>] ip6_fragment+0x99a/0x1290
PGD 3bc3f067 PUD 3bc12067 PMD 0
Oops: 0000 [#1] SMP
...

As suggested by Pablo Neira Ayuso net/bridge/br_private.h was changed to
prevent this issue by using a union to hold either fake_rtable or fake_rt6_info.

ip6_fragment() is further changed to use nf_bridge_mtu_reduction(skb) as it is
done in the IPv4 code.

After applying this patch, in the same setup as stated above fragmented IPv6
packets traverse the bridge even after running "ip6tables -t nat -nvL" / net-
filter modules loaded. This was tested using overlong IPv6 ICMP ping packets
as described above.
Some tests were performed crafting invalid headers (e.g. ipv6 version field
set to 5) to test checks in br_parse_ip6_options(), however these packets do
not seem to reach changed code parts and seem to be dropped at an earlier
stage (more testing needed with invalid / fuzzed packets).

Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
---

Changes since v1:
* removed changes to IP6_INC_STATS() calls in net/ipv6/ip6_output.c 
* changed net/bridge/br_private.h to use union for fake_rtable and fake_rt6_info
  as suggested by Pablo Neira Ayuso

 include/net/ipv6.h        |    1 +
 net/bridge/br_netfilter.c |   87 ++++++++++++++++++++++++++++++++++++++++-----
 net/bridge/br_private.h   |    6 +++-
 net/ipv6/ip6_output.c     |    1 +
 4 files changed, 85 insertions(+), 10 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4292929..aecbead 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -495,6 +495,7 @@ struct ip6_create_arg {
 
 void ip6_frag_init(struct inet_frag_queue *q, const void *a);
 bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 /*
  *	Equivalent of ipv4 struct ip
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index c190d22..fac485e 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -18,6 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/if_arp.h>
@@ -34,6 +35,7 @@
 
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/addrconf.h>
 #include <net/route.h>
 #include <net/netfilter/br_netfilter.h>
 
@@ -239,6 +241,59 @@ drop:
 	return -1;
 }
 
+/* Equivalent to br_parse_ip_options for IPv6 */
+
+static int br_parse_ip6_options(struct sk_buff *skb)
+{
+	const struct ipv6hdr *ip6h;
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = in6_dev_get(skb->dev);
+	u32 len;
+	u8 ip6h_len = sizeof(struct ipv6hdr);
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	ip6h = ipv6_hdr(skb);
+
+	/* Basic sanity checks
+	 * check version
+	 * check minimum header length (40 bytes)
+	 * check total length
+	 */
+	if (ip6h->version != 6)
+		goto inhdr_error;
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	len = ntohs(ip6h->payload_len) + ip6h_len;
+
+	if (skb->len < len) {
+		IP6_INC_STATS_BH(dev_net(dev), idev,
+				 IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	} else if (len < ip6h_len) {
+		goto inhdr_error;
+	}
+
+	if (pskb_trim_rcsum(skb, len)) {
+		IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+	/* No IP options in IPv6 header; however it should be
+	 * checked if some next headers need special treatment
+	 */
+	return 0;
+
+inhdr_error:
+	IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
 /* PF_BRIDGE/PRE_ROUTING *********************************************/
 /* Undo the changes made for ip6tables PREROUTING and continue the
  * bridge PRE_ROUTING hook. */
@@ -246,6 +301,10 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 	struct rtable *rt;
+	int frag_max_size;
+
+	frag_max_size = IP6CB(skb)->frag_max_size;
+	BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
 
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
@@ -763,7 +822,6 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
 	return NF_STOLEN;
 }
 
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
 	int ret;
@@ -772,6 +830,7 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 	/* This is wrong! We should preserve the original fragment
 	 * boundaries by preserving frag_list rather than refragmenting.
 	 */
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	if (skb->protocol == htons(ETH_P_IP) &&
 	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
 	    !skb_is_gso(skb)) {
@@ -781,17 +840,27 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 			return NF_DROP;
 		IPCB(skb)->frag_max_size = frag_max_size;
 		ret = ip_fragment(skb, br_dev_queue_push_xmit);
-	} else
-		ret = br_dev_queue_push_xmit(skb);
+		return ret;
+	}
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6) &&
+	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
+	    !skb_is_gso(skb)) {
+		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+		if (br_parse_ip6_options(skb))
+			/* Drop invalid packet */
+			return NF_DROP;
+		IP6CB(skb)->frag_max_size = frag_max_size;
+		ret = ip6_fragment(skb, br_dev_queue_push_xmit);
+		return ret;
+	}
+#endif
+
+	ret = br_dev_queue_push_xmit(skb);
 
 	return ret;
 }
-#else
-static int br_nf_dev_queue_xmit(struct sk_buff *skb)
-{
-        return br_dev_queue_push_xmit(skb);
-}
-#endif
 
 /* PF_BRIDGE/POST_ROUTING ********************************************/
 static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index aea3d13..d9c880d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -18,6 +18,7 @@
 #include <linux/netpoll.h>
 #include <linux/u64_stats_sync.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 #include <linux/if_vlan.h>
 
 #define BR_HASH_BITS 8
@@ -214,7 +215,10 @@ struct net_bridge
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	struct rtable 			fake_rtable;
+	union {
+		struct rtable		fake_rtable;
+		struct rt6_info		fake_rt6_info;
+	};
 	bool				nf_call_iptables;
 	bool				nf_call_ip6tables;
 	bool				nf_call_arptables;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ce69a12..30ff0dc 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -830,6 +830,7 @@ fail:
 	kfree_skb(skb);
 	return err;
 }
+EXPORT_SYMBOL(ip6_fragment);
 
 static inline int ip6_rt_check(const struct rt6key *rt_key,
 			       const struct in6_addr *fl_addr,
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCHv2 1/1] bridge: forward IPv6 fragmented packets when passing netfilter
  2015-01-22 23:27   ` [PATCHv2 " Bernhard Thaler
@ 2015-01-22 23:49     ` Florian Westphal
  2015-01-27  1:22       ` [PATCHv3 " Bernhard Thaler
  0 siblings, 1 reply; 10+ messages in thread
From: Florian Westphal @ 2015-01-22 23:49 UTC (permalink / raw)
  To: Bernhard Thaler; +Cc: pablo, kadlec, netfilter-devel

Bernhard Thaler <bernhard.thaler@wvnet.at> wrote:
> +/* Equivalent to br_parse_ip_options for IPv6 */
> +
> +static int br_parse_ip6_options(struct sk_buff *skb)
> +{

Its not parsing options (yes, the ipv4 counterpart
is also	a misnomer, lets at least not repeat it?).

> +	const struct ipv6hdr *ip6h;
> +	struct net_device *dev = skb->dev;
> +	struct inet6_dev *idev = in6_dev_get(skb->dev);
> +	u32 len;
> +	u8 ip6h_len = sizeof(struct ipv6hdr);
> +
> +	if (!pskb_may_pull(skb, ip6h_len))
> +		goto inhdr_error;
> +
> +	ip6h = ipv6_hdr(skb);
> +
> +	/* Basic sanity checks
> +	 * check version
> +	 * check minimum header length (40 bytes)
> +	 * check total length
> +	 */
> +	if (ip6h->version != 6)
> +		goto inhdr_error;
> +
> +	if (!pskb_may_pull(skb, ip6h_len))
> +		goto inhdr_error;

This maypull call is not needed.

> +	len = ntohs(ip6h->payload_len) + ip6h_len;
> +
> +	if (skb->len < len) {
> +		IP6_INC_STATS_BH(dev_net(dev), idev,
> +				 IPSTATS_MIB_INTRUNCATEDPKTS);
> +		goto drop;
> +	} else if (len < ip6h_len) {

How can this evaluate to true?

I think this should be refactored with what we have in
br_nf_pre_routing_ipv6(); in fact br_nf_pre_routing_ipv6 says that there
is no ipv6 nat which isn't true anymore; I think we should at the very
least zero out skb->cb[] there as well.

Perhaps factor out some common br_ip6_validate() helper that does
whats in br_nf_pre_routing_ipv6.

> +		if (br_parse_ip6_options(skb))
> +			/* Drop invalid packet */
> +			return NF_DROP;

If we call br_parse_ip6_options() (or eqivalent) in PREROUTING, do we need to call
it again here?  It seems to me we validated skb already (Also true for ipv4 counterpart)?

The IP6CB reset is needed of course.

So, to summarize:

- Not sure we need br_parse_ip6_options; we only call it from
  br_nf_dev_queue_xmit(); as long as we validated things in prerouting
  earlier we should be fine.
- The existing validation calls in br_nf_pre_routing_ipv6() should at
  least also zero IP6CB.

I agree with the other changes (even though its ugly; don't have any better
solution either).

Cheers,
Florian

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCHv3 1/1] bridge: forward IPv6 fragmented packets when passing netfilter
  2015-01-22 23:49     ` Florian Westphal
@ 2015-01-27  1:22       ` Bernhard Thaler
  2015-01-27  9:39         ` Florian Westphal
  0 siblings, 1 reply; 10+ messages in thread
From: Bernhard Thaler @ 2015-01-27  1:22 UTC (permalink / raw)
  To: pablo, kadlec; +Cc: netfilter-devel, Bernhard Thaler

Currently IPv6 fragmented packets are not forwarded on an ethernet bridge
with netfilter ip6_tables loaded. e.g. steps to reproduce

1) create a simple bridge like this

        modprobe br_netfilter
        brctl addbr br0
        brctl addif br0 eth0
        brctl addif br0 eth2
        ifconfig eth0 up
        ifconfig eth2 up
        ifconfig br0 up

2) place a host with an IPv6 address on each side of the bridge

        set IPv6 address on host A:
        ip -6 addr add fd01:2345:6789:1::1/64 dev eth0

        set IPv6 address on host B:
        ip -6 addr add fd01:2345:6789:1::2/64 dev eth0

3) run a simple ping command on host A with packets > MTU

        ping6 -s 4000 fd01:2345:6789:1::2

4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge

IPv6 fragmented packets traverse the bridge cleanly until "ip6tables -t nat -nvL"
is run. As soon as it is run (and netfilter modules are loaded) IPv6 fragmented
packets do not traverse the bridge any more (you see no more responses in ping's
output).

Patch exports ip6_fragment() in include/net/ipv6.h and net/ipv6/ip6_output.c
to use it in net/bridge/br_netfilter.c's br_nf_dev_queue_xmit() for IPv6 packets
that need to be fragmented.

In net/bridge/br_netfilter.c br_nf_pre_routing_finish_ipv6() is changed to keep
track of fragment size and br_nf_dev_queue_xmit() checks for IPv6 packets that
need to be fragmented.
br_validate_ipv6() is introduced to do some validity checks on the IPv6 packet
before calling ip6_fragment() in br_nf_dev_queue_xmit() and is closely aligned
to br_parse_ip_options() and existing checks in br_nf_pre_routing_finish_ipv6().
Existing checks in br_nf_pre_routing_finish_ipv6() are factored into
br_validate_ipv6() as suggested by Florian Westphal and br_parse_validate_ipv6()
called from there.
br_nf_dev_queue_xmit() is changed to contain the relevant code depending on
CONFIG_NF_DEFRAG_IPV4 or CONFIG_NF_DEFRAG_IPV6 being used both or each for
their own.

Function order is changed to have check_hbh_len() before br_validate_ipv6() to
be able to call it from there.

ip6_fragment() in net/ipv6/ip6_output.c yields a NULL pointer dereference
happening when handling packets coming from br_nf_dev_queue_xmit().  When
calling IP6_INC_STATS(), ip6_dst_idev(skb_dst(skb)) did crash the kernel
like this:

BUG: unable to handle kernel NULL pointer dereference at 000000000000037a
IP: [<ffffffff814cba2a>] ip6_fragment+0x99a/0x1290
PGD 3bc3f067 PUD 3bc12067 PMD 0
Oops: 0000 [#1] SMP
...

As suggested by Pablo Neira Ayuso net/bridge/br_private.h was changed to
prevent this issue by using a union to hold either fake_rtable or fake_rt6_info.

ip6_fragment() is further changed to use nf_bridge_mtu_reduction(skb) as it is
done in the IPv4 code.

After applying this patch, in the same setup as stated above fragmented IPv6
packets traverse the bridge even after running "ip6tables -t nat -nvL" / net-
filter modules loaded. This was tested using overlong IPv6 ICMP ping packets
as described above.
Some tests were performed crafting invalid headers (e.g. ipv6 version field
set to 5) to test checks in br_parse_ip6_options(), however these packets do
not seem to reach changed code parts and seem to be dropped at an earlier
stage (more testing needed with invalid / fuzzed packets).

Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
---

Changes since v2:
* changes in net/bridge/br_netfilter.c to combine checks on IPv6 header
  already performed in br_nf_pre_routing_finish_ipv6() with newly created
  br_parse_ip6_options()
* rename br_parse_ip6_options() to br_validate_ipv6()
* br_validate_ipv6() still performed in br_nf_pre_routing_finish_ipv6()
  and br_nf_dev_queue_xmit() to make sure even locally generated packets
  with SOCK_RAW are checked
* all changes based on suggestions by Florian Westphal on PATCHv2

 include/net/ipv6.h        |    1 +
 net/bridge/br_netfilter.c |  220 +++++++++++++++++++++++++++------------------
 net/bridge/br_private.h   |    6 +-
 net/ipv6/ip6_output.c     |    1 +
 4 files changed, 139 insertions(+), 89 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4292929..aecbead 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -495,6 +495,7 @@ struct ip6_create_arg {
 
 void ip6_frag_init(struct inet_frag_queue *q, const void *a);
 bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 /*
  *	Equivalent of ipv4 struct ip
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index c190d22..17e6a85 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -18,6 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/ip.h>
+#include <linux/ipv6.h>
 #include <linux/netdevice.h>
 #include <linux/skbuff.h>
 #include <linux/if_arp.h>
@@ -34,6 +35,7 @@
 
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/addrconf.h>
 #include <net/route.h>
 #include <net/netfilter/br_netfilter.h>
 
@@ -239,6 +241,109 @@ drop:
 	return -1;
 }
 
+/* We only check the length. A bridge shouldn't do hop-by-hop stuff anyway */
+static int check_hbh_len(struct sk_buff *skb)
+{
+	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
+	u32 pkt_len;
+	const unsigned char *nh = skb_network_header(skb);
+	int off = raw - nh;
+	int len = (raw[1] + 1) << 3;
+
+	if ((raw + len) - skb->data > skb_headlen(skb))
+		goto bad;
+
+	off += 2;
+	len -= 2;
+
+	while (len > 0) {
+		int optlen = nh[off + 1] + 2;
+
+		switch (nh[off]) {
+		case IPV6_TLV_PAD1:
+			optlen = 1;
+			break;
+
+		case IPV6_TLV_PADN:
+			break;
+
+		case IPV6_TLV_JUMBO:
+			if (nh[off + 1] != 4 || (off & 3) != 2)
+				goto bad;
+			pkt_len = ntohl(*(__be32 *) (nh + off + 2));
+			if (pkt_len <= IPV6_MAXPLEN ||
+			    ipv6_hdr(skb)->payload_len)
+				goto bad;
+			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+				goto bad;
+			if (pskb_trim_rcsum(skb,
+					    pkt_len + sizeof(struct ipv6hdr)))
+				goto bad;
+			nh = skb_network_header(skb);
+			break;
+		default:
+			if (optlen > len)
+				goto bad;
+			break;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+	if (len == 0)
+		return 0;
+bad:
+	return -1;
+}
+
+/* Equivalent to br_parse_ip_options for IPv6 */
+static int br_validate_ipv6(struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = in6_dev_get(skb->dev);
+	u32 pkt_len;
+	u8 ip6h_len = sizeof(struct ipv6hdr);
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	if (skb->len < ip6h_len)
+		goto drop;
+
+	hdr = ipv6_hdr(skb);
+
+	if (hdr->version != 6)
+		goto inhdr_error;
+
+	pkt_len = ntohs(hdr->payload_len);
+
+	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+		if (pkt_len + ip6h_len > skb->len) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INTRUNCATEDPKTS);
+			goto drop;
+		}
+		if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INDISCARDS);
+			goto drop;
+		}
+	}
+	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+		goto drop;
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+	/* No IP options in IPv6 header; however it should be
+	 * checked if some next headers need special treatment
+	 */
+	return 0;
+
+inhdr_error:
+	IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
 /* PF_BRIDGE/PRE_ROUTING *********************************************/
 /* Undo the changes made for ip6tables PREROUTING and continue the
  * bridge PRE_ROUTING hook. */
@@ -246,6 +351,10 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 	struct rtable *rt;
+	int frag_max_size;
+
+	frag_max_size = IP6CB(skb)->frag_max_size;
+	BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
 
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
@@ -468,92 +577,17 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
 	return skb->dev;
 }
 
-/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */
-static int check_hbh_len(struct sk_buff *skb)
-{
-	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
-	u32 pkt_len;
-	const unsigned char *nh = skb_network_header(skb);
-	int off = raw - nh;
-	int len = (raw[1] + 1) << 3;
-
-	if ((raw + len) - skb->data > skb_headlen(skb))
-		goto bad;
-
-	off += 2;
-	len -= 2;
-
-	while (len > 0) {
-		int optlen = nh[off + 1] + 2;
-
-		switch (nh[off]) {
-		case IPV6_TLV_PAD1:
-			optlen = 1;
-			break;
-
-		case IPV6_TLV_PADN:
-			break;
-
-		case IPV6_TLV_JUMBO:
-			if (nh[off + 1] != 4 || (off & 3) != 2)
-				goto bad;
-			pkt_len = ntohl(*(__be32 *) (nh + off + 2));
-			if (pkt_len <= IPV6_MAXPLEN ||
-			    ipv6_hdr(skb)->payload_len)
-				goto bad;
-			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
-				goto bad;
-			if (pskb_trim_rcsum(skb,
-					    pkt_len + sizeof(struct ipv6hdr)))
-				goto bad;
-			nh = skb_network_header(skb);
-			break;
-		default:
-			if (optlen > len)
-				goto bad;
-			break;
-		}
-		off += optlen;
-		len -= optlen;
-	}
-	if (len == 0)
-		return 0;
-bad:
-	return -1;
-
-}
-
 /* Replicate the checks that IPv6 does on packet reception and pass the packet
- * to ip6tables, which doesn't support NAT, so things are fairly simple. */
+ * to ip6tables
+ */
 static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct net_device *in,
 					   const struct net_device *out,
 					   int (*okfn)(struct sk_buff *))
 {
-	const struct ipv6hdr *hdr;
-	u32 pkt_len;
-
-	if (skb->len < sizeof(struct ipv6hdr))
-		return NF_DROP;
-
-	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-		return NF_DROP;
-
-	hdr = ipv6_hdr(skb);
-
-	if (hdr->version != 6)
-		return NF_DROP;
-
-	pkt_len = ntohs(hdr->payload_len);
-
-	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
-		if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
-			return NF_DROP;
-		if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
-			return NF_DROP;
-	}
-	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+	if (br_validate_ipv6(skb))
+		/* Drop invalid packet */
 		return NF_DROP;
 
 	nf_bridge_put(skb->nf_bridge);
@@ -763,7 +797,6 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
 	return NF_STOLEN;
 }
 
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
 	int ret;
@@ -772,6 +805,7 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 	/* This is wrong! We should preserve the original fragment
 	 * boundaries by preserving frag_list rather than refragmenting.
 	 */
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	if (skb->protocol == htons(ETH_P_IP) &&
 	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
 	    !skb_is_gso(skb)) {
@@ -781,17 +815,27 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 			return NF_DROP;
 		IPCB(skb)->frag_max_size = frag_max_size;
 		ret = ip_fragment(skb, br_dev_queue_push_xmit);
-	} else
-		ret = br_dev_queue_push_xmit(skb);
+		return ret;
+	}
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6) &&
+	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
+	    !skb_is_gso(skb)) {
+		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+		if (br_validate_ipv6(skb))
+			/* Drop invalid packet */
+			return NF_DROP;
+		IP6CB(skb)->frag_max_size = frag_max_size;
+		ret = ip6_fragment(skb, br_dev_queue_push_xmit);
+		return ret;
+	}
+#endif
+
+	ret = br_dev_queue_push_xmit(skb);
 
 	return ret;
 }
-#else
-static int br_nf_dev_queue_xmit(struct sk_buff *skb)
-{
-        return br_dev_queue_push_xmit(skb);
-}
-#endif
 
 /* PF_BRIDGE/POST_ROUTING ********************************************/
 static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index aea3d13..d9c880d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -18,6 +18,7 @@
 #include <linux/netpoll.h>
 #include <linux/u64_stats_sync.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 #include <linux/if_vlan.h>
 
 #define BR_HASH_BITS 8
@@ -214,7 +215,10 @@ struct net_bridge
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	struct rtable 			fake_rtable;
+	union {
+		struct rtable		fake_rtable;
+		struct rt6_info		fake_rt6_info;
+	};
 	bool				nf_call_iptables;
 	bool				nf_call_ip6tables;
 	bool				nf_call_arptables;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ce69a12..30ff0dc 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -830,6 +830,7 @@ fail:
 	kfree_skb(skb);
 	return err;
 }
+EXPORT_SYMBOL(ip6_fragment);
 
 static inline int ip6_rt_check(const struct rt6key *rt_key,
 			       const struct in6_addr *fl_addr,
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCHv3 1/1] bridge: forward IPv6 fragmented packets when passing netfilter
  2015-01-27  1:22       ` [PATCHv3 " Bernhard Thaler
@ 2015-01-27  9:39         ` Florian Westphal
  2015-01-27 23:15           ` [PATCHv4 RFC " Bernhard Thaler
  0 siblings, 1 reply; 10+ messages in thread
From: Florian Westphal @ 2015-01-27  9:39 UTC (permalink / raw)
  To: Bernhard Thaler; +Cc: pablo, kadlec, netfilter-devel

Bernhard Thaler <bernhard.thaler@wvnet.at> wrote:
>         modprobe br_netfilter
>         brctl addbr br0
>         brctl addif br0 eth0
>         brctl addif br0 eth2
>         ifconfig eth0 up
>         ifconfig eth2 up
>         ifconfig br0 up
> 
> 2) place a host with an IPv6 address on each side of the bridge
> 
>         set IPv6 address on host A:
>         ip -6 addr add fd01:2345:6789:1::1/64 dev eth0
> 
>         set IPv6 address on host B:
>         ip -6 addr add fd01:2345:6789:1::2/64 dev eth0
> 
> 3) run a simple ping command on host A with packets > MTU
> 
>         ping6 -s 4000 fd01:2345:6789:1::2
> 
> 4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge
> 
> IPv6 fragmented packets traverse the bridge cleanly until "ip6tables -t nat -nvL"
> is run. As soon as it is run (and netfilter modules are loaded) IPv6 fragmented
> packets do not traverse the bridge any more (you see no more responses in ping's
> output).
> 
> Patch exports ip6_fragment() in include/net/ipv6.h and net/ipv6/ip6_output.c
> to use it in net/bridge/br_netfilter.c's br_nf_dev_queue_xmit() for IPv6 packets
> that need to be fragmented.

I think this looks good, however afaics there is now a direct dependeny on
ipv6.ko module.  I think it would be nice if we could avoid this.

There are 2 ways to do this,
a) add fragment to nf_ipv6_ops
or
b) add fragment to pingv6_ops in include/net/ping.h

Ideally, those two should be merged into a single one, say e.g. ipv6_ops,
exported by core and wired up when ipv6 module is present, but I don't
want to push this on you, so e.g. adding fragment to nf_ipv6_ops is
fine with me.

With indirect call we could avoid ip6_fragment dependency.


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCHv4 RFC 1/1] bridge: forward IPv6 fragmented packets when passing netfilter
  2015-01-27  9:39         ` Florian Westphal
@ 2015-01-27 23:15           ` Bernhard Thaler
  2015-01-30 17:17             ` Pablo Neira Ayuso
  0 siblings, 1 reply; 10+ messages in thread
From: Bernhard Thaler @ 2015-01-27 23:15 UTC (permalink / raw)
  To: pablo, kadlec; +Cc: netfilter-devel, Bernhard Thaler

Currently IPv6 fragmented packets are not forwarded on an ethernet bridge
with netfilter ip6_tables loaded. e.g. steps to reproduce

1) create a simple bridge like this

        modprobe br_netfilter
        brctl addbr br0
        brctl addif br0 eth0
        brctl addif br0 eth2
        ifconfig eth0 up
        ifconfig eth2 up
        ifconfig br0 up

2) place a host with an IPv6 address on each side of the bridge

        set IPv6 address on host A:
        ip -6 addr add fd01:2345:6789:1::1/64 dev eth0

        set IPv6 address on host B:
        ip -6 addr add fd01:2345:6789:1::2/64 dev eth0

3) run a simple ping command on host A with packets > MTU

        ping6 -s 4000 fd01:2345:6789:1::2

4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge

IPv6 fragmented packets traverse the bridge cleanly until "ip6tables -t nat -nvL"
is run. As soon as it is run (and netfilter modules are loaded) IPv6 fragmented
packets do not traverse the bridge any more (you see no more responses in ping's
output).

Patch exports ip6_fragment() in include/net/ipv6.h and net/ipv6/ip6_output.c
to call it indirectly from net/bridge/br_netfilter.c's br_nf_dev_queue_xmit()
via nf_ipv6_ops from include/linux/netfilter_ipv6.h when IPv6 packets need to be
fragmented. This solution was suggested by Florian Westphal in order not to have
a direct dependency on ipv6.ko.

In net/bridge/br_netfilter.c br_nf_pre_routing_finish_ipv6() is changed to keep
track of fragment size and br_nf_dev_queue_xmit() checks for IPv6 packets that
need to be fragmented.
br_validate_ipv6() is introduced to do some validity checks on the IPv6 packet
before calling ip6_fragment() in br_nf_dev_queue_xmit() and is closely aligned
to br_parse_ip_options() and existing checks in br_nf_pre_routing_finish_ipv6().
Existing checks in br_nf_pre_routing_finish_ipv6() are factored into
br_validate_ipv6() as suggested by Florian Westphal and br_parse_validate_ipv6()
called from there.
br_nf_dev_queue_xmit() is changed to contain the relevant code depending on
CONFIG_NF_DEFRAG_IPV4 or CONFIG_NF_DEFRAG_IPV6 being used both or each for
their own.

Function order in net/bridge/br_netfilter.c is changed to have check_hbh_len()
before br_validate_ipv6() to be able to call it from there.

ip6_fragment() in net/ipv6/ip6_output.c yields a NULL pointer dereference
happening when handling packets coming from br_nf_dev_queue_xmit().  When
calling IP6_INC_STATS(), ip6_dst_idev(skb_dst(skb)) did crash the kernel
like this:

BUG: unable to handle kernel NULL pointer dereference at 000000000000037a
IP: [<ffffffff814cba2a>] ip6_fragment+0x99a/0x1290
PGD 3bc3f067 PUD 3bc12067 PMD 0
Oops: 0000 [#1] SMP
...

As suggested by Pablo Neira Ayuso net/bridge/br_private.h was changed to
prevent this issue by using a union to hold either fake_rtable or fake_rt6_info.

ip6_fragment() is further changed to use nf_bridge_mtu_reduction(skb) as it is
done in the IPv4 code.

After applying this patch, in the same setup as stated above fragmented IPv6
packets traverse the bridge even after running "ip6tables -t nat -nvL" / net-
filter modules loaded. This was tested using overlong IPv6 ICMP ping packets
as described above.
Some tests were performed crafting invalid headers (e.g. ipv6 version field
set to 5) to test checks in br_parse_ip6_options(), however these packets do
not seem to reach changed code parts and seem to be dropped at an earlier
stage (more testing needed with invalid / fuzzed packets).

Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
---

Changes since v3:
* added ip6_fragment() to nf_ipv6_ops to call it indirectly from 
  br_nf_dev_queue_xmit() when needed as suggested by Florian Westphal

 include/linux/netfilter_ipv6.h |    2 +
 include/net/ipv6.h             |    1 +
 net/bridge/br_netfilter.c      |  225 ++++++++++++++++++++++++----------------
 net/bridge/br_private.h        |    6 +-
 net/ipv6/ip6_output.c          |    1 +
 net/ipv6/netfilter.c           |    1 +
 6 files changed, 147 insertions(+), 89 deletions(-)

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 64dad1cc..7024e7f 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -25,6 +25,8 @@ void ipv6_netfilter_fini(void);
 struct nf_ipv6_ops {
 	int (*chk_addr)(struct net *net, const struct in6_addr *addr,
 			const struct net_device *dev, int strict);
+	int (*fragment)(struct sk_buff *skb,
+			    int (*output)(struct sk_buff *));
 };
 
 extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4292929..aecbead 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -495,6 +495,7 @@ struct ip6_create_arg {
 
 void ip6_frag_init(struct inet_frag_queue *q, const void *a);
 bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 /*
  *	Equivalent of ipv4 struct ip
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index c190d22..186af9d 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -34,6 +34,7 @@
 
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/addrconf.h>
 #include <net/route.h>
 #include <net/netfilter/br_netfilter.h>
 
@@ -239,6 +240,109 @@ drop:
 	return -1;
 }
 
+/* We only check the length. A bridge shouldn't do hop-by-hop stuff anyway */
+static int check_hbh_len(struct sk_buff *skb)
+{
+	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
+	u32 pkt_len;
+	const unsigned char *nh = skb_network_header(skb);
+	int off = raw - nh;
+	int len = (raw[1] + 1) << 3;
+
+	if ((raw + len) - skb->data > skb_headlen(skb))
+		goto bad;
+
+	off += 2;
+	len -= 2;
+
+	while (len > 0) {
+		int optlen = nh[off + 1] + 2;
+
+		switch (nh[off]) {
+		case IPV6_TLV_PAD1:
+			optlen = 1;
+			break;
+
+		case IPV6_TLV_PADN:
+			break;
+
+		case IPV6_TLV_JUMBO:
+			if (nh[off + 1] != 4 || (off & 3) != 2)
+				goto bad;
+			pkt_len = ntohl(*(__be32 *) (nh + off + 2));
+			if (pkt_len <= IPV6_MAXPLEN ||
+			    ipv6_hdr(skb)->payload_len)
+				goto bad;
+			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+				goto bad;
+			if (pskb_trim_rcsum(skb,
+					    pkt_len + sizeof(struct ipv6hdr)))
+				goto bad;
+			nh = skb_network_header(skb);
+			break;
+		default:
+			if (optlen > len)
+				goto bad;
+			break;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+	if (len == 0)
+		return 0;
+bad:
+	return -1;
+}
+
+/* Equivalent to br_parse_ip_options for IPv6 */
+static int br_validate_ipv6(struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = in6_dev_get(skb->dev);
+	u32 pkt_len;
+	u8 ip6h_len = sizeof(struct ipv6hdr);
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	if (skb->len < ip6h_len)
+		goto drop;
+
+	hdr = ipv6_hdr(skb);
+
+	if (hdr->version != 6)
+		goto inhdr_error;
+
+	pkt_len = ntohs(hdr->payload_len);
+
+	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+		if (pkt_len + ip6h_len > skb->len) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INTRUNCATEDPKTS);
+			goto drop;
+		}
+		if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INDISCARDS);
+			goto drop;
+		}
+	}
+	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+		goto drop;
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+	/* No IP options in IPv6 header; however it should be
+	 * checked if some next headers need special treatment
+	 */
+	return 0;
+
+inhdr_error:
+	IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
 /* PF_BRIDGE/PRE_ROUTING *********************************************/
 /* Undo the changes made for ip6tables PREROUTING and continue the
  * bridge PRE_ROUTING hook. */
@@ -246,6 +350,10 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 	struct rtable *rt;
+	int frag_max_size;
+
+	frag_max_size = IP6CB(skb)->frag_max_size;
+	BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
 
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
@@ -468,92 +576,17 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
 	return skb->dev;
 }
 
-/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */
-static int check_hbh_len(struct sk_buff *skb)
-{
-	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
-	u32 pkt_len;
-	const unsigned char *nh = skb_network_header(skb);
-	int off = raw - nh;
-	int len = (raw[1] + 1) << 3;
-
-	if ((raw + len) - skb->data > skb_headlen(skb))
-		goto bad;
-
-	off += 2;
-	len -= 2;
-
-	while (len > 0) {
-		int optlen = nh[off + 1] + 2;
-
-		switch (nh[off]) {
-		case IPV6_TLV_PAD1:
-			optlen = 1;
-			break;
-
-		case IPV6_TLV_PADN:
-			break;
-
-		case IPV6_TLV_JUMBO:
-			if (nh[off + 1] != 4 || (off & 3) != 2)
-				goto bad;
-			pkt_len = ntohl(*(__be32 *) (nh + off + 2));
-			if (pkt_len <= IPV6_MAXPLEN ||
-			    ipv6_hdr(skb)->payload_len)
-				goto bad;
-			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
-				goto bad;
-			if (pskb_trim_rcsum(skb,
-					    pkt_len + sizeof(struct ipv6hdr)))
-				goto bad;
-			nh = skb_network_header(skb);
-			break;
-		default:
-			if (optlen > len)
-				goto bad;
-			break;
-		}
-		off += optlen;
-		len -= optlen;
-	}
-	if (len == 0)
-		return 0;
-bad:
-	return -1;
-
-}
-
 /* Replicate the checks that IPv6 does on packet reception and pass the packet
- * to ip6tables, which doesn't support NAT, so things are fairly simple. */
+ * to ip6tables
+ */
 static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct net_device *in,
 					   const struct net_device *out,
 					   int (*okfn)(struct sk_buff *))
 {
-	const struct ipv6hdr *hdr;
-	u32 pkt_len;
-
-	if (skb->len < sizeof(struct ipv6hdr))
-		return NF_DROP;
-
-	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-		return NF_DROP;
-
-	hdr = ipv6_hdr(skb);
-
-	if (hdr->version != 6)
-		return NF_DROP;
-
-	pkt_len = ntohs(hdr->payload_len);
-
-	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
-		if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
-			return NF_DROP;
-		if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
-			return NF_DROP;
-	}
-	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+	if (br_validate_ipv6(skb))
+		/* Drop invalid packet */
 		return NF_DROP;
 
 	nf_bridge_put(skb->nf_bridge);
@@ -763,15 +796,18 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
 	return NF_STOLEN;
 }
 
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
 	int ret;
 	int frag_max_size;
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+#endif
 
 	/* This is wrong! We should preserve the original fragment
 	 * boundaries by preserving frag_list rather than refragmenting.
 	 */
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	if (skb->protocol == htons(ETH_P_IP) &&
 	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
 	    !skb_is_gso(skb)) {
@@ -781,17 +817,30 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 			return NF_DROP;
 		IPCB(skb)->frag_max_size = frag_max_size;
 		ret = ip_fragment(skb, br_dev_queue_push_xmit);
-	} else
-		ret = br_dev_queue_push_xmit(skb);
+		return ret;
+	}
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6) &&
+	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
+	    !skb_is_gso(skb)) {
+		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+		if (br_validate_ipv6(skb))
+			/* Drop invalid packet */
+			return NF_DROP;
+		IP6CB(skb)->frag_max_size = frag_max_size;
+
+		if (v6ops) {
+			ret = v6ops->fragment(skb, br_dev_queue_push_xmit);
+			return ret;
+		}
+	}
+#endif
+
+	ret = br_dev_queue_push_xmit(skb);
 
 	return ret;
 }
-#else
-static int br_nf_dev_queue_xmit(struct sk_buff *skb)
-{
-        return br_dev_queue_push_xmit(skb);
-}
-#endif
 
 /* PF_BRIDGE/POST_ROUTING ********************************************/
 static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index aea3d13..d9c880d 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -18,6 +18,7 @@
 #include <linux/netpoll.h>
 #include <linux/u64_stats_sync.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 #include <linux/if_vlan.h>
 
 #define BR_HASH_BITS 8
@@ -214,7 +215,10 @@ struct net_bridge
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	struct rtable 			fake_rtable;
+	union {
+		struct rtable		fake_rtable;
+		struct rt6_info		fake_rt6_info;
+	};
 	bool				nf_call_iptables;
 	bool				nf_call_ip6tables;
 	bool				nf_call_arptables;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ce69a12..30ff0dc 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -830,6 +830,7 @@ fail:
 	kfree_skb(skb);
 	return err;
 }
+EXPORT_SYMBOL(ip6_fragment);
 
 static inline int ip6_rt_check(const struct rt6key *rt_key,
 			       const struct in6_addr *fl_addr,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 398377a..3080815 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -191,6 +191,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
 
 static const struct nf_ipv6_ops ipv6ops = {
 	.chk_addr	= ipv6_chk_addr,
+	.fragment	= ip6_fragment,
 };
 
 static const struct nf_afinfo nf_ip6_afinfo = {
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCHv4 RFC 1/1] bridge: forward IPv6 fragmented packets when passing netfilter
  2015-01-27 23:15           ` [PATCHv4 RFC " Bernhard Thaler
@ 2015-01-30 17:17             ` Pablo Neira Ayuso
  2015-01-30 17:25               ` Pablo Neira Ayuso
  0 siblings, 1 reply; 10+ messages in thread
From: Pablo Neira Ayuso @ 2015-01-30 17:17 UTC (permalink / raw)
  To: Bernhard Thaler; +Cc: netfilter-devel, fw

[-- Attachment #1: Type: text/plain, Size: 2059 bytes --]

On Wed, Jan 28, 2015 at 12:15:21AM +0100, Bernhard Thaler wrote:
> -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
>  static int br_nf_dev_queue_xmit(struct sk_buff *skb)

Made small changes to this function, see patch attached.

>  {
>  	int ret;
>  	int frag_max_size;
> +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
> +	const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();

declare this variable and initialize in the scope of the if() branch.

> +#endif
>  
>  	/* This is wrong! We should preserve the original fragment
>  	 * boundaries by preserving frag_list rather than refragmenting.
>  	 */
> +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
>  	if (skb->protocol == htons(ETH_P_IP) &&
>  	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
>  	    !skb_is_gso(skb)) {
> @@ -781,17 +817,30 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
>  			return NF_DROP;
>  		IPCB(skb)->frag_max_size = frag_max_size;
>  		ret = ip_fragment(skb, br_dev_queue_push_xmit);
> -	} else
> -		ret = br_dev_queue_push_xmit(skb);
> +		return ret;

removed ret, instead simply return ip_fragment(...)

> +	}
> +#endif
> +#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
> +	if (skb->protocol == htons(ETH_P_IPV6) &&
> +	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
> +	    !skb_is_gso(skb)) {
> +		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
> +		if (br_validate_ipv6(skb))
> +			/* Drop invalid packet */
> +			return NF_DROP;
> +		IP6CB(skb)->frag_max_size = frag_max_size;
> +
> +		if (v6ops) {
> +			ret = v6ops->fragment(skb, br_dev_queue_push_xmit);
> +			return ret;
> +		}

add else here and return -EMSGSIZE. I think it's not worth even try to
send the packet when is larger than mtu.

> +	}
> +#endif
> +
> +	ret = br_dev_queue_push_xmit(skb);
>  
>  	return ret;
>  }

See patch attached with the minor changes that I indicated above.

BTW, will you also follow up with a new version of

http://patchwork.ozlabs.org/patch/418305/ ?

that applies on top of this change, which is the original patch that
you needed mainstream.

Let me know, thanks.

[-- Attachment #2: 0001-br_netfilter-forward-IPv6-fragmented-packets-when-pa.patch --]
[-- Type: text/x-diff, Size: 13837 bytes --]

>From 01ef54eb933cd5effcf34f1e0d33b90979d3a659 Mon Sep 17 00:00:00 2001
From: Bernhard Thaler <bernhard.thaler@wvnet.at>
Date: Wed, 28 Jan 2015 00:15:21 +0100
Subject: [PATCH] br_netfilter: forward IPv6 fragmented packets when passing
 bridge netfilter

Currently IPv6 fragmented packets are not forwarded on an ethernet bridge
with netfilter ip6_tables loaded. e.g. steps to reproduce

1) create a simple bridge like this

        modprobe br_netfilter
        brctl addbr br0
        brctl addif br0 eth0
        brctl addif br0 eth2
        ifconfig eth0 up
        ifconfig eth2 up
        ifconfig br0 up

2) place a host with an IPv6 address on each side of the bridge

        set IPv6 address on host A:
        ip -6 addr add fd01:2345:6789:1::1/64 dev eth0

        set IPv6 address on host B:
        ip -6 addr add fd01:2345:6789:1::2/64 dev eth0

3) run a simple ping command on host A with packets > MTU

        ping6 -s 4000 fd01:2345:6789:1::2

4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge

IPv6 fragmented packets traverse the bridge cleanly until "ip6tables -t nat -nvL"
is run. As soon as it is run (and netfilter modules are loaded) IPv6 fragmented
packets do not traverse the bridge any more (you see no more responses in ping's
output).

Patch exports ip6_fragment() in include/net/ipv6.h and net/ipv6/ip6_output.c
to call it indirectly from net/bridge/br_netfilter.c's br_nf_dev_queue_xmit()
via nf_ipv6_ops from include/linux/netfilter_ipv6.h when IPv6 packets need to be
fragmented. This solution was suggested by Florian Westphal in order not to have
a direct dependency on ipv6.ko.

In net/bridge/br_netfilter.c br_nf_pre_routing_finish_ipv6() is changed to keep
track of fragment size and br_nf_dev_queue_xmit() checks for IPv6 packets that
need to be fragmented.
br_validate_ipv6() is introduced to do some validity checks on the IPv6 packet
before calling ip6_fragment() in br_nf_dev_queue_xmit() and is closely aligned
to br_parse_ip_options() and existing checks in br_nf_pre_routing_finish_ipv6().
Existing checks in br_nf_pre_routing_finish_ipv6() are factored into
br_validate_ipv6() as suggested by Florian Westphal and br_parse_validate_ipv6()
called from there.
br_nf_dev_queue_xmit() is changed to contain the relevant code depending on
CONFIG_NF_DEFRAG_IPV4 or CONFIG_NF_DEFRAG_IPV6 being used both or each for
their own.

Function order in net/bridge/br_netfilter.c is changed to have check_hbh_len()
before br_validate_ipv6() to be able to call it from there.

ip6_fragment() in net/ipv6/ip6_output.c yields a NULL pointer dereference
happening when handling packets coming from br_nf_dev_queue_xmit().  When
calling IP6_INC_STATS(), ip6_dst_idev(skb_dst(skb)) did crash the kernel
like this:

BUG: unable to handle kernel NULL pointer dereference at 000000000000037a
IP: [<ffffffff814cba2a>] ip6_fragment+0x99a/0x1290
PGD 3bc3f067 PUD 3bc12067 PMD 0
Oops: 0000 [#1] SMP
...

As suggested by Pablo Neira Ayuso net/bridge/br_private.h was changed to
prevent this issue by using a union to hold either fake_rtable or fake_rt6_info.

ip6_fragment() is further changed to use nf_bridge_mtu_reduction(skb) as it is
done in the IPv4 code.

After applying this patch, in the same setup as stated above fragmented IPv6
packets traverse the bridge even after running "ip6tables -t nat -nvL" / net-
filter modules loaded. This was tested using overlong IPv6 ICMP ping packets
as described above.
Some tests were performed crafting invalid headers (e.g. ipv6 version field
set to 5) to test checks in br_parse_ip6_options(), however these packets do
not seem to reach changed code parts and seem to be dropped at an earlier
stage (more testing needed with invalid / fuzzed packets).

Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h |    1 +
 include/net/ipv6.h             |    1 +
 net/bridge/br_netfilter.c      |  225 ++++++++++++++++++++++++----------------
 net/bridge/br_private.h        |    6 +-
 net/ipv6/ip6_output.c          |    1 +
 net/ipv6/netfilter.c           |    1 +
 6 files changed, 143 insertions(+), 92 deletions(-)

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 64dad1cc..7c832ba 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -25,6 +25,7 @@ void ipv6_netfilter_fini(void);
 struct nf_ipv6_ops {
 	int (*chk_addr)(struct net *net, const struct in6_addr *addr,
 			const struct net_device *dev, int strict);
+	int (*fragment)(struct sk_buff *skb, int (*output)(struct sk_buff *));
 };
 
 extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4292929..aecbead 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -495,6 +495,7 @@ struct ip6_create_arg {
 
 void ip6_frag_init(struct inet_frag_queue *q, const void *a);
 bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 /*
  *	Equivalent of ipv4 struct ip
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 65728e0..8b6d693 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -34,6 +34,7 @@
 
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/addrconf.h>
 #include <net/route.h>
 #include <net/netfilter/br_netfilter.h>
 
@@ -239,6 +240,109 @@ drop:
 	return -1;
 }
 
+/* We only check the length. A bridge shouldn't do hop-by-hop stuff anyway */
+static int check_hbh_len(struct sk_buff *skb)
+{
+	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
+	u32 pkt_len;
+	const unsigned char *nh = skb_network_header(skb);
+	int off = raw - nh;
+	int len = (raw[1] + 1) << 3;
+
+	if ((raw + len) - skb->data > skb_headlen(skb))
+		goto bad;
+
+	off += 2;
+	len -= 2;
+
+	while (len > 0) {
+		int optlen = nh[off + 1] + 2;
+
+		switch (nh[off]) {
+		case IPV6_TLV_PAD1:
+			optlen = 1;
+			break;
+
+		case IPV6_TLV_PADN:
+			break;
+
+		case IPV6_TLV_JUMBO:
+			if (nh[off + 1] != 4 || (off & 3) != 2)
+				goto bad;
+			pkt_len = ntohl(*(__be32 *) (nh + off + 2));
+			if (pkt_len <= IPV6_MAXPLEN ||
+			    ipv6_hdr(skb)->payload_len)
+				goto bad;
+			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+				goto bad;
+			if (pskb_trim_rcsum(skb,
+					    pkt_len + sizeof(struct ipv6hdr)))
+				goto bad;
+			nh = skb_network_header(skb);
+			break;
+		default:
+			if (optlen > len)
+				goto bad;
+			break;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+	if (len == 0)
+		return 0;
+bad:
+	return -1;
+}
+
+/* Equivalent to br_parse_ip_options for IPv6 */
+static int br_validate_ipv6(struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = in6_dev_get(skb->dev);
+	u32 pkt_len;
+	u8 ip6h_len = sizeof(struct ipv6hdr);
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	if (skb->len < ip6h_len)
+		goto drop;
+
+	hdr = ipv6_hdr(skb);
+
+	if (hdr->version != 6)
+		goto inhdr_error;
+
+	pkt_len = ntohs(hdr->payload_len);
+
+	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+		if (pkt_len + ip6h_len > skb->len) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INTRUNCATEDPKTS);
+			goto drop;
+		}
+		if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INDISCARDS);
+			goto drop;
+		}
+	}
+	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+		goto drop;
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+	/* No IP options in IPv6 header; however it should be
+	 * checked if some next headers need special treatment
+	 */
+	return 0;
+
+inhdr_error:
+	IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
 /* PF_BRIDGE/PRE_ROUTING *********************************************/
 /* Undo the changes made for ip6tables PREROUTING and continue the
  * bridge PRE_ROUTING hook. */
@@ -246,6 +350,10 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 	struct rtable *rt;
+	int frag_max_size;
+
+	frag_max_size = IP6CB(skb)->frag_max_size;
+	BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
 
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
@@ -468,92 +576,17 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
 	return skb->dev;
 }
 
-/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */
-static int check_hbh_len(struct sk_buff *skb)
-{
-	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
-	u32 pkt_len;
-	const unsigned char *nh = skb_network_header(skb);
-	int off = raw - nh;
-	int len = (raw[1] + 1) << 3;
-
-	if ((raw + len) - skb->data > skb_headlen(skb))
-		goto bad;
-
-	off += 2;
-	len -= 2;
-
-	while (len > 0) {
-		int optlen = nh[off + 1] + 2;
-
-		switch (nh[off]) {
-		case IPV6_TLV_PAD1:
-			optlen = 1;
-			break;
-
-		case IPV6_TLV_PADN:
-			break;
-
-		case IPV6_TLV_JUMBO:
-			if (nh[off + 1] != 4 || (off & 3) != 2)
-				goto bad;
-			pkt_len = ntohl(*(__be32 *) (nh + off + 2));
-			if (pkt_len <= IPV6_MAXPLEN ||
-			    ipv6_hdr(skb)->payload_len)
-				goto bad;
-			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
-				goto bad;
-			if (pskb_trim_rcsum(skb,
-					    pkt_len + sizeof(struct ipv6hdr)))
-				goto bad;
-			nh = skb_network_header(skb);
-			break;
-		default:
-			if (optlen > len)
-				goto bad;
-			break;
-		}
-		off += optlen;
-		len -= optlen;
-	}
-	if (len == 0)
-		return 0;
-bad:
-	return -1;
-
-}
-
 /* Replicate the checks that IPv6 does on packet reception and pass the packet
- * to ip6tables, which doesn't support NAT, so things are fairly simple. */
+ * to ip6tables
+ */
 static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct net_device *in,
 					   const struct net_device *out,
 					   int (*okfn)(struct sk_buff *))
 {
-	const struct ipv6hdr *hdr;
-	u32 pkt_len;
-
-	if (skb->len < sizeof(struct ipv6hdr))
-		return NF_DROP;
-
-	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-		return NF_DROP;
-
-	hdr = ipv6_hdr(skb);
-
-	if (hdr->version != 6)
-		return NF_DROP;
-
-	pkt_len = ntohs(hdr->payload_len);
-
-	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
-		if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
-			return NF_DROP;
-		if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
-			return NF_DROP;
-	}
-	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+	if (br_validate_ipv6(skb))
+		/* Drop invalid packet */
 		return NF_DROP;
 
 	nf_bridge_put(skb->nf_bridge);
@@ -763,15 +796,14 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
 	return NF_STOLEN;
 }
 
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
-	int ret;
 	int frag_max_size;
 
 	/* This is wrong! We should preserve the original fragment
 	 * boundaries by preserving frag_list rather than refragmenting.
 	 */
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	if (skb->protocol == htons(ETH_P_IP) &&
 	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
 	    !skb_is_gso(skb)) {
@@ -780,18 +812,29 @@ static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 			/* Drop invalid packet */
 			return NF_DROP;
 		IPCB(skb)->frag_max_size = frag_max_size;
-		ret = ip_fragment(skb, br_dev_queue_push_xmit);
-	} else
-		ret = br_dev_queue_push_xmit(skb);
+		return ip_fragment(skb, br_dev_queue_push_xmit);
+	}
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6) &&
+	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
+	    !skb_is_gso(skb)) {
+		const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
 
-	return ret;
-}
-#else
-static int br_nf_dev_queue_xmit(struct sk_buff *skb)
-{
-        return br_dev_queue_push_xmit(skb);
-}
+		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+		if (br_validate_ipv6(skb))
+			/* Drop invalid packet */
+			return NF_DROP;
+		IP6CB(skb)->frag_max_size = frag_max_size;
+
+		if (v6ops)
+			return v6ops->fragment(skb, br_dev_queue_push_xmit);
+		else
+			return -EMSGSIZE;
+	}
 #endif
+	return br_dev_queue_push_xmit(skb);
+}
 
 /* PF_BRIDGE/POST_ROUTING ********************************************/
 static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index e8e3f36..bc65e7a 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -18,6 +18,7 @@
 #include <linux/netpoll.h>
 #include <linux/u64_stats_sync.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 #include <linux/if_vlan.h>
 
 #define BR_HASH_BITS 8
@@ -214,7 +215,10 @@ struct net_bridge
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	struct rtable 			fake_rtable;
+	union {
+		struct rtable		fake_rtable;
+		struct rt6_info		fake_rt6_info;
+	};
 	bool				nf_call_iptables;
 	bool				nf_call_ip6tables;
 	bool				nf_call_arptables;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ce69a12..30ff0dc 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -830,6 +830,7 @@ fail:
 	kfree_skb(skb);
 	return err;
 }
+EXPORT_SYMBOL(ip6_fragment);
 
 static inline int ip6_rt_check(const struct rt6key *rt_key,
 			       const struct in6_addr *fl_addr,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 398377a..3080815 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -191,6 +191,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
 
 static const struct nf_ipv6_ops ipv6ops = {
 	.chk_addr	= ipv6_chk_addr,
+	.fragment	= ip6_fragment,
 };
 
 static const struct nf_afinfo nf_ip6_afinfo = {
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [PATCHv4 RFC 1/1] bridge: forward IPv6 fragmented packets when passing netfilter
  2015-01-30 17:17             ` Pablo Neira Ayuso
@ 2015-01-30 17:25               ` Pablo Neira Ayuso
  2015-03-18 21:53                 ` [PATCH 2/4] " Bernhard Thaler
  0 siblings, 1 reply; 10+ messages in thread
From: Pablo Neira Ayuso @ 2015-01-30 17:25 UTC (permalink / raw)
  To: Bernhard Thaler; +Cc: netfilter-devel, fw

[-- Attachment #1: Type: text/plain, Size: 535 bytes --]

On Fri, Jan 30, 2015 at 06:17:01PM +0100, Pablo Neira Ayuso wrote:
> On Wed, Jan 28, 2015 at 12:15:21AM +0100, Bernhard Thaler wrote:
> > -#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
> >  static int br_nf_dev_queue_xmit(struct sk_buff *skb)
> 
> Made small changes to this function, see patch attached.
> 
> >  {
> >  	int ret;
> >  	int frag_max_size;

Actually, we can also get rid of this frag_max_size variable,
otherwise we'll have a compilation warning with NF_DEFRAG disabled for
both IPv4 and IPv6.

So I made this small change too.

[-- Attachment #2: 0001-br_netfilter-forward-IPv6-fragmented-packets-when-pa.patch --]
[-- Type: text/x-diff, Size: 13884 bytes --]

>From 0e5b1418f97c0ad044257bcd57f3bb9589736db2 Mon Sep 17 00:00:00 2001
From: Bernhard Thaler <bernhard.thaler@wvnet.at>
Date: Wed, 28 Jan 2015 00:15:21 +0100
Subject: [PATCH] br_netfilter: forward IPv6 fragmented packets when passing
 bridge netfilter

Currently IPv6 fragmented packets are not forwarded on an ethernet bridge
with netfilter ip6_tables loaded. e.g. steps to reproduce

1) create a simple bridge like this

        modprobe br_netfilter
        brctl addbr br0
        brctl addif br0 eth0
        brctl addif br0 eth2
        ifconfig eth0 up
        ifconfig eth2 up
        ifconfig br0 up

2) place a host with an IPv6 address on each side of the bridge

        set IPv6 address on host A:
        ip -6 addr add fd01:2345:6789:1::1/64 dev eth0

        set IPv6 address on host B:
        ip -6 addr add fd01:2345:6789:1::2/64 dev eth0

3) run a simple ping command on host A with packets > MTU

        ping6 -s 4000 fd01:2345:6789:1::2

4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge

IPv6 fragmented packets traverse the bridge cleanly until "ip6tables -t nat -nvL"
is run. As soon as it is run (and netfilter modules are loaded) IPv6 fragmented
packets do not traverse the bridge any more (you see no more responses in ping's
output).

Patch exports ip6_fragment() in include/net/ipv6.h and net/ipv6/ip6_output.c
to call it indirectly from net/bridge/br_netfilter.c's br_nf_dev_queue_xmit()
via nf_ipv6_ops from include/linux/netfilter_ipv6.h when IPv6 packets need to be
fragmented. This solution was suggested by Florian Westphal in order not to have
a direct dependency on ipv6.ko.

In net/bridge/br_netfilter.c br_nf_pre_routing_finish_ipv6() is changed to keep
track of fragment size and br_nf_dev_queue_xmit() checks for IPv6 packets that
need to be fragmented.
br_validate_ipv6() is introduced to do some validity checks on the IPv6 packet
before calling ip6_fragment() in br_nf_dev_queue_xmit() and is closely aligned
to br_parse_ip_options() and existing checks in br_nf_pre_routing_finish_ipv6().
Existing checks in br_nf_pre_routing_finish_ipv6() are factored into
br_validate_ipv6() as suggested by Florian Westphal and br_parse_validate_ipv6()
called from there.
br_nf_dev_queue_xmit() is changed to contain the relevant code depending on
CONFIG_NF_DEFRAG_IPV4 or CONFIG_NF_DEFRAG_IPV6 being used both or each for
their own.

Function order in net/bridge/br_netfilter.c is changed to have check_hbh_len()
before br_validate_ipv6() to be able to call it from there.

ip6_fragment() in net/ipv6/ip6_output.c yields a NULL pointer dereference
happening when handling packets coming from br_nf_dev_queue_xmit().  When
calling IP6_INC_STATS(), ip6_dst_idev(skb_dst(skb)) did crash the kernel
like this:

BUG: unable to handle kernel NULL pointer dereference at 000000000000037a
IP: [<ffffffff814cba2a>] ip6_fragment+0x99a/0x1290
PGD 3bc3f067 PUD 3bc12067 PMD 0
Oops: 0000 [#1] SMP
...

As suggested by Pablo Neira Ayuso net/bridge/br_private.h was changed to
prevent this issue by using a union to hold either fake_rtable or fake_rt6_info.

ip6_fragment() is further changed to use nf_bridge_mtu_reduction(skb) as it is
done in the IPv4 code.

After applying this patch, in the same setup as stated above fragmented IPv6
packets traverse the bridge even after running "ip6tables -t nat -nvL" / net-
filter modules loaded. This was tested using overlong IPv6 ICMP ping packets
as described above.
Some tests were performed crafting invalid headers (e.g. ipv6 version field
set to 5) to test checks in br_parse_ip6_options(), however these packets do
not seem to reach changed code parts and seem to be dropped at an earlier
stage (more testing needed with invalid / fuzzed packets).

Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 include/linux/netfilter_ipv6.h |    1 +
 include/net/ipv6.h             |    1 +
 net/bridge/br_netfilter.c      |  229 +++++++++++++++++++++++-----------------
 net/bridge/br_private.h        |    6 +-
 net/ipv6/ip6_output.c          |    1 +
 net/ipv6/netfilter.c           |    1 +
 6 files changed, 143 insertions(+), 96 deletions(-)

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index 64dad1cc..7c832ba 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -25,6 +25,7 @@ void ipv6_netfilter_fini(void);
 struct nf_ipv6_ops {
 	int (*chk_addr)(struct net *net, const struct in6_addr *addr,
 			const struct net_device *dev, int strict);
+	int (*fragment)(struct sk_buff *skb, int (*output)(struct sk_buff *));
 };
 
 extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops;
diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 4292929..aecbead 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -495,6 +495,7 @@ struct ip6_create_arg {
 
 void ip6_frag_init(struct inet_frag_queue *q, const void *a);
 bool ip6_frag_match(const struct inet_frag_queue *q, const void *a);
+int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *));
 
 /*
  *	Equivalent of ipv4 struct ip
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 65728e0..01b01b8 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -34,6 +34,7 @@
 
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/addrconf.h>
 #include <net/route.h>
 #include <net/netfilter/br_netfilter.h>
 
@@ -239,6 +240,109 @@ drop:
 	return -1;
 }
 
+/* We only check the length. A bridge shouldn't do hop-by-hop stuff anyway */
+static int check_hbh_len(struct sk_buff *skb)
+{
+	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
+	u32 pkt_len;
+	const unsigned char *nh = skb_network_header(skb);
+	int off = raw - nh;
+	int len = (raw[1] + 1) << 3;
+
+	if ((raw + len) - skb->data > skb_headlen(skb))
+		goto bad;
+
+	off += 2;
+	len -= 2;
+
+	while (len > 0) {
+		int optlen = nh[off + 1] + 2;
+
+		switch (nh[off]) {
+		case IPV6_TLV_PAD1:
+			optlen = 1;
+			break;
+
+		case IPV6_TLV_PADN:
+			break;
+
+		case IPV6_TLV_JUMBO:
+			if (nh[off + 1] != 4 || (off & 3) != 2)
+				goto bad;
+			pkt_len = ntohl(*(__be32 *) (nh + off + 2));
+			if (pkt_len <= IPV6_MAXPLEN ||
+			    ipv6_hdr(skb)->payload_len)
+				goto bad;
+			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+				goto bad;
+			if (pskb_trim_rcsum(skb,
+					    pkt_len + sizeof(struct ipv6hdr)))
+				goto bad;
+			nh = skb_network_header(skb);
+			break;
+		default:
+			if (optlen > len)
+				goto bad;
+			break;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+	if (len == 0)
+		return 0;
+bad:
+	return -1;
+}
+
+/* Equivalent to br_parse_ip_options for IPv6 */
+static int br_validate_ipv6(struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = in6_dev_get(skb->dev);
+	u32 pkt_len;
+	u8 ip6h_len = sizeof(struct ipv6hdr);
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	if (skb->len < ip6h_len)
+		goto drop;
+
+	hdr = ipv6_hdr(skb);
+
+	if (hdr->version != 6)
+		goto inhdr_error;
+
+	pkt_len = ntohs(hdr->payload_len);
+
+	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+		if (pkt_len + ip6h_len > skb->len) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INTRUNCATEDPKTS);
+			goto drop;
+		}
+		if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INDISCARDS);
+			goto drop;
+		}
+	}
+	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+		goto drop;
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+	/* No IP options in IPv6 header; however it should be
+	 * checked if some next headers need special treatment
+	 */
+	return 0;
+
+inhdr_error:
+	IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
 /* PF_BRIDGE/PRE_ROUTING *********************************************/
 /* Undo the changes made for ip6tables PREROUTING and continue the
  * bridge PRE_ROUTING hook. */
@@ -246,6 +350,10 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 {
 	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
 	struct rtable *rt;
+	int frag_max_size;
+
+	frag_max_size = IP6CB(skb)->frag_max_size;
+	BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
 
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
@@ -468,92 +576,17 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
 	return skb->dev;
 }
 
-/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */
-static int check_hbh_len(struct sk_buff *skb)
-{
-	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
-	u32 pkt_len;
-	const unsigned char *nh = skb_network_header(skb);
-	int off = raw - nh;
-	int len = (raw[1] + 1) << 3;
-
-	if ((raw + len) - skb->data > skb_headlen(skb))
-		goto bad;
-
-	off += 2;
-	len -= 2;
-
-	while (len > 0) {
-		int optlen = nh[off + 1] + 2;
-
-		switch (nh[off]) {
-		case IPV6_TLV_PAD1:
-			optlen = 1;
-			break;
-
-		case IPV6_TLV_PADN:
-			break;
-
-		case IPV6_TLV_JUMBO:
-			if (nh[off + 1] != 4 || (off & 3) != 2)
-				goto bad;
-			pkt_len = ntohl(*(__be32 *) (nh + off + 2));
-			if (pkt_len <= IPV6_MAXPLEN ||
-			    ipv6_hdr(skb)->payload_len)
-				goto bad;
-			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
-				goto bad;
-			if (pskb_trim_rcsum(skb,
-					    pkt_len + sizeof(struct ipv6hdr)))
-				goto bad;
-			nh = skb_network_header(skb);
-			break;
-		default:
-			if (optlen > len)
-				goto bad;
-			break;
-		}
-		off += optlen;
-		len -= optlen;
-	}
-	if (len == 0)
-		return 0;
-bad:
-	return -1;
-
-}
-
 /* Replicate the checks that IPv6 does on packet reception and pass the packet
- * to ip6tables, which doesn't support NAT, so things are fairly simple. */
+ * to ip6tables
+ */
 static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct net_device *in,
 					   const struct net_device *out,
 					   int (*okfn)(struct sk_buff *))
 {
-	const struct ipv6hdr *hdr;
-	u32 pkt_len;
-
-	if (skb->len < sizeof(struct ipv6hdr))
-		return NF_DROP;
-
-	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-		return NF_DROP;
-
-	hdr = ipv6_hdr(skb);
-
-	if (hdr->version != 6)
-		return NF_DROP;
-
-	pkt_len = ntohs(hdr->payload_len);
-
-	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
-		if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
-			return NF_DROP;
-		if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
-			return NF_DROP;
-	}
-	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+	if (br_validate_ipv6(skb))
+		/* Drop invalid packet */
 		return NF_DROP;
 
 	nf_bridge_put(skb->nf_bridge);
@@ -763,35 +796,41 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
 	return NF_STOLEN;
 }
 
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
-	int ret;
-	int frag_max_size;
-
 	/* This is wrong! We should preserve the original fragment
 	 * boundaries by preserving frag_list rather than refragmenting.
 	 */
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	if (skb->protocol == htons(ETH_P_IP) &&
 	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
 	    !skb_is_gso(skb)) {
-		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
 		if (br_parse_ip_options(skb))
 			/* Drop invalid packet */
 			return NF_DROP;
-		IPCB(skb)->frag_max_size = frag_max_size;
-		ret = ip_fragment(skb, br_dev_queue_push_xmit);
-	} else
-		ret = br_dev_queue_push_xmit(skb);
+		IPCB(skb)->frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+		return ip_fragment(skb, br_dev_queue_push_xmit);
+	}
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6) &&
+	    skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu &&
+	    !skb_is_gso(skb)) {
+		const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+
+		if (br_validate_ipv6(skb))
+			/* Drop invalid packet */
+			return NF_DROP;
+		IP6CB(skb)->frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
 
-	return ret;
-}
-#else
-static int br_nf_dev_queue_xmit(struct sk_buff *skb)
-{
-        return br_dev_queue_push_xmit(skb);
-}
+		if (v6ops)
+			return v6ops->fragment(skb, br_dev_queue_push_xmit);
+		else
+			return -EMSGSIZE;
+	}
 #endif
+	return br_dev_queue_push_xmit(skb);
+}
 
 /* PF_BRIDGE/POST_ROUTING ********************************************/
 static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index e8e3f36..bc65e7a 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -18,6 +18,7 @@
 #include <linux/netpoll.h>
 #include <linux/u64_stats_sync.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 #include <linux/if_vlan.h>
 
 #define BR_HASH_BITS 8
@@ -214,7 +215,10 @@ struct net_bridge
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	struct rtable 			fake_rtable;
+	union {
+		struct rtable		fake_rtable;
+		struct rt6_info		fake_rt6_info;
+	};
 	bool				nf_call_iptables;
 	bool				nf_call_ip6tables;
 	bool				nf_call_arptables;
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index ce69a12..30ff0dc 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -830,6 +830,7 @@ fail:
 	kfree_skb(skb);
 	return err;
 }
+EXPORT_SYMBOL(ip6_fragment);
 
 static inline int ip6_rt_check(const struct rt6key *rt_key,
 			       const struct in6_addr *fl_addr,
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 398377a..3080815 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -191,6 +191,7 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
 
 static const struct nf_ipv6_ops ipv6ops = {
 	.chk_addr	= ipv6_chk_addr,
+	.fragment	= ip6_fragment,
 };
 
 static const struct nf_afinfo nf_ip6_afinfo = {
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

* [PATCH 2/4] bridge: forward IPv6 fragmented packets when passing netfilter
  2015-01-30 17:25               ` Pablo Neira Ayuso
@ 2015-03-18 21:53                 ` Bernhard Thaler
  0 siblings, 0 replies; 10+ messages in thread
From: Bernhard Thaler @ 2015-03-18 21:53 UTC (permalink / raw)
  To: pablo, kadlec; +Cc: netfilter-devel, fw, Bernhard Thaler

IPv6 fragmented packets are not forwarded on an ethernet bridge
with netfilter ip6_tables loaded. e.g. steps to reproduce

1) create a simple bridge like this

        modprobe br_netfilter
        brctl addbr br0
        brctl addif br0 eth0
        brctl addif br0 eth2
        ifconfig eth0 up
        ifconfig eth2 up
        ifconfig br0 up

2) place a host with an IPv6 address on each side of the bridge

        set IPv6 address on host A:
        ip -6 addr add fd01:2345:6789:1::1/64 dev eth0

        set IPv6 address on host B:
        ip -6 addr add fd01:2345:6789:1::2/64 dev eth0

3) run a simple ping command on host A with packets > MTU

        ping6 -s 4000 fd01:2345:6789:1::2

4) wait some time and run e.g. "ip6tables -t nat -nvL" on the bridge

IPv6 fragmented packets traverse the bridge cleanly until "ip6tables -t nat -nvL"
is run. As soon as it is run (and netfilter modules are loaded) IPv6 fragmented
packets do not traverse the bridge any more (you see no more responses in ping's
output).

After applying this patch IPv6 fragmented packets traverse the bridge cleanly
in above scenario.

Signed-off-by: Bernhard Thaler <bernhard.thaler@wvnet.at>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
Call ip6_fragment() indirectly via nf_ipv6_ops from br_nf_dev_queue_xmit().
This solution was suggested by Florian Westphal in order not to have a direct
dependency on ipv6.ko.

Introduce br_validate_ipv6() to do some validity checks on the IPv6 packet
in br_nf_pre_routing_finish_ipv6() and br_nf_dev_queue_xmit().
Factor checks in br_nf_pre_routing_finish_ipv6() into br_parse_validate_ipv6()
as suggested by Florian Westphal.

Change function order to have check_hbh_len() before br_validate_ipv6() to
call it from there.

ip6_fragment() yields a NULL pointer dereference when handling packets coming
from br_nf_dev_queue_xmit().  IP6_INC_STATS(), ip6_dst_idev(skb_dst(skb)) did
crash the kernel like this:

BUG: unable to handle kernel NULL pointer dereference at 000000000000037a
IP: [<ffffffff814cba2a>] ip6_fragment+0x99a/0x1290
PGD 3bc3f067 PUD 3bc12067 PMD 0
Oops: 0000 [#1] SMP
...

As suggested by Pablo Neira Ayuso use union in net_bridge to store either
fake_rtable or fake_rt6_info. This solves NULL pointer dereference above.

After applying this patch, in the same setup as stated above fragmented IPv6
packets traverse the bridge even after running "ip6tables -t nat -nvL" / net-
filter modules loaded. This was tested using overlong IPv6 ICMP ping packets
as described above.
Some tests were performed crafting invalid headers (e.g. ipv6 version field
set to 5) to test checks in br_validate_ipv6(), however these packets do
not seem to reach changed code parts and seem to be dropped at an earlier
stage (more testing needed with invalid / fuzzed packets).

Changes since v4:
* rebase to davem's current net-next
* remove EXPORT_SYMBOL() for ip6_fragment() 
* simplify conditionals within br_nf_dev_queue_xmit() 

 include/linux/netfilter_ipv6.h |    1 +
 net/bridge/br_netfilter.c      |  234 +++++++++++++++++++++++-----------------
 net/bridge/br_private.h        |    6 +-
 net/ipv6/netfilter.c           |    3 +-
 4 files changed, 145 insertions(+), 99 deletions(-)

diff --git a/include/linux/netfilter_ipv6.h b/include/linux/netfilter_ipv6.h
index e2d1969..7755d5c 100644
--- a/include/linux/netfilter_ipv6.h
+++ b/include/linux/netfilter_ipv6.h
@@ -26,6 +26,7 @@ struct nf_ipv6_ops {
 	int (*chk_addr)(struct net *net, const struct in6_addr *addr,
 			const struct net_device *dev, int strict);
 	void (*route_input)(struct sk_buff *skb);
+	int (*fragment)(struct sk_buff *skb, int (*output)(struct sk_buff *));
 };
 
 extern const struct nf_ipv6_ops __rcu *nf_ipv6_ops;
diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c
index 775d638..0e129fb 100644
--- a/net/bridge/br_netfilter.c
+++ b/net/bridge/br_netfilter.c
@@ -34,6 +34,7 @@
 
 #include <net/ip.h>
 #include <net/ipv6.h>
+#include <net/addrconf.h>
 #include <net/route.h>
 #include <net/netfilter/br_netfilter.h>
 
@@ -245,6 +246,109 @@ drop:
 	return -1;
 }
 
+/* We only check the length. A bridge shouldn't do hop-by-hop stuff anyway */
+static int check_hbh_len(struct sk_buff *skb)
+{
+	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
+	u32 pkt_len;
+	const unsigned char *nh = skb_network_header(skb);
+	int off = raw - nh;
+	int len = (raw[1] + 1) << 3;
+
+	if ((raw + len) - skb->data > skb_headlen(skb))
+		goto bad;
+
+	off += 2;
+	len -= 2;
+
+	while (len > 0) {
+		int optlen = nh[off + 1] + 2;
+
+		switch (nh[off]) {
+		case IPV6_TLV_PAD1:
+			optlen = 1;
+			break;
+
+		case IPV6_TLV_PADN:
+			break;
+
+		case IPV6_TLV_JUMBO:
+			if (nh[off + 1] != 4 || (off & 3) != 2)
+				goto bad;
+			pkt_len = ntohl(*(__be32 *) (nh + off + 2));
+			if (pkt_len <= IPV6_MAXPLEN ||
+			    ipv6_hdr(skb)->payload_len)
+				goto bad;
+			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+				goto bad;
+			if (pskb_trim_rcsum(skb,
+					    pkt_len + sizeof(struct ipv6hdr)))
+				goto bad;
+			nh = skb_network_header(skb);
+			break;
+		default:
+			if (optlen > len)
+				goto bad;
+			break;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+	if (len == 0)
+		return 0;
+bad:
+	return -1;
+}
+
+/* Equivalent to br_parse_ip_options for IPv6 */
+static int br_validate_ipv6(struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = in6_dev_get(skb->dev);
+	u32 pkt_len;
+	u8 ip6h_len = sizeof(struct ipv6hdr);
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	if (skb->len < ip6h_len)
+		goto drop;
+
+	hdr = ipv6_hdr(skb);
+
+	if (hdr->version != 6)
+		goto inhdr_error;
+
+	pkt_len = ntohs(hdr->payload_len);
+
+	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+		if (pkt_len + ip6h_len > skb->len) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INTRUNCATEDPKTS);
+			goto drop;
+		}
+		if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INDISCARDS);
+			goto drop;
+		}
+	}
+	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+		goto drop;
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+	/* No IP options in IPv6 header; however it should be
+	 * checked if some next headers need special treatment
+	 */
+	return 0;
+
+inhdr_error:
+	IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
 static void nf_bridge_update_protocol(struct sk_buff *skb)
 {
 	if (skb->nf_bridge->mask & BRNF_8021Q)
@@ -342,6 +446,10 @@ static int br_nf_pre_routing_finish_ipv6(struct sk_buff *skb)
 	struct rtable *rt;
 	struct net_device *dev = skb->dev;
 	const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+	int frag_max_size;
+
+	frag_max_size = IP6CB(skb)->frag_max_size;
+	BR_INPUT_SKB_CB(skb)->frag_max_size = frag_max_size;
 
 	if (nf_bridge->mask & BRNF_PKT_TYPE) {
 		skb->pkt_type = PACKET_OTHERHOST;
@@ -543,92 +651,17 @@ static struct net_device *setup_pre_routing(struct sk_buff *skb)
 	return skb->dev;
 }
 
-/* We only check the length. A bridge shouldn't do any hop-by-hop stuff anyway */
-static int check_hbh_len(struct sk_buff *skb)
-{
-	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
-	u32 pkt_len;
-	const unsigned char *nh = skb_network_header(skb);
-	int off = raw - nh;
-	int len = (raw[1] + 1) << 3;
-
-	if ((raw + len) - skb->data > skb_headlen(skb))
-		goto bad;
-
-	off += 2;
-	len -= 2;
-
-	while (len > 0) {
-		int optlen = nh[off + 1] + 2;
-
-		switch (nh[off]) {
-		case IPV6_TLV_PAD1:
-			optlen = 1;
-			break;
-
-		case IPV6_TLV_PADN:
-			break;
-
-		case IPV6_TLV_JUMBO:
-			if (nh[off + 1] != 4 || (off & 3) != 2)
-				goto bad;
-			pkt_len = ntohl(*(__be32 *) (nh + off + 2));
-			if (pkt_len <= IPV6_MAXPLEN ||
-			    ipv6_hdr(skb)->payload_len)
-				goto bad;
-			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
-				goto bad;
-			if (pskb_trim_rcsum(skb,
-					    pkt_len + sizeof(struct ipv6hdr)))
-				goto bad;
-			nh = skb_network_header(skb);
-			break;
-		default:
-			if (optlen > len)
-				goto bad;
-			break;
-		}
-		off += optlen;
-		len -= optlen;
-	}
-	if (len == 0)
-		return 0;
-bad:
-	return -1;
-
-}
-
 /* Replicate the checks that IPv6 does on packet reception and pass the packet
- * to ip6tables, which doesn't support NAT, so things are fairly simple. */
+ * to ip6tables.
+ */
 static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
 					   struct sk_buff *skb,
 					   const struct net_device *in,
 					   const struct net_device *out,
 					   int (*okfn)(struct sk_buff *))
 {
-	const struct ipv6hdr *hdr;
-	u32 pkt_len;
-
-	if (skb->len < sizeof(struct ipv6hdr))
-		return NF_DROP;
-
-	if (!pskb_may_pull(skb, sizeof(struct ipv6hdr)))
-		return NF_DROP;
-
-	hdr = ipv6_hdr(skb);
-
-	if (hdr->version != 6)
-		return NF_DROP;
-
-	pkt_len = ntohs(hdr->payload_len);
-
-	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
-		if (pkt_len + sizeof(struct ipv6hdr) > skb->len)
-			return NF_DROP;
-		if (pskb_trim_rcsum(skb, pkt_len + sizeof(struct ipv6hdr)))
-			return NF_DROP;
-	}
-	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+	if (br_validate_ipv6(skb))
+		/* Drop invalid packet */
 		return NF_DROP;
 
 	nf_bridge_put(skb->nf_bridge);
@@ -839,7 +872,7 @@ static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
 	return NF_STOLEN;
 }
 
-#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4) || IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
 static bool nf_bridge_copy_header(struct sk_buff *skb)
 {
 	int err;
@@ -866,38 +899,45 @@ static int br_nf_push_frag_xmit(struct sk_buff *skb)
 
 	return br_dev_queue_push_xmit(skb);
 }
+#endif
 
 static int br_nf_dev_queue_xmit(struct sk_buff *skb)
 {
-	int ret;
-	int frag_max_size;
 	unsigned int mtu_reserved;
 
-	if (skb_is_gso(skb) || skb->protocol != htons(ETH_P_IP))
+	mtu_reserved = nf_bridge_mtu_reduction(skb);
+	if (skb_is_gso(skb) || skb->len + mtu_reserved <= skb->dev->mtu)
 		return br_dev_queue_push_xmit(skb);
 
-	mtu_reserved = nf_bridge_mtu_reduction(skb);
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
 	/* This is wrong! We should preserve the original fragment
 	 * boundaries by preserving frag_list rather than refragmenting.
 	 */
-	if (skb->len + mtu_reserved > skb->dev->mtu) {
-		frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+	if (skb->protocol == htons(ETH_P_IP)) {
 		if (br_parse_ip_options(skb))
 			/* Drop invalid packet */
 			return NF_DROP;
-		IPCB(skb)->frag_max_size = frag_max_size;
-		ret = ip_fragment(skb, br_nf_push_frag_xmit);
-	} else
-		ret = br_dev_queue_push_xmit(skb);
+		IPCB(skb)->frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+		return ip_fragment(skb, br_nf_push_frag_xmit);
+	}
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
 
-	return ret;
-}
-#else
-static int br_nf_dev_queue_xmit(struct sk_buff *skb)
-{
-        return br_dev_queue_push_xmit(skb);
-}
+		if (br_validate_ipv6(skb))
+			/* Drop invalid packet */
+			return NF_DROP;
+		IP6CB(skb)->frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
+
+		if (v6ops)
+			return v6ops->fragment(skb, br_nf_push_frag_xmit);
+		else
+			return -EMSGSIZE;
+	}
 #endif
+	return br_dev_queue_push_xmit(skb);
+}
 
 /* PF_BRIDGE/POST_ROUTING ********************************************/
 static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index b46fa0c..e464a0f 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -18,6 +18,7 @@
 #include <linux/netpoll.h>
 #include <linux/u64_stats_sync.h>
 #include <net/route.h>
+#include <net/ip6_fib.h>
 #include <linux/if_vlan.h>
 
 #define BR_HASH_BITS 8
@@ -214,7 +215,10 @@ struct net_bridge
 	spinlock_t			hash_lock;
 	struct hlist_head		hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
-	struct rtable 			fake_rtable;
+	union {
+		struct rtable		fake_rtable;
+		struct rt6_info		fake_rt6_info;
+	};
 	bool				nf_call_iptables;
 	bool				nf_call_ip6tables;
 	bool				nf_call_arptables;
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index 0cd8ec9..d8cdbe9 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -191,7 +191,8 @@ static __sum16 nf_ip6_checksum_partial(struct sk_buff *skb, unsigned int hook,
 
 static const struct nf_ipv6_ops ipv6ops = {
 	.chk_addr	= ipv6_chk_addr,
-	.route_input    = ip6_route_input
+	.route_input    = ip6_route_input,
+	.fragment	= ip6_fragment
 };
 
 static const struct nf_afinfo nf_ip6_afinfo = {
-- 
1.7.10.4


^ permalink raw reply related	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2015-03-18 21:59 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2015-01-19  0:43 [PATCH 1/1] bridge: forward IPv6 fragmented packets when passing netfilter Bernhard Thaler
2015-01-20 17:28 ` Pablo Neira Ayuso
2015-01-22 23:27   ` [PATCHv2 " Bernhard Thaler
2015-01-22 23:49     ` Florian Westphal
2015-01-27  1:22       ` [PATCHv3 " Bernhard Thaler
2015-01-27  9:39         ` Florian Westphal
2015-01-27 23:15           ` [PATCHv4 RFC " Bernhard Thaler
2015-01-30 17:17             ` Pablo Neira Ayuso
2015-01-30 17:25               ` Pablo Neira Ayuso
2015-03-18 21:53                 ` [PATCH 2/4] " Bernhard Thaler

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).