Netdev List
 help / color / mirror / Atom feed
* [IPV6 02/10]: Move nextheader offset to the IP6CB
From: Patrick McHardy @ 2006-01-07  1:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, netfilter-devel, Patrick McHardy
In-Reply-To: <20060107010855.8712.68786.sendpatchset@localhost.localdomain>

[IPV6]: Move nextheader offset to the IP6CB

Move nextheader offset to the IP6CB to make it possible to pass a
packet to ip6_input_finish multiple times and have it skip already
parsed headers. As a nice side effect this gets rid of the manual
hopopts skipping in ip6_input_finish.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 7398b0fba81fbae5139c7a23d27477424125e8b2
tree 872d80dd650b8c27492cc23b8b9e768ab10494da
parent b426272e4885bc398c9c01e42486669cd832706f
author Patrick McHardy <kaber@trash.net> Fri, 06 Jan 2006 21:17:42 +0100
committer Patrick McHardy <kaber@trash.net> Fri, 06 Jan 2006 21:17:42 +0100

 include/linux/ipv6.h    |    1 +
 include/net/protocol.h  |    2 +-
 include/net/xfrm.h      |    6 +++---
 net/dccp/ipv6.c         |    2 +-
 net/ipv6/exthdrs.c      |   19 ++++++++++++-------
 net/ipv6/icmp.c         |    4 ++--
 net/ipv6/ip6_input.c    |   21 ++++++---------------
 net/ipv6/ip6_tunnel.c   |    2 +-
 net/ipv6/reassembly.c   |   11 +++++------
 net/ipv6/tcp_ipv6.c     |    2 +-
 net/ipv6/udp.c          |    2 +-
 net/ipv6/xfrm6_input.c  |    8 ++++----
 net/ipv6/xfrm6_tunnel.c |    6 +++---
 net/sctp/ipv6.c         |    2 +-
 14 files changed, 42 insertions(+), 46 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 93bbed5..5cfc715 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -191,6 +191,7 @@ struct inet6_skb_parm {
 	__u16			srcrt;
 	__u16			dst1;
 	__u16			lastopt;
+	__u32			nhoff;
 };
 
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
diff --git a/include/net/protocol.h b/include/net/protocol.h
index 63f7db9..6dc5970 100644
--- a/include/net/protocol.h
+++ b/include/net/protocol.h
@@ -43,7 +43,7 @@ struct net_protocol {
 #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE)
 struct inet6_protocol 
 {
-	int	(*handler)(struct sk_buff **skb, unsigned int *nhoffp);
+	int	(*handler)(struct sk_buff **skb);
 
 	void	(*err_handler)(struct sk_buff *skb,
 			       struct inet6_skb_parm *opt,
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 07d7b50..297d09d 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -831,7 +831,7 @@ struct xfrm_tunnel {
 };
 
 struct xfrm6_tunnel {
-	int (*handler)(struct sk_buff **pskb, unsigned int *nhoffp);
+	int (*handler)(struct sk_buff **pskb);
 	void (*err_handler)(struct sk_buff *skb, struct inet6_skb_parm *opt,
 			    int type, int code, int offset, __u32 info);
 };
@@ -868,8 +868,8 @@ extern int xfrm4_rcv(struct sk_buff *skb
 extern int xfrm4_output(struct sk_buff *skb);
 extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler);
 extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler);
-extern int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi);
-extern int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp);
+extern int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi);
+extern int xfrm6_rcv(struct sk_buff **pskb);
 extern int xfrm6_tunnel_register(struct xfrm6_tunnel *handler);
 extern int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler);
 extern u32 xfrm6_tunnel_alloc_spi(xfrm_address_t *saddr);
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index c609dc7..e138bbc 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -1028,7 +1028,7 @@ discard:
 	return 0;
 }
 
-static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int dccp_v6_rcv(struct sk_buff **pskb)
 {
 	const struct dccp_hdr *dh;
 	struct sk_buff *skb = *pskb;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index 113374d..2a1e7e4 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -152,7 +152,7 @@ static struct tlvtype_proc tlvprocdestop
 	{-1,			NULL}
 };
 
-static int ipv6_destopt_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_destopt_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
@@ -169,7 +169,7 @@ static int ipv6_destopt_rcv(struct sk_bu
 
 	if (ip6_parse_tlv(tlvprocdestopt_lst, skb)) {
 		skb->h.raw += ((skb->h.raw[1]+1)<<3);
-		*nhoffp = opt->dst1;
+		opt->nhoff = opt->dst1;
 		return 1;
 	}
 
@@ -192,7 +192,7 @@ void __init ipv6_destopt_init(void)
   NONE header. No data in packet.
  ********************************/
 
-static int ipv6_nodata_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_nodata_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 
@@ -215,7 +215,7 @@ void __init ipv6_nodata_init(void)
   Routing header.
  ********************************/
 
-static int ipv6_rthdr_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_rthdr_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp;
 	struct inet6_skb_parm *opt = IP6CB(skb);
@@ -249,7 +249,7 @@ looped_back:
 		skb->h.raw += (hdr->hdrlen + 1) << 3;
 		opt->dst0 = opt->dst1;
 		opt->dst1 = 0;
-		*nhoffp = (&hdr->nexthdr) - skb->nh.raw;
+		opt->nhoff = (&hdr->nexthdr) - skb->nh.raw;
 		return 1;
 	}
 
@@ -487,9 +487,14 @@ static struct tlvtype_proc tlvprochopopt
 
 int ipv6_parse_hopopts(struct sk_buff *skb, int nhoff)
 {
-	IP6CB(skb)->hop = sizeof(struct ipv6hdr);
-	if (ip6_parse_tlv(tlvprochopopt_lst, skb))
+	struct inet6_skb_parm *opt = IP6CB(skb);
+
+	opt->hop = sizeof(struct ipv6hdr);
+	if (ip6_parse_tlv(tlvprochopopt_lst, skb)) {
+		skb->h.raw += (skb->h.raw[1]+1)<<3;
+		opt->nhoff = sizeof(struct ipv6hdr);
 		return sizeof(struct ipv6hdr);
+	}
 	return -1;
 }
 
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 6ec6a2b..53c81fc 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -79,7 +79,7 @@ DEFINE_SNMP_STAT(struct icmpv6_mib, icmp
 static DEFINE_PER_CPU(struct socket *, __icmpv6_socket) = NULL;
 #define icmpv6_socket	__get_cpu_var(__icmpv6_socket)
 
-static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp);
+static int icmpv6_rcv(struct sk_buff **pskb);
 
 static struct inet6_protocol icmpv6_protocol = {
 	.handler	=	icmpv6_rcv,
@@ -581,7 +581,7 @@ static void icmpv6_notify(struct sk_buff
  *	Handle icmp messages
  */
 
-static int icmpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int icmpv6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct net_device *dev = skb->dev;
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index a6026d2..13d7241 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -97,6 +97,9 @@ int ipv6_rcv(struct sk_buff *skb, struct
 	if (hdr->version != 6)
 		goto err;
 
+	skb->h.raw = (u8 *)(hdr + 1);
+	IP6CB(skb)->nhoff = offsetof(struct ipv6hdr, nexthdr);
+
 	pkt_len = ntohs(hdr->payload_len);
 
 	/* pkt_len may be zero if Jumbo payload option is present */
@@ -111,8 +114,7 @@ int ipv6_rcv(struct sk_buff *skb, struct
 	}
 
 	if (hdr->nexthdr == NEXTHDR_HOP) {
-		skb->h.raw = (u8*)(hdr+1);
-		if (ipv6_parse_hopopts(skb, offsetof(struct ipv6hdr, nexthdr)) < 0) {
+		if (ipv6_parse_hopopts(skb, IP6CB(skb)->nhoff) < 0) {
 			IP6_INC_STATS_BH(IPSTATS_MIB_INHDRERRORS);
 			return 0;
 		}
@@ -143,26 +145,15 @@ static inline int ip6_input_finish(struc
 	int nexthdr;
 	u8 hash;
 
-	skb->h.raw = skb->nh.raw + sizeof(struct ipv6hdr);
-
 	/*
 	 *	Parse extension headers
 	 */
 
-	nexthdr = skb->nh.ipv6h->nexthdr;
-	nhoff = offsetof(struct ipv6hdr, nexthdr);
-
-	/* Skip hop-by-hop options, they are already parsed. */
-	if (nexthdr == NEXTHDR_HOP) {
-		nhoff = sizeof(struct ipv6hdr);
-		nexthdr = skb->h.raw[0];
-		skb->h.raw += (skb->h.raw[1]+1)<<3;
-	}
-
 	rcu_read_lock();
 resubmit:
 	if (!pskb_pull(skb, skb->h.raw - skb->data))
 		goto discard;
+	nhoff = IP6CB(skb)->nhoff;
 	nexthdr = skb->nh.raw[nhoff];
 
 	raw_sk = sk_head(&raw_v6_htable[nexthdr & (MAX_INET_PROTOS - 1)]);
@@ -194,7 +185,7 @@ resubmit:
 		    !xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) 
 			goto discard;
 		
-		ret = ipprot->handler(&skb, &nhoff);
+		ret = ipprot->handler(&skb);
 		if (ret > 0)
 			goto resubmit;
 		else if (ret == 0)
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index e315d0f..f079621 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -510,7 +510,7 @@ static inline void ip6ip6_ecn_decapsulat
  **/
 
 static int 
-ip6ip6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+ip6ip6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct ipv6hdr *ipv6h;
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 5d316cb..15e1456 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -581,7 +581,6 @@ err:
  *	the last and the first frames arrived and all the bits are here.
  */
 static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff **skb_in,
-			  unsigned int *nhoffp,
 			  struct net_device *dev)
 {
 	struct sk_buff *fp, *head = fq->fragments;
@@ -654,6 +653,7 @@ static int ip6_frag_reasm(struct frag_qu
 	head->dev = dev;
 	skb_set_timestamp(head, &fq->stamp);
 	head->nh.ipv6h->payload_len = htons(payload_len);
+	IP6CB(head)->nhoff = nhoff;
 
 	*skb_in = head;
 
@@ -663,7 +663,6 @@ static int ip6_frag_reasm(struct frag_qu
 
 	IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
 	fq->fragments = NULL;
-	*nhoffp = nhoff;
 	return 1;
 
 out_oversize:
@@ -678,7 +677,7 @@ out_fail:
 	return -1;
 }
 
-static int ipv6_frag_rcv(struct sk_buff **skbp, unsigned int *nhoffp)
+static int ipv6_frag_rcv(struct sk_buff **skbp)
 {
 	struct sk_buff *skb = *skbp; 
 	struct net_device *dev = skb->dev;
@@ -710,7 +709,7 @@ static int ipv6_frag_rcv(struct sk_buff 
 		skb->h.raw += sizeof(struct frag_hdr);
 		IP6_INC_STATS_BH(IPSTATS_MIB_REASMOKS);
 
-		*nhoffp = (u8*)fhdr - skb->nh.raw;
+		IP6CB(skb)->nhoff = (u8*)fhdr - skb->nh.raw;
 		return 1;
 	}
 
@@ -722,11 +721,11 @@ static int ipv6_frag_rcv(struct sk_buff 
 
 		spin_lock(&fq->lock);
 
-		ip6_frag_queue(fq, skb, fhdr, *nhoffp);
+		ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff);
 
 		if (fq->last_in == (FIRST_IN|LAST_IN) &&
 		    fq->meat == fq->len)
-			ret = ip6_frag_reasm(fq, skbp, nhoffp, dev);
+			ret = ip6_frag_reasm(fq, skbp, dev);
 
 		spin_unlock(&fq->lock);
 		fq_put(fq, NULL);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2947bc5..a25f4e8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1153,7 +1153,7 @@ ipv6_pktoptions:
 	return 0;
 }
 
-static int tcp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int tcp_v6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct tcphdr *th;	
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index d8538dc..c476488 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -435,7 +435,7 @@ out:
 	read_unlock(&udp_hash_lock);
 }
 
-static int udpv6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int udpv6_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct sock *sk;
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 28c29d7..1079e47 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -26,7 +26,7 @@ static inline void ipip6_ecn_decapsulate
 		IP6_ECN_set_ce(inner_iph);
 }
 
-int xfrm6_rcv_spi(struct sk_buff **pskb, unsigned int *nhoffp, u32 spi)
+int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi)
 {
 	struct sk_buff *skb = *pskb;
 	int err;
@@ -38,7 +38,7 @@ int xfrm6_rcv_spi(struct sk_buff **pskb,
 	int nexthdr;
 	unsigned int nhoff;
 
-	nhoff = *nhoffp;
+	nhoff = IP6CB(skb)->nhoff;
 	nexthdr = skb->nh.raw[nhoff];
 
 	seq = 0;
@@ -144,7 +144,7 @@ drop:
 
 EXPORT_SYMBOL(xfrm6_rcv_spi);
 
-int xfrm6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+int xfrm6_rcv(struct sk_buff **pskb)
 {
-	return xfrm6_rcv_spi(pskb, nhoffp, 0);
+	return xfrm6_rcv_spi(pskb, 0);
 }
diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c
index fbef782..da09ff2 100644
--- a/net/ipv6/xfrm6_tunnel.c
+++ b/net/ipv6/xfrm6_tunnel.c
@@ -397,7 +397,7 @@ int xfrm6_tunnel_deregister(struct xfrm6
 
 EXPORT_SYMBOL(xfrm6_tunnel_deregister);
 
-static int xfrm6_tunnel_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int xfrm6_tunnel_rcv(struct sk_buff **pskb)
 {
 	struct sk_buff *skb = *pskb;
 	struct xfrm6_tunnel *handler = xfrm6_tunnel_handler;
@@ -405,11 +405,11 @@ static int xfrm6_tunnel_rcv(struct sk_bu
 	u32 spi;
 
 	/* device-like_ip6ip6_handler() */
-	if (handler && handler->handler(pskb, nhoffp) == 0)
+	if (handler && handler->handler(pskb) == 0)
 		return 0;
 
 	spi = xfrm6_tunnel_spi_lookup((xfrm_address_t *)&iph->saddr);
-	return xfrm6_rcv_spi(pskb, nhoffp, spi);
+	return xfrm6_rcv_spi(pskb, spi);
 }
 
 static void xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index 15c0516..04c7fab 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -905,7 +905,7 @@ static struct inet_protosw sctpv6_stream
 	.flags         = SCTP_PROTOSW_FLAG,
 };
 
-static int sctp6_rcv(struct sk_buff **pskb, unsigned int *nhoffp)
+static int sctp6_rcv(struct sk_buff **pskb)
 {
 	return sctp_rcv(*pskb) ? -1 : 0;
 }

^ permalink raw reply related

* [IPV4/6 03/10]: Netfilter IPsec input hooks
From: Patrick McHardy @ 2006-01-07  1:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, netfilter-devel, Patrick McHardy
In-Reply-To: <20060107010855.8712.68786.sendpatchset@localhost.localdomain>

[IPV4/6]: Netfilter IPsec input hooks

When the innermost transform uses transport mode the decapsulated packet
is not visible to netfilter. Pass the packet through the PRE_ROUTING and
LOCAL_IN hooks again before handing it to upper layer protocols to make
netfilter-visibility symetrical to the output path.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 0ab5982305cae5d8e4fb41d8bb849ee1808946fc
tree 6fce75eaee3b9b8d4f34470d8988906e93a22b19
parent 7398b0fba81fbae5139c7a23d27477424125e8b2
author Patrick McHardy <kaber@trash.net> Fri, 06 Jan 2006 21:47:45 +0100
committer Patrick McHardy <kaber@trash.net> Fri, 06 Jan 2006 21:47:45 +0100

 include/net/ipv6.h     |    2 ++
 net/ipv4/xfrm4_input.c |   31 +++++++++++++++++++++++++++++++
 net/ipv6/ip6_input.c   |    2 +-
 net/ipv6/xfrm6_input.c |   13 +++++++++++++
 4 files changed, 47 insertions(+), 1 deletions(-)

diff --git a/include/net/ipv6.h b/include/net/ipv6.h
index 860bbac..3b1d963 100644
--- a/include/net/ipv6.h
+++ b/include/net/ipv6.h
@@ -418,6 +418,8 @@ extern int			ipv6_rcv(struct sk_buff *sk
 					 struct packet_type *pt,
 					 struct net_device *orig_dev);
 
+extern int			ip6_rcv_finish(struct sk_buff *skb);
+
 /*
  *	upper-layer output functions
  */
diff --git a/net/ipv4/xfrm4_input.c b/net/ipv4/xfrm4_input.c
index 2d3849c..850d919 100644
--- a/net/ipv4/xfrm4_input.c
+++ b/net/ipv4/xfrm4_input.c
@@ -11,6 +11,8 @@
 
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
 #include <net/inet_ecn.h>
 #include <net/ip.h>
 #include <net/xfrm.h>
@@ -45,6 +47,23 @@ static int xfrm4_parse_spi(struct sk_buf
 	return xfrm_parse_spi(skb, nexthdr, spi, seq);
 }
 
+#ifdef CONFIG_NETFILTER
+static inline int xfrm4_rcv_encap_finish(struct sk_buff *skb)
+{
+	struct iphdr *iph = skb->nh.iph;
+
+	if (skb->dst == NULL) {
+		if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos,
+		                   skb->dev))
+			goto drop;
+	}
+	return dst_input(skb);
+drop:
+	kfree_skb(skb);
+	return NET_RX_DROP;
+}
+#endif
+
 int xfrm4_rcv_encap(struct sk_buff *skb, __u16 encap_type)
 {
 	int err;
@@ -137,6 +156,8 @@ int xfrm4_rcv_encap(struct sk_buff *skb,
 	memcpy(skb->sp->x+skb->sp->len, xfrm_vec, xfrm_nr*sizeof(struct sec_decap_state));
 	skb->sp->len += xfrm_nr;
 
+	nf_reset(skb);
+
 	if (decaps) {
 		if (!(skb->dev->flags&IFF_LOOPBACK)) {
 			dst_release(skb->dst);
@@ -145,7 +166,17 @@ int xfrm4_rcv_encap(struct sk_buff *skb,
 		netif_rx(skb);
 		return 0;
 	} else {
+#ifdef CONFIG_NETFILTER
+		__skb_push(skb, skb->data - skb->nh.raw);
+		skb->nh.iph->tot_len = htons(skb->len);
+		ip_send_check(skb->nh.iph);
+
+		NF_HOOK(PF_INET, NF_IP_PRE_ROUTING, skb, skb->dev, NULL,
+		        xfrm4_rcv_encap_finish);
+		return 0;
+#else
 		return -skb->nh.iph->protocol;
+#endif
 	}
 
 drop_unlock:
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 13d7241..29f7359 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -48,7 +48,7 @@
 
 
 
-static inline int ip6_rcv_finish( struct sk_buff *skb) 
+inline int ip6_rcv_finish( struct sk_buff *skb) 
 {
 	if (skb->dst == NULL)
 		ip6_route_input(skb);
diff --git a/net/ipv6/xfrm6_input.c b/net/ipv6/xfrm6_input.c
index 1079e47..1ca2da6 100644
--- a/net/ipv6/xfrm6_input.c
+++ b/net/ipv6/xfrm6_input.c
@@ -11,6 +11,8 @@
 
 #include <linux/module.h>
 #include <linux/string.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
 #include <net/dsfield.h>
 #include <net/inet_ecn.h>
 #include <net/ip.h>
@@ -121,6 +123,8 @@ int xfrm6_rcv_spi(struct sk_buff **pskb,
 	skb->sp->len += xfrm_nr;
 	skb->ip_summed = CHECKSUM_NONE;
 
+	nf_reset(skb);
+
 	if (decaps) {
 		if (!(skb->dev->flags&IFF_LOOPBACK)) {
 			dst_release(skb->dst);
@@ -129,7 +133,16 @@ int xfrm6_rcv_spi(struct sk_buff **pskb,
 		netif_rx(skb);
 		return -1;
 	} else {
+#ifdef CONFIG_NETFILTER
+		skb->nh.ipv6h->payload_len = htons(skb->len);
+		__skb_push(skb, skb->data - skb->nh.raw);
+
+		NF_HOOK(PF_INET6, NF_IP6_PRE_ROUTING, skb, skb->dev, NULL,
+		        ip6_rcv_finish);
+		return -1;
+#else
 		return 1;
+#endif
 	}
 
 drop_unlock:

^ permalink raw reply related

* [IPV4 04/10]: reset IPCB flags when neccessary
From: Patrick McHardy @ 2006-01-07  1:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, netfilter-devel, Patrick McHardy
In-Reply-To: <20060107010855.8712.68786.sendpatchset@localhost.localdomain>

[IPV4]: reset IPCB flags when neccessary

Reset IPSKB_XFRM_TUNNEL_SIZE flags in ipip and ip_gre hard_start_xmit
function before the packet reenters IP. This is neccessary so the
encapsulated packets are checked not to be oversized in xfrm4_output.c
again. Reset all flags in sit when a packet changes its address family.

Also remove some obsolete IPSKB flags.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 0c72987fae9d4f21d80910ccca2beea4f77e1dc5
tree 71aa056fbac413a11a4d29053595e73803656974
parent 0ab5982305cae5d8e4fb41d8bb849ee1808946fc
author Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 00:42:27 +0100
committer Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 00:42:27 +0100

 include/net/ip.h  |    8 +++-----
 net/ipv4/ip_gre.c |    1 +
 net/ipv4/ipip.c   |    1 +
 net/ipv6/sit.c    |    2 ++
 4 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/include/net/ip.h b/include/net/ip.h
index 7bb5804..52f4d9c 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -37,11 +37,9 @@ struct inet_skb_parm
 	struct ip_options	opt;		/* Compiled IP options		*/
 	unsigned char		flags;
 
-#define IPSKB_MASQUERADED	1
-#define IPSKB_TRANSLATED	2
-#define IPSKB_FORWARDED		4
-#define IPSKB_XFRM_TUNNEL_SIZE	8
-#define IPSKB_FRAG_COMPLETE	16
+#define IPSKB_FORWARDED		1
+#define IPSKB_XFRM_TUNNEL_SIZE	2
+#define IPSKB_FRAG_COMPLETE	4
 };
 
 struct ipcm_cookie
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 912c42f..65c3a91 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -832,6 +832,7 @@ static int ipgre_tunnel_xmit(struct sk_b
 	skb->h.raw = skb->nh.raw;
 	skb->nh.raw = skb_push(skb, gre_hlen);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE;
 	dst_release(skb->dst);
 	skb->dst = &rt->u.dst;
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 35571cf..078b59b 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -621,6 +621,7 @@ static int ipip_tunnel_xmit(struct sk_bu
 	skb->h.raw = skb->nh.raw;
 	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE;
 	dst_release(skb->dst);
 	skb->dst = &rt->u.dst;
 
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index 577d497..02872ae 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -381,6 +381,7 @@ static int ipip6_rcv(struct sk_buff *skb
 		skb->mac.raw = skb->nh.raw;
 		skb->nh.raw = skb->data;
 		memset(&(IPCB(skb)->opt), 0, sizeof(struct ip_options));
+		IPCB(skb)->flags = 0;
 		skb->protocol = htons(ETH_P_IPV6);
 		skb->pkt_type = PACKET_HOST;
 		tunnel->stat.rx_packets++;
@@ -552,6 +553,7 @@ static int ipip6_tunnel_xmit(struct sk_b
 	skb->h.raw = skb->nh.raw;
 	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+	IPCB(skb)->flags = 0;
 	dst_release(skb->dst);
 	skb->dst = &rt->u.dst;
 

^ permalink raw reply related

* [NETFILTER 05/10]: Fix xfrm lookup in ip_route_me_harder/ip6_route_me_harder
From: Patrick McHardy @ 2006-01-07  1:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, netfilter-devel, Patrick McHardy
In-Reply-To: <20060107010855.8712.68786.sendpatchset@localhost.localdomain>

[NETFILTER]: Fix xfrm lookup in ip_route_me_harder/ip6_route_me_harder

ip_route_me_harder doesn't use the port numbers of the xfrm lookup and
uses ip_route_input for non-local addresses which doesn't do a xfrm
lookup, ip6_route_me_harder doesn't do a xfrm lookup at all.

Use xfrm_decode_session and do the lookup manually, make sure both
only do the lookup if the packet hasn't been transformed already.

Makeing sure the lookup only happens once needs a new field in the
IP6CB, which exceeds the size of skb->cb. The size of skb->cb is
increased to 48b. Apparently the IPv6 mobile extensions need some
more room anyway.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit d923d6ec05f49c2b746a7e8c6d0a71f3bcc35b4e
tree 7005afd82b323b88784953b776f9d95671d4b9f3
parent 0c72987fae9d4f21d80910ccca2beea4f77e1dc5
author Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 00:57:41 +0100
committer Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 00:57:41 +0100

 include/linux/ipv6.h    |    3 +++
 include/linux/skbuff.h  |    2 +-
 include/net/ip.h        |    3 ++-
 include/net/xfrm.h      |    2 +-
 net/ipv4/ip_gre.c       |    2 +-
 net/ipv4/ipip.c         |    2 +-
 net/ipv4/netfilter.c    |   12 ++++++++++--
 net/ipv4/xfrm4_output.c |    1 +
 net/ipv6/netfilter.c    |    9 ++++++++-
 net/ipv6/xfrm6_output.c |    1 +
 net/xfrm/xfrm_policy.c  |    9 +++++----
 11 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h
index 5cfc715..9c8f4c9 100644
--- a/include/linux/ipv6.h
+++ b/include/linux/ipv6.h
@@ -192,6 +192,9 @@ struct inet6_skb_parm {
 	__u16			dst1;
 	__u16			lastopt;
 	__u32			nhoff;
+	__u16			flags;
+
+#define IP6SKB_XFRM_TRANSFORMED	1
 };
 
 #define IP6CB(skb)	((struct inet6_skb_parm*)((skb)->cb))
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 483cfc4..e5fd66c 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -251,7 +251,7 @@ struct sk_buff {
 	 * want to keep them across layers you have to do a skb_clone()
 	 * first. This is owned by whoever has the skb queued ATM.
 	 */
-	char			cb[40];
+	char			cb[48];
 
 	unsigned int		len,
 				data_len,
diff --git a/include/net/ip.h b/include/net/ip.h
index 52f4d9c..a494d04 100644
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -39,7 +39,8 @@ struct inet_skb_parm
 
 #define IPSKB_FORWARDED		1
 #define IPSKB_XFRM_TUNNEL_SIZE	2
-#define IPSKB_FRAG_COMPLETE	4
+#define IPSKB_XFRM_TRANSFORMED	4
+#define IPSKB_FRAG_COMPLETE	8
 };
 
 struct ipcm_cookie
diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index 297d09d..d6111a2 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -668,7 +668,7 @@ static inline int xfrm6_policy_check(str
 	return xfrm_policy_check(sk, dir, skb, AF_INET6);
 }
 
-
+extern int xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family);
 extern int __xfrm_route_forward(struct sk_buff *skb, unsigned short family);
 
 static inline int xfrm_route_forward(struct sk_buff *skb, unsigned short family)
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 65c3a91..de16e94 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -832,7 +832,7 @@ static int ipgre_tunnel_xmit(struct sk_b
 	skb->h.raw = skb->nh.raw;
 	skb->nh.raw = skb_push(skb, gre_hlen);
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-	IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE;
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED);
 	dst_release(skb->dst);
 	skb->dst = &rt->u.dst;
 
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 078b59b..bbd85f5 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -621,7 +621,7 @@ static int ipip_tunnel_xmit(struct sk_bu
 	skb->h.raw = skb->nh.raw;
 	skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
 	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
-	IPCB(skb)->flags &= ~IPSKB_XFRM_TUNNEL_SIZE;
+	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE|IPSKB_XFRM_TRANSFORMED);
 	dst_release(skb->dst);
 	skb->dst = &rt->u.dst;
 
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index ae0779d..4c637a1 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -7,11 +7,13 @@
 #include <linux/netfilter.h>
 #include <linux/netfilter_ipv4.h>
 
+#include <linux/ip.h>
 #include <linux/tcp.h>
 #include <linux/udp.h>
 #include <linux/icmp.h>
 #include <net/route.h>
-#include <linux/ip.h>
+#include <net/xfrm.h>
+#include <net/ip.h>
 
 /* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
 int ip_route_me_harder(struct sk_buff **pskb)
@@ -33,7 +35,6 @@ int ip_route_me_harder(struct sk_buff **
 #ifdef CONFIG_IP_ROUTE_FWMARK
 		fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
 #endif
-		fl.proto = iph->protocol;
 		if (ip_route_output_key(&rt, &fl) != 0)
 			return -1;
 
@@ -60,6 +61,13 @@ int ip_route_me_harder(struct sk_buff **
 	if ((*pskb)->dst->error)
 		return -1;
 
+#ifdef CONFIG_XFRM
+	if (!(IPCB(*pskb)->flags & IPSKB_XFRM_TRANSFORMED) &&
+	    xfrm_decode_session(*pskb, &fl, AF_INET) == 0)
+		if (xfrm_lookup(&(*pskb)->dst, &fl, (*pskb)->sk, 0))
+			return -1;
+#endif
+
 	/* Change in oif may mean change in hh_len. */
 	hh_len = (*pskb)->dst->dev->hard_header_len;
 	if (skb_headroom(*pskb) < hh_len) {
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 51fabb8..160c488 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -140,6 +140,7 @@ static int xfrm4_output_one(struct sk_bu
 		x = dst->xfrm;
 	} while (x && !x->props.mode);
 
+	IPCB(skb)->flags |= IPSKB_XFRM_TRANSFORMED;
 	err = 0;
 
 out_exit:
diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c
index f8626eb..b636783 100644
--- a/net/ipv6/netfilter.c
+++ b/net/ipv6/netfilter.c
@@ -10,6 +10,7 @@
 #include <net/dst.h>
 #include <net/ipv6.h>
 #include <net/ip6_route.h>
+#include <net/xfrm.h>
 
 int ip6_route_me_harder(struct sk_buff *skb)
 {
@@ -21,11 +22,17 @@ int ip6_route_me_harder(struct sk_buff *
 		{ .ip6_u =
 		  { .daddr = iph->daddr,
 		    .saddr = iph->saddr, } },
-		.proto = iph->nexthdr,
 	};
 
 	dst = ip6_route_output(skb->sk, &fl);
 
+#ifdef CONFIG_XFRM
+	if (!(IP6CB(skb)->flags & IP6SKB_XFRM_TRANSFORMED) &&
+	    xfrm_decode_session(skb, &fl, AF_INET6) == 0)
+		if (xfrm_lookup(&skb->dst, &fl, skb->sk, 0))
+			return -1;
+#endif
+
 	if (dst->error) {
 		IP6_INC_STATS(IPSTATS_MIB_OUTNOROUTES);
 		LIMIT_NETDEBUG(KERN_DEBUG "ip6_route_me_harder: No more route.\n");
diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c
index fc0ea38..8024217 100644
--- a/net/ipv6/xfrm6_output.c
+++ b/net/ipv6/xfrm6_output.c
@@ -139,6 +139,7 @@ static int xfrm6_output_one(struct sk_bu
 		x = dst->xfrm;
 	} while (x && !x->props.mode);
 
+	IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED;
 	err = 0;
 
 out_exit:
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 64a4473..f2edc92 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -951,8 +951,8 @@ xfrm_policy_ok(struct xfrm_tmpl *tmpl, s
 	return start;
 }
 
-static int
-_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
+int
+xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned short family)
 {
 	struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family);
 
@@ -963,6 +963,7 @@ _decode_session(struct sk_buff *skb, str
 	xfrm_policy_put_afinfo(afinfo);
 	return 0;
 }
+EXPORT_SYMBOL(xfrm_decode_session);
 
 static inline int secpath_has_tunnel(struct sec_path *sp, int k)
 {
@@ -982,7 +983,7 @@ int __xfrm_policy_check(struct sock *sk,
 	u8 fl_dir = policy_to_flow_dir(dir);
 	u32 sk_sid;
 
-	if (_decode_session(skb, &fl, family) < 0)
+	if (xfrm_decode_session(skb, &fl, family) < 0)
 		return 0;
 
 	sk_sid = security_sk_sid(sk, &fl, fl_dir);
@@ -1055,7 +1056,7 @@ int __xfrm_route_forward(struct sk_buff 
 {
 	struct flowi fl;
 
-	if (_decode_session(skb, &fl, family) < 0)
+	if (xfrm_decode_session(skb, &fl, family) < 0)
 		return 0;
 
 	return xfrm_lookup(&skb->dst, &fl, NULL, 0) == 0;

^ permalink raw reply related

* [NETFILTER 06/10]: Use conntrack information to determine if packet was NATed
From: Patrick McHardy @ 2006-01-07  1:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, netfilter-devel, Patrick McHardy
In-Reply-To: <20060107010855.8712.68786.sendpatchset@localhost.localdomain>

[NETFILTER]: Use conntrack information to determine if packet was NATed

Preparation for IPsec support for NAT:
Use conntrack information instead of saving the saving and comparing the
addresses to determine if a packet was NATed and needs to be rerouted to
make it easier to extend the key.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 3ba04601534a6f050fb695c92ddc62d403078cc6
tree 01e3981c9fae596a30f0ce815f9695f71417724b
parent d923d6ec05f49c2b746a7e8c6d0a71f3bcc35b4e
author Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 01:02:59 +0100
committer Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 01:02:59 +0100

 net/ipv4/netfilter/ip_nat_standalone.c |   34 ++++++++++++++++++--------------
 1 files changed, 19 insertions(+), 15 deletions(-)

diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index f04111f..1bb5089 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -162,18 +162,20 @@ ip_nat_in(unsigned int hooknum,
           const struct net_device *out,
           int (*okfn)(struct sk_buff *))
 {
-	u_int32_t saddr, daddr;
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
 	unsigned int ret;
 
-	saddr = (*pskb)->nh.iph->saddr;
-	daddr = (*pskb)->nh.iph->daddr;
-
 	ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN
-	    && ((*pskb)->nh.iph->saddr != saddr
-	        || (*pskb)->nh.iph->daddr != daddr)) {
-		dst_release((*pskb)->dst);
-		(*pskb)->dst = NULL;
+	    && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.src.ip !=
+		    ct->tuplehash[!dir].tuple.dst.ip) {
+			dst_release((*pskb)->dst);
+			(*pskb)->dst = NULL;
+		}
 	}
 	return ret;
 }
@@ -200,7 +202,8 @@ ip_nat_local_fn(unsigned int hooknum,
 		const struct net_device *out,
 		int (*okfn)(struct sk_buff *))
 {
-	u_int32_t saddr, daddr;
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
 	unsigned int ret;
 
 	/* root is playing with raw sockets. */
@@ -208,14 +211,15 @@ ip_nat_local_fn(unsigned int hooknum,
 	    || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
-	saddr = (*pskb)->nh.iph->saddr;
-	daddr = (*pskb)->nh.iph->daddr;
-
 	ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
 	if (ret != NF_DROP && ret != NF_STOLEN
-	    && ((*pskb)->nh.iph->saddr != saddr
-		|| (*pskb)->nh.iph->daddr != daddr))
-		return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
+	    && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.dst.ip !=
+		    ct->tuplehash[!dir].tuple.src.ip)
+			return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
+	}
 	return ret;
 }
 

^ permalink raw reply related

* [NETFILTER 07/10]: Redo policy lookups after NAT when neccessary
From: Patrick McHardy @ 2006-01-07  1:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, netfilter-devel, Patrick McHardy
In-Reply-To: <20060107010855.8712.68786.sendpatchset@localhost.localdomain>

[NETFILTER]: Redo policy lookups after NAT when neccessary

When NAT changes the key used for the xfrm lookup it needs to be done
again. If a new policy is returned in POST_ROUTING the packet needs
to be passed to xfrm4_output_one manually after all hooks were called
because POST_ROUTING is called with fixed okfn (ip_finish_output).

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 00b264ce82ccc3216e3e4b0175dd1337eeb16f24
tree e774053af87fff87b29e652c6a90588c30695224
parent 3ba04601534a6f050fb695c92ddc62d403078cc6
author Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 01:18:04 +0100
committer Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 01:18:04 +0100

 include/net/xfrm.h                     |    1 +
 net/ipv4/ip_output.c                   |    5 +++++
 net/ipv4/netfilter/ip_nat_standalone.c |   27 +++++++++++++++++++++++++--
 net/ipv4/xfrm4_output.c                |    2 +-
 4 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/include/net/xfrm.h b/include/net/xfrm.h
index d6111a2..d09ca0e 100644
--- a/include/net/xfrm.h
+++ b/include/net/xfrm.h
@@ -866,6 +866,7 @@ extern int xfrm_state_mtu(struct xfrm_st
 extern int xfrm_init_state(struct xfrm_state *x);
 extern int xfrm4_rcv(struct sk_buff *skb);
 extern int xfrm4_output(struct sk_buff *skb);
+extern int xfrm4_output_finish(struct sk_buff *skb);
 extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler);
 extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler);
 extern int xfrm6_rcv_spi(struct sk_buff **pskb, u32 spi);
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 71da318..5b42dbe 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -202,6 +202,11 @@ static inline int ip_finish_output2(stru
 
 static inline int ip_finish_output(struct sk_buff *skb)
 {
+#if defined(CONFIG_NETFILTER) && defined(CONFIG_XFRM)
+	/* Policy lookup after SNAT yielded a new policy */
+	if (skb->dst->xfrm != NULL)
+		return xfrm4_output_finish(skb);
+#endif
 	if (skb->len > dst_mtu(skb->dst) &&
 	    !(skb_shinfo(skb)->ufo_size || skb_shinfo(skb)->tso_size))
 		return ip_fragment(skb, ip_finish_output2);
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index 1bb5089..b518697 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -187,12 +187,30 @@ ip_nat_out(unsigned int hooknum,
 	   const struct net_device *out,
 	   int (*okfn)(struct sk_buff *))
 {
+	struct ip_conntrack *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int ret;
+
 	/* root is playing with raw sockets. */
 	if ((*pskb)->len < sizeof(struct iphdr)
 	    || (*pskb)->nh.iph->ihl * 4 < sizeof(struct iphdr))
 		return NF_ACCEPT;
 
-	return ip_nat_fn(hooknum, pskb, in, out, okfn);
+	ret = ip_nat_fn(hooknum, pskb, in, out, okfn);
+	if (ret != NF_DROP && ret != NF_STOLEN
+	    && (ct = ip_conntrack_get(*pskb, &ctinfo)) != NULL) {
+		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
+
+		if (ct->tuplehash[dir].tuple.src.ip !=
+		    ct->tuplehash[!dir].tuple.dst.ip
+#ifdef CONFIG_XFRM
+		    || ct->tuplehash[dir].tuple.src.u.all !=
+		       ct->tuplehash[!dir].tuple.dst.u.all
+#endif
+		    )
+			return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
+	}
+	return ret;
 }
 
 static unsigned int
@@ -217,7 +235,12 @@ ip_nat_local_fn(unsigned int hooknum,
 		enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
 
 		if (ct->tuplehash[dir].tuple.dst.ip !=
-		    ct->tuplehash[!dir].tuple.src.ip)
+		    ct->tuplehash[!dir].tuple.src.ip
+#ifdef CONFIG_XFRM
+		    || ct->tuplehash[dir].tuple.dst.u.all !=
+		       ct->tuplehash[dir].tuple.src.u.all
+#endif
+		    )
 			return ip_route_me_harder(pskb) == 0 ? ret : NF_DROP;
 	}
 	return ret;
diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c
index 160c488..d4df0dd 100644
--- a/net/ipv4/xfrm4_output.c
+++ b/net/ipv4/xfrm4_output.c
@@ -152,7 +152,7 @@ error_nolock:
 	goto out_exit;
 }
 
-static int xfrm4_output_finish(struct sk_buff *skb)
+int xfrm4_output_finish(struct sk_buff *skb)
 {
 	int err;
 

^ permalink raw reply related

* [NETFILTER 08/10]: Keep conntrack reference until IPsec policy checks are done
From: Patrick McHardy @ 2006-01-07  1:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, netfilter-devel, Patrick McHardy
In-Reply-To: <20060107010855.8712.68786.sendpatchset@localhost.localdomain>

[NETFILTER]: Keep conntrack reference until IPsec policy checks are done

Keep the conntrack reference until policy checks have been performed for
IPsec NAT support. The reference needs to be dropped before a packet is
queued to avoid having the conntrack module unloadable.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 7573dacaa96b4bd8ddc1be7139d17f7c95010413
tree 63a51c62e2d7a4730df5339a0f7d22f1121c8750
parent 00b264ce82ccc3216e3e4b0175dd1337eeb16f24
author Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 01:22:09 +0100
committer Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 01:22:09 +0100

 net/dccp/ipv4.c     |    1 +
 net/ipv4/ip_input.c |   15 ++++++---------
 net/ipv4/raw.c      |    1 +
 net/ipv4/tcp_ipv4.c |    1 +
 net/ipv4/udp.c      |    2 ++
 net/sctp/input.c    |    1 +
 6 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 3f24467..23ba177 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -1099,6 +1099,7 @@ int dccp_v4_destroy_sock(struct sock *sk
 		kfree_skb(sk->sk_send_head);
 		sk->sk_send_head = NULL;
 	}
+	nf_reset(skb);
 
 	/* Clean up a referenced DCCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash != NULL)
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index e45846a..18d7fad 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -185,7 +185,6 @@ int ip_call_ra_chain(struct sk_buff *skb
 					raw_rcv(last, skb2);
 			}
 			last = sk;
-			nf_reset(skb);
 		}
 	}
 
@@ -204,10 +203,6 @@ static inline int ip_local_deliver_finis
 
 	__skb_pull(skb, ihl);
 
-	/* Free reference early: we don't need it any more, and it may
-           hold ip_conntrack module loaded indefinitely. */
-	nf_reset(skb);
-
         /* Point into the IP datagram, just past the header. */
         skb->h.raw = skb->data;
 
@@ -232,10 +227,12 @@ static inline int ip_local_deliver_finis
 		if ((ipprot = rcu_dereference(inet_protos[hash])) != NULL) {
 			int ret;
 
-			if (!ipprot->no_policy &&
-			    !xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-				kfree_skb(skb);
-				goto out;
+			if (!ipprot->no_policy) {
+				if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
+					kfree_skb(skb);
+					goto out;
+				}
+				nf_reset(skb);
 			}
 			ret = ipprot->handler(skb);
 			if (ret < 0) {
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 4b0d7e4..165a4d8 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -255,6 +255,7 @@ int raw_rcv(struct sock *sk, struct sk_b
 		kfree_skb(skb);
 		return NET_RX_DROP;
 	}
+	nf_reset(skb);
 
 	skb_push(skb, skb->data - skb->nh.raw);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index e9f83e5..6ea3539 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1080,6 +1080,7 @@ process:
 
 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_and_relse;
+	nf_reset(skb);
 
 	if (sk_filter(sk, skb, 0))
 		goto discard_and_relse;
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 223abaa..0084047 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -989,6 +989,7 @@ static int udp_queue_rcv_skb(struct sock
 		kfree_skb(skb);
 		return -1;
 	}
+	nf_reset(skb);
 
 	if (up->encap_type) {
 		/*
@@ -1149,6 +1150,7 @@ int udp_rcv(struct sk_buff *skb)
 
 	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
 		goto drop;
+	nf_reset(skb);
 
 	/* No socket. Drop packet silently, if checksum is wrong */
 	if (udp_checksum_complete(skb))
diff --git a/net/sctp/input.c b/net/sctp/input.c
index 238f1bf..4aa6fc6 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -225,6 +225,7 @@ int sctp_rcv(struct sk_buff *skb)
 
 	if (!xfrm_policy_check(sk, XFRM_POLICY_IN, skb, family))
 		goto discard_release;
+	nf_reset(skb);
 
 	ret = sk_filter(sk, skb, 1);
 	if (ret)

^ permalink raw reply related

* [NETFILTER 09/10]: Handle NAT in IPsec policy checks
From: Patrick McHardy @ 2006-01-07  1:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, netfilter-devel, Patrick McHardy
In-Reply-To: <20060107010855.8712.68786.sendpatchset@localhost.localdomain>

[NETFILTER]: Handle NAT in IPsec policy checks

Handle NAT of decapsulated IPsec packets by reconstructing the struct flowi
of the original packet from the conntrack information for IPsec policy
checks.

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit dcaa54019ab79d6db75572a239fdc0df0d8019be
tree cf231c2e385835e3c7e25772c86c480e3646fd84
parent 7573dacaa96b4bd8ddc1be7139d17f7c95010413
author Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 01:27:39 +0100
committer Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 01:27:39 +0100

 include/linux/netfilter.h              |   16 ++++++++++
 net/dccp/ipv4.c                        |    2 +
 net/ipv4/netfilter.c                   |    3 ++
 net/ipv4/netfilter/ip_nat_standalone.c |   50 +++++++++++++++++++++++++++++++-
 net/xfrm/xfrm_policy.c                 |    2 +
 5 files changed, 70 insertions(+), 3 deletions(-)

diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
index 79bb977..84506df 100644
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -274,6 +274,20 @@ struct nf_queue_rerouter {
 extern int nf_register_queue_rerouter(int pf, struct nf_queue_rerouter *rer);
 extern int nf_unregister_queue_rerouter(int pf);
 
+#include <net/flow.h>
+extern void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
+
+static inline void
+nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family)
+{
+#ifdef CONFIG_IP_NF_NAT_NEEDED
+	void (*decodefn)(struct sk_buff *, struct flowi *);
+
+	if (family == AF_INET && (decodefn = ip_nat_decode_session) != NULL)
+		decodefn(skb, fl);
+#endif
+}
+
 #ifdef CONFIG_PROC_FS
 #include <linux/proc_fs.h>
 extern struct proc_dir_entry *proc_net_netfilter;
@@ -282,6 +296,8 @@ extern struct proc_dir_entry *proc_net_n
 #else /* !CONFIG_NETFILTER */
 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
 static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
+static inline void
+nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, int family) {}
 #endif /*CONFIG_NETFILTER*/
 
 #endif /*__KERNEL__*/
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 23ba177..00f9832 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -986,6 +986,7 @@ int dccp_v4_rcv(struct sk_buff *skb)
 
 	if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
 		goto discard_and_relse;
+	nf_reset(skb);
 
 	return sk_receive_skb(sk, skb);
 
@@ -1099,7 +1100,6 @@ int dccp_v4_destroy_sock(struct sock *sk
 		kfree_skb(sk->sk_send_head);
 		sk->sk_send_head = NULL;
 	}
-	nf_reset(skb);
 
 	/* Clean up a referenced DCCP bind bucket. */
 	if (inet_csk(sk)->icsk_bind_hash != NULL)
diff --git a/net/ipv4/netfilter.c b/net/ipv4/netfilter.c
index 4c637a1..3321092 100644
--- a/net/ipv4/netfilter.c
+++ b/net/ipv4/netfilter.c
@@ -86,6 +86,9 @@ int ip_route_me_harder(struct sk_buff **
 }
 EXPORT_SYMBOL(ip_route_me_harder);
 
+void (*ip_nat_decode_session)(struct sk_buff *, struct flowi *);
+EXPORT_SYMBOL(ip_nat_decode_session);
+
 /*
  * Extra routing may needed on local out, as the QUEUE target never
  * returns control to the table.
diff --git a/net/ipv4/netfilter/ip_nat_standalone.c b/net/ipv4/netfilter/ip_nat_standalone.c
index b518697..8b8a1f0 100644
--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -55,6 +55,44 @@
 			         : ((hooknum) == NF_IP_LOCAL_IN ? "LOCAL_IN"  \
 				    : "*ERROR*")))
 
+#ifdef CONFIG_XFRM
+static void nat_decode_session(struct sk_buff *skb, struct flowi *fl)
+{
+	struct ip_conntrack *ct;
+	struct ip_conntrack_tuple *t;
+	enum ip_conntrack_info ctinfo;
+	enum ip_conntrack_dir dir;
+	unsigned long statusbit;
+
+	ct = ip_conntrack_get(skb, &ctinfo);
+	if (ct == NULL)
+		return;
+	dir = CTINFO2DIR(ctinfo);
+	t = &ct->tuplehash[dir].tuple;
+
+	if (dir == IP_CT_DIR_ORIGINAL)
+		statusbit = IPS_DST_NAT;
+	else
+		statusbit = IPS_SRC_NAT;
+
+	if (ct->status & statusbit) {
+		fl->fl4_dst = t->dst.ip;
+		if (t->dst.protonum == IPPROTO_TCP ||
+		    t->dst.protonum == IPPROTO_UDP)
+			fl->fl_ip_dport = t->dst.u.tcp.port;
+	}
+
+	statusbit ^= IPS_NAT_MASK;
+
+	if (ct->status & statusbit) {
+		fl->fl4_src = t->src.ip;
+		if (t->dst.protonum == IPPROTO_TCP ||
+		    t->dst.protonum == IPPROTO_UDP)
+			fl->fl_ip_sport = t->src.u.tcp.port;
+	}
+}
+#endif
+		
 static unsigned int
 ip_nat_fn(unsigned int hooknum,
 	  struct sk_buff **pskb,
@@ -330,10 +368,14 @@ static int init_or_cleanup(int init)
 
 	if (!init) goto cleanup;
 
+#ifdef CONFIG_XFRM
+	BUG_ON(ip_nat_decode_session != NULL);
+	ip_nat_decode_session = nat_decode_session;
+#endif
 	ret = ip_nat_rule_init();
 	if (ret < 0) {
 		printk("ip_nat_init: can't setup rules.\n");
-		goto cleanup_nothing;
+		goto cleanup_decode_session;
 	}
 	ret = nf_register_hook(&ip_nat_in_ops);
 	if (ret < 0) {
@@ -381,7 +423,11 @@ static int init_or_cleanup(int init)
 	nf_unregister_hook(&ip_nat_in_ops);
  cleanup_rule_init:
 	ip_nat_rule_cleanup();
- cleanup_nothing:
+ cleanup_decode_session:
+#ifdef CONFIG_XFRM
+	ip_nat_decode_session = NULL;
+	synchronize_net();
+#endif
 	return ret;
 }
 
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index f2edc92..59614a9 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -22,6 +22,7 @@
 #include <linux/workqueue.h>
 #include <linux/notifier.h>
 #include <linux/netdevice.h>
+#include <linux/netfilter.h>
 #include <linux/module.h>
 #include <net/xfrm.h>
 #include <net/ip.h>
@@ -985,6 +986,7 @@ int __xfrm_policy_check(struct sock *sk,
 
 	if (xfrm_decode_session(skb, &fl, family) < 0)
 		return 0;
+	nf_nat_decode_session(skb, &fl, family);
 
 	sk_sid = security_sk_sid(sk, &fl, fl_dir);
 

^ permalink raw reply related

* [NETFILTER 10/10]: Add ipt_policy/ip6t_policy matches
From: Patrick McHardy @ 2006-01-07  1:09 UTC (permalink / raw)
  To: David S. Miller; +Cc: netdev, netfilter-devel, Patrick McHardy
In-Reply-To: <20060107010855.8712.68786.sendpatchset@localhost.localdomain>

[NETFILTER]: Add ipt_policy/ip6t_policy matches

Signed-off-by: Patrick McHardy <kaber@trash.net>

---
commit 77b5a9142e3a53b375c6c21ec80fdbe890a8403c
tree e9da4deaa832b50dfa746ed1e3fa873972dcbc42
parent dcaa54019ab79d6db75572a239fdc0df0d8019be
author Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 01:30:29 +0100
committer Patrick McHardy <kaber@trash.net> Sat, 07 Jan 2006 01:30:29 +0100

 include/linux/netfilter_ipv4/ipt_policy.h  |   52 ++++++++
 include/linux/netfilter_ipv6/ip6t_policy.h |   52 ++++++++
 net/ipv4/netfilter/Kconfig                 |   10 ++
 net/ipv4/netfilter/Makefile                |    1 
 net/ipv4/netfilter/ipt_policy.c            |  170 +++++++++++++++++++++++++++
 net/ipv6/netfilter/Kconfig                 |   10 ++
 net/ipv6/netfilter/Makefile                |    1 
 net/ipv6/netfilter/ip6t_policy.c           |  175 ++++++++++++++++++++++++++++
 8 files changed, 471 insertions(+), 0 deletions(-)

diff --git a/include/linux/netfilter_ipv4/ipt_policy.h b/include/linux/netfilter_ipv4/ipt_policy.h
new file mode 100644
index 0000000..7fd1bec
--- /dev/null
+++ b/include/linux/netfilter_ipv4/ipt_policy.h
@@ -0,0 +1,52 @@
+#ifndef _IPT_POLICY_H
+#define _IPT_POLICY_H
+
+#define IPT_POLICY_MAX_ELEM	4
+
+enum ipt_policy_flags
+{
+	IPT_POLICY_MATCH_IN	= 0x1,
+	IPT_POLICY_MATCH_OUT	= 0x2,
+	IPT_POLICY_MATCH_NONE	= 0x4,
+	IPT_POLICY_MATCH_STRICT	= 0x8,
+};
+
+enum ipt_policy_modes
+{
+	IPT_POLICY_MODE_TRANSPORT,
+	IPT_POLICY_MODE_TUNNEL
+};
+
+struct ipt_policy_spec
+{
+	u_int8_t	saddr:1,
+			daddr:1,
+			proto:1,
+			mode:1,
+			spi:1,
+			reqid:1;
+};
+
+struct ipt_policy_elem
+{
+	u_int32_t	saddr;
+	u_int32_t	smask;
+	u_int32_t	daddr;
+	u_int32_t	dmask;
+	u_int32_t	spi;
+	u_int32_t	reqid;
+	u_int8_t	proto;
+	u_int8_t	mode;
+
+	struct ipt_policy_spec	match;
+	struct ipt_policy_spec	invert;
+};
+
+struct ipt_policy_info
+{
+	struct ipt_policy_elem pol[IPT_POLICY_MAX_ELEM];
+	u_int16_t flags;
+	u_int16_t len;
+};
+
+#endif /* _IPT_POLICY_H */
diff --git a/include/linux/netfilter_ipv6/ip6t_policy.h b/include/linux/netfilter_ipv6/ip6t_policy.h
new file mode 100644
index 0000000..5a93afc
--- /dev/null
+++ b/include/linux/netfilter_ipv6/ip6t_policy.h
@@ -0,0 +1,52 @@
+#ifndef _IP6T_POLICY_H
+#define _IP6T_POLICY_H
+
+#define IP6T_POLICY_MAX_ELEM	4
+
+enum ip6t_policy_flags
+{
+	IP6T_POLICY_MATCH_IN		= 0x1,
+	IP6T_POLICY_MATCH_OUT		= 0x2,
+	IP6T_POLICY_MATCH_NONE		= 0x4,
+	IP6T_POLICY_MATCH_STRICT	= 0x8,
+};
+
+enum ip6t_policy_modes
+{
+	IP6T_POLICY_MODE_TRANSPORT,
+	IP6T_POLICY_MODE_TUNNEL
+};
+
+struct ip6t_policy_spec
+{
+	u_int8_t	saddr:1,
+			daddr:1,
+			proto:1,
+			mode:1,
+			spi:1,
+			reqid:1;
+};
+
+struct ip6t_policy_elem
+{
+	struct in6_addr	saddr;
+	struct in6_addr	smask;
+	struct in6_addr	daddr;
+	struct in6_addr	dmask;
+	u_int32_t	spi;
+	u_int32_t	reqid;
+	u_int8_t	proto;
+	u_int8_t	mode;
+
+	struct ip6t_policy_spec	match;
+	struct ip6t_policy_spec	invert;
+};
+
+struct ip6t_policy_info
+{
+	struct ip6t_policy_elem pol[IP6T_POLICY_MAX_ELEM];
+	u_int16_t flags;
+	u_int16_t len;
+};
+
+#endif /* _IP6T_POLICY_H */
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index 88a6065..a9893ec 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -487,6 +487,16 @@ config IP_NF_MATCH_STRING
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP_NF_MATCH_POLICY
+       tristate "IPsec policy match support"
+       depends on IP_NF_IPTABLES && XFRM
+       help
+         Policy matching allows you to match packets based on the
+         IPsec policy that was used during decapsulation/will
+         be used during encapsulation.
+
+         To compile it as a module, choose M here.  If unsure, say N.
+
 # `filter', generic and specific targets
 config IP_NF_FILTER
 	tristate "Packet filtering"
diff --git a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile
index d0a447e..549b01a 100644
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -72,6 +72,7 @@ obj-$(CONFIG_IP_NF_MATCH_TCPMSS) += ipt_
 obj-$(CONFIG_IP_NF_MATCH_REALM) += ipt_realm.o
 obj-$(CONFIG_IP_NF_MATCH_ADDRTYPE) += ipt_addrtype.o
 obj-$(CONFIG_IP_NF_MATCH_PHYSDEV) += ipt_physdev.o
+obj-$(CONFIG_IP_NF_MATCH_POLICY) += ipt_policy.o
 obj-$(CONFIG_IP_NF_MATCH_COMMENT) += ipt_comment.o
 obj-$(CONFIG_IP_NF_MATCH_STRING) += ipt_string.o
 
diff --git a/net/ipv4/netfilter/ipt_policy.c b/net/ipv4/netfilter/ipt_policy.c
new file mode 100644
index 0000000..709debc
--- /dev/null
+++ b/net/ipv4/netfilter/ipt_policy.c
@@ -0,0 +1,170 @@
+/* IP tables module for matching IPsec policy
+ *
+ * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/xfrm.h>
+
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv4/ipt_policy.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("IPtables IPsec policy matching module");
+MODULE_LICENSE("GPL");
+
+
+static inline int
+match_xfrm_state(struct xfrm_state *x, const struct ipt_policy_elem *e)
+{
+#define MATCH(x,y)	(!e->match.x || ((e->x == (y)) ^ e->invert.x))
+
+	return MATCH(saddr, x->props.saddr.a4 & e->smask) &&
+	       MATCH(daddr, x->id.daddr.a4 & e->dmask) &&
+	       MATCH(proto, x->id.proto) &&
+	       MATCH(mode, x->props.mode) &&
+	       MATCH(spi, x->id.spi) &&
+	       MATCH(reqid, x->props.reqid);
+}
+
+static int
+match_policy_in(const struct sk_buff *skb, const struct ipt_policy_info *info)
+{
+	const struct ipt_policy_elem *e;
+	struct sec_path *sp = skb->sp;
+	int strict = info->flags & IPT_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (sp == NULL)
+		return -1;
+	if (strict && info->len != sp->len)
+		return 0;
+
+	for (i = sp->len - 1; i >= 0; i--) {
+		pos = strict ? i - sp->len + 1 : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(sp->x[i].xvec, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int
+match_policy_out(const struct sk_buff *skb, const struct ipt_policy_info *info)
+{
+	const struct ipt_policy_elem *e;
+	struct dst_entry *dst = skb->dst;
+	int strict = info->flags & IPT_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (dst->xfrm == NULL)
+		return -1;
+
+	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+		pos = strict ? i : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(dst->xfrm, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int match(const struct sk_buff *skb,
+                 const struct net_device *in,
+                 const struct net_device *out,
+                 const void *matchinfo, int offset, int *hotdrop)
+{
+	const struct ipt_policy_info *info = matchinfo;
+	int ret;
+
+	if (info->flags & IPT_POLICY_MATCH_IN)
+		ret = match_policy_in(skb, info);
+	else
+		ret = match_policy_out(skb, info);
+
+	if (ret < 0)
+		ret = info->flags & IPT_POLICY_MATCH_NONE ? 1 : 0;
+	else if (info->flags & IPT_POLICY_MATCH_NONE)
+		ret = 0;
+
+	return ret;
+}
+
+static int checkentry(const char *tablename, const struct ipt_ip *ip,
+                      void *matchinfo, unsigned int matchsize,
+                      unsigned int hook_mask)
+{
+	struct ipt_policy_info *info = matchinfo;
+
+	if (matchsize != IPT_ALIGN(sizeof(*info))) {
+		printk(KERN_ERR "ipt_policy: matchsize %u != %zu\n",
+		       matchsize, IPT_ALIGN(sizeof(*info)));
+		return 0;
+	}
+	if (!(info->flags & (IPT_POLICY_MATCH_IN|IPT_POLICY_MATCH_OUT))) {
+		printk(KERN_ERR "ipt_policy: neither incoming nor "
+		                "outgoing policy selected\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP_PRE_ROUTING | 1 << NF_IP_LOCAL_IN)
+	    && info->flags & IPT_POLICY_MATCH_OUT) {
+		printk(KERN_ERR "ipt_policy: output policy not valid in "
+		                "PRE_ROUTING and INPUT\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP_POST_ROUTING | 1 << NF_IP_LOCAL_OUT)
+	    && info->flags & IPT_POLICY_MATCH_IN) {
+		printk(KERN_ERR "ipt_policy: input policy not valid in "
+		                "POST_ROUTING and OUTPUT\n");
+		return 0;
+	}
+	if (info->len > IPT_POLICY_MAX_ELEM) {
+		printk(KERN_ERR "ipt_policy: too many policy elements\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ipt_match policy_match = {
+	.name		= "policy",
+	.match		= match,
+	.checkentry 	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ipt_register_match(&policy_match);
+}
+
+static void __exit fini(void)
+{
+	ipt_unregister_match(&policy_match);
+}
+
+module_init(init);
+module_exit(fini);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index 04912f9..105dd69 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -179,6 +179,16 @@ config IP6_NF_MATCH_PHYSDEV
 
 	  To compile it as a module, choose M here.  If unsure, say N.
 
+config IP6_NF_MATCH_POLICY
+	tristate "IPsec policy match support"
+	depends on IP6_NF_IPTABLES && XFRM
+	help
+	  Policy matching allows you to match packets based on the
+	  IPsec policy that was used during decapsulation/will
+	  be used during encapsulation.
+
+	  To compile it as a module, choose M here.  If unsure, say N.
+
 # The targets
 config IP6_NF_FILTER
 	tristate "Packet filtering"
diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile
index 9ab5b2c..c0c809b 100644
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -13,6 +13,7 @@ obj-$(CONFIG_IP6_NF_MATCH_OPTS) += ip6t_
 obj-$(CONFIG_IP6_NF_MATCH_IPV6HEADER) += ip6t_ipv6header.o
 obj-$(CONFIG_IP6_NF_MATCH_FRAG) += ip6t_frag.o
 obj-$(CONFIG_IP6_NF_MATCH_AHESP) += ip6t_esp.o ip6t_ah.o
+obj-$(CONFIG_IP6_NF_MATCH_POLICY) += ip6t_policy.o
 obj-$(CONFIG_IP6_NF_MATCH_EUI64) += ip6t_eui64.o
 obj-$(CONFIG_IP6_NF_MATCH_MULTIPORT) += ip6t_multiport.o
 obj-$(CONFIG_IP6_NF_MATCH_OWNER) += ip6t_owner.o
diff --git a/net/ipv6/netfilter/ip6t_policy.c b/net/ipv6/netfilter/ip6t_policy.c
new file mode 100644
index 0000000..13fedad
--- /dev/null
+++ b/net/ipv6/netfilter/ip6t_policy.c
@@ -0,0 +1,175 @@
+/* IP tables module for matching IPsec policy
+ *
+ * Copyright (c) 2004,2005 Patrick McHardy, <kaber@trash.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/init.h>
+#include <net/xfrm.h>
+
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#include <linux/netfilter_ipv6/ip6t_policy.h>
+
+MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
+MODULE_DESCRIPTION("IPtables IPsec policy matching module");
+MODULE_LICENSE("GPL");
+
+
+static inline int
+match_xfrm_state(struct xfrm_state *x, const struct ip6t_policy_elem *e)
+{
+#define MATCH_ADDR(x,y,z)	(!e->match.x || \
+				 ((ip6_masked_addrcmp((z), &e->x, &e->y)) == 0) ^ e->invert.x)
+#define MATCH(x,y)		(!e->match.x || ((e->x == (y)) ^ e->invert.x))
+	
+	return MATCH_ADDR(saddr, smask, (struct in6_addr *)&x->props.saddr.a6) &&
+	       MATCH_ADDR(daddr, dmask, (struct in6_addr *)&x->id.daddr.a6) &&
+	       MATCH(proto, x->id.proto) &&
+	       MATCH(mode, x->props.mode) &&
+	       MATCH(spi, x->id.spi) &&
+	       MATCH(reqid, x->props.reqid);
+}
+
+static int
+match_policy_in(const struct sk_buff *skb, const struct ip6t_policy_info *info)
+{
+	const struct ip6t_policy_elem *e;
+	struct sec_path *sp = skb->sp;
+	int strict = info->flags & IP6T_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (sp == NULL)
+		return -1;
+	if (strict && info->len != sp->len)
+		return 0;
+
+	for (i = sp->len - 1; i >= 0; i--) {
+		pos = strict ? i - sp->len + 1 : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(sp->x[i].xvec, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int
+match_policy_out(const struct sk_buff *skb, const struct ip6t_policy_info *info)
+{
+	const struct ip6t_policy_elem *e;
+	struct dst_entry *dst = skb->dst;
+	int strict = info->flags & IP6T_POLICY_MATCH_STRICT;
+	int i, pos;
+
+	if (dst->xfrm == NULL)
+		return -1;
+
+	for (i = 0; dst && dst->xfrm; dst = dst->child, i++) {
+		pos = strict ? i : 0;
+		if (pos >= info->len)
+			return 0;
+		e = &info->pol[pos];
+
+		if (match_xfrm_state(dst->xfrm, e)) {
+			if (!strict)
+				return 1;
+		} else if (strict)
+			return 0;
+	}
+
+	return strict ? 1 : 0;
+}
+
+static int match(const struct sk_buff *skb,
+                 const struct net_device *in,
+                 const struct net_device *out,
+                 const void *matchinfo,
+		 int offset,
+		 unsigned int protoff,
+		 int *hotdrop)
+{
+	const struct ip6t_policy_info *info = matchinfo;
+	int ret;
+
+	if (info->flags & IP6T_POLICY_MATCH_IN)
+		ret = match_policy_in(skb, info);
+	else
+		ret = match_policy_out(skb, info);
+
+	if (ret < 0)
+		ret = info->flags & IP6T_POLICY_MATCH_NONE ? 1 : 0;
+	else if (info->flags & IP6T_POLICY_MATCH_NONE)
+		ret = 0;
+
+	return ret;
+}
+
+static int checkentry(const char *tablename, const struct ip6t_ip6 *ip,
+                      void *matchinfo, unsigned int matchsize,
+                      unsigned int hook_mask)
+{
+	struct ip6t_policy_info *info = matchinfo;
+
+	if (matchsize != IP6T_ALIGN(sizeof(*info))) {
+		printk(KERN_ERR "ip6t_policy: matchsize %u != %zu\n",
+		       matchsize, IP6T_ALIGN(sizeof(*info)));
+		return 0;
+	}
+	if (!(info->flags & (IP6T_POLICY_MATCH_IN|IP6T_POLICY_MATCH_OUT))) {
+		printk(KERN_ERR "ip6t_policy: neither incoming nor "
+		                "outgoing policy selected\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP6_PRE_ROUTING | 1 << NF_IP6_LOCAL_IN)
+	    && info->flags & IP6T_POLICY_MATCH_OUT) {
+		printk(KERN_ERR "ip6t_policy: output policy not valid in "
+		                "PRE_ROUTING and INPUT\n");
+		return 0;
+	}
+	if (hook_mask & (1 << NF_IP6_POST_ROUTING | 1 << NF_IP6_LOCAL_OUT)
+	    && info->flags & IP6T_POLICY_MATCH_IN) {
+		printk(KERN_ERR "ip6t_policy: input policy not valid in "
+		                "POST_ROUTING and OUTPUT\n");
+		return 0;
+	}
+	if (info->len > IP6T_POLICY_MAX_ELEM) {
+		printk(KERN_ERR "ip6t_policy: too many policy elements\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static struct ip6t_match policy_match = {
+	.name		= "policy",
+	.match		= match,
+	.checkentry 	= checkentry,
+	.me		= THIS_MODULE,
+};
+
+static int __init init(void)
+{
+	return ip6t_register_match(&policy_match);
+}
+
+static void __exit fini(void)
+{
+	ip6t_unregister_match(&policy_match);
+}
+
+module_init(init);
+module_exit(fini);

^ permalink raw reply related

* Re: [NETFILTER 00/10]: Netfilter IPsec support
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2006-01-07  2:21 UTC (permalink / raw)
  To: kaber; +Cc: netdev, netfilter-devel, usagi-core, davem
In-Reply-To: <20060107010855.8712.68786.sendpatchset@localhost.localdomain>

In article <20060107010855.8712.68786.sendpatchset@localhost.localdomain> (at Sat,  7 Jan 2006 02:09:30 +0100 (MET)), Patrick McHardy <kaber@trash.net> says:

> following are the remaining patches for netfilter IPsec support.
> They are missing the common-case optimization for inner transport mode
> SAs on the input path, but since its just an optimization, I think
> it can also be done later. One note: unfortunately I had to increase

I definitely want to do it before 2.6.16.
Anyway, we'll test this series of patches.  Thank you.

--yoshfuji

^ permalink raw reply

* Re: [NETFILTER 00/10]: Netfilter IPsec support
From: Patrick McHardy @ 2006-01-07  2:29 UTC (permalink / raw)
  To: yoshfuji; +Cc: netdev, netfilter-devel, usagi-core, davem
In-Reply-To: <20060106.202140.97921155.yoshfuji@linux-ipv6.org>

YOSHIFUJI Hideaki / ^[$B5HF#1QL@^[ wrote:
> In article <20060107010855.8712.68786.sendpatchset@localhost.localdomain> (at Sat,  7 Jan 2006 02:09:30 +0100 (MET)), Patrick McHardy <kaber@trash.net> says:
> 
> 
>>following are the remaining patches for netfilter IPsec support.
>>They are missing the common-case optimization for inner transport mode
>>SAs on the input path, but since its just an optimization, I think
>>it can also be done later. One note: unfortunately I had to increase
> 
> 
> I definitely want to do it before 2.6.16.

That shouldn't be a problem, its not hard, but I think it might get
a bit ugly.

> Anyway, we'll test this series of patches.

Thanks.

^ permalink raw reply

* Re: [2.6 patch] fix ipvs compilation
From: Joe Kappus @ 2006-01-07  3:30 UTC (permalink / raw)
  To: David S. Miller; +Cc: bunk, wensong, horms, ja, netdev, linux-kernel
In-Reply-To: <20060106.131536.88566896.davem@davemloft.net>

On 1/6/06, David S. Miller <davem@davemloft.net> wrote:
> From: Joe <joecool1029@gmail.com>
> Date: Thu, 5 Jan 2006 23:43:52 -0500
>
> > Thats not all either,  ./net/ipv4/netfilter/ipt_helper.c has the same
> > error and the same fix.
> >
> > Here's the patch for this one.  Sorry for the dupe.. i sent the last
> > as html by accident.
>
> Applied, please provide a "Signed-off-by:" line with your patch
> next time.
>
> Thanks.
>

Why not then, we'll do this one as well since it needs it.

Signed-off-by: Joe Kappus <joecool1029@gmail.com>

--- ./net/ipv4/netfilter/ip_conntrack_proto_sctp.c.old  2006-01-06
22:27:08.885583023 -0500
+++ ./net/ipv4/netfilter/ip_conntrack_proto_sctp.c      2006-01-06
22:27:44.606582972 -0500
@@ -16,6 +16,7 @@
 #include <linux/types.h>
 #include <linux/sched.h>
 #include <linux/timer.h>
+#include <linux/interrupt.h>
 #include <linux/netfilter.h>
 #include <linux/module.h>
 #include <linux/in.h>

^ permalink raw reply

* Re: [PATCH, RFC] RCU : OOM avoidance and lower latency
From: David S. Miller @ 2006-01-07  7:10 UTC (permalink / raw)
  To: ak; +Cc: paulmck, dada1, alan, torvalds, linux-kernel, dipankar, manfred,
	netdev
In-Reply-To: <200601070209.02157.ak@suse.de>

From: Andi Kleen <ak@suse.de>
Date: Sat, 7 Jan 2006 02:09:01 +0100

> I always disliked the per chain spinlocks even for other hash tables like
> TCP/UDP multiplex - it would be much nicer to use a much smaller separately 
> hashed lock table and save cache. In this case the special case of using
> a one entry only lock hash table makes sense.

I used to think they were a great technique.  But in each case I
thought they could be applied, better schemes have come along.
In the case of the page cache we went to a per-address-space tree,
and here in the routing cache we went to RCU.

There are RCU patches around for the TCP hashes and I'd like to
put those in at some point as well.  In fact, they'd be even
more far reaching since Arnaldo abstracted away the socket
hashing stuff into an inet_hashtables subsystem.

^ permalink raw reply

* Re: [2.6 patch] fix ipvs compilation
From: David S. Miller @ 2006-01-07  7:15 UTC (permalink / raw)
  To: joecool1029; +Cc: bunk, wensong, horms, ja, netdev, linux-kernel
In-Reply-To: <d4757e600601061930v3fc113c1t9d71c951613abf09@mail.gmail.com>

From: Joe Kappus <joecool1029@gmail.com>
Date: Fri, 6 Jan 2006 22:30:56 -0500

> Why not then, we'll do this one as well since it needs it.
> 
> Signed-off-by: Joe Kappus <joecool1029@gmail.com>

Your email client corrupted the patch, I fixed it up manually
this time, but next time I won't be so nice so please get this
working.

Thanks.

^ permalink raw reply

* Re: [PATCH, RFC] RCU : OOM avoidance and lower latency
From: Eric Dumazet @ 2006-01-07  7:34 UTC (permalink / raw)
  To: Andi Kleen
  Cc: David S. Miller, paulmck, alan, torvalds, linux-kernel, dipankar,
	manfred, netdev
In-Reply-To: <200601070209.02157.ak@suse.de>

Andi Kleen a écrit :
> 
> I always disliked the per chain spinlocks even for other hash tables like
> TCP/UDP multiplex - it would be much nicer to use a much smaller separately 
> hashed lock table and save cache. In this case the special case of using
> a one entry only lock hash table makes sense.
> 

I agree, I do use a hashed spinlock array on my local tree for TCP, mainly to 
reduce the hash table size by a 2 factor.

Eric

^ permalink raw reply

* Re: [PATCH, RFC] RCU : OOM avoidance and lower latency
From: David S. Miller @ 2006-01-07  7:44 UTC (permalink / raw)
  To: dada1; +Cc: ak, paulmck, alan, torvalds, linux-kernel, dipankar, manfred,
	netdev
In-Reply-To: <43BF6F0B.4060108@cosmosbay.com>

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sat, 07 Jan 2006 08:34:35 +0100

> I agree, I do use a hashed spinlock array on my local tree for TCP,
> mainly to reduce the hash table size by a 2 factor.

So what do you think about going to a single spinlock for the
routing cache?

^ permalink raw reply

* Re: [PATCH, RFC] RCU : OOM avoidance and lower latency
From: Eric Dumazet @ 2006-01-07  7:53 UTC (permalink / raw)
  To: David S. Miller
  Cc: ak, paulmck, alan, torvalds, linux-kernel, dipankar, manfred,
	netdev
In-Reply-To: <20060106.234440.53993868.davem@davemloft.net>

David S. Miller a écrit :
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Sat, 07 Jan 2006 08:34:35 +0100
> 
>> I agree, I do use a hashed spinlock array on my local tree for TCP,
>> mainly to reduce the hash table size by a 2 factor.
> 
> So what do you think about going to a single spinlock for the
> routing cache?

I have no problem with this, since the biggest server I have is 4 way, but are 
you sure big machines wont suffer from this single spinlock ?

Also I dont understand what you want to do after this single spinlock patch.
How is it supposed to help the 'ip route flush cache' problem ?

In my case, I have about 600.000 dst-entries :

# grep ip_dst /proc/slabinfo
ip_dst_cache      616250 622440    320   12    1 : tunables   54   27    8 : 
slabdata  51870  51870      0


Eric

^ permalink raw reply

* Re: [PATCH, RFC] RCU : OOM avoidance and lower latency
From: Eric Dumazet @ 2006-01-07  8:30 UTC (permalink / raw)
  To: David S. Miller
  Cc: ak, paulmck, alan, torvalds, linux-kernel, dipankar, manfred,
	netdev
In-Reply-To: <20060106.161721.124249301.davem@davemloft.net>

David S. Miller a écrit :
> 
> Eric, how important do you honestly think the per-hashchain spinlocks
> are?  That's the big barrier from making rt_secret_rebuild() a simple
> rehash instead of flushing the whole table as it does now.
> 

No problem for me in going to a single spinlock.
I did the hashed spinlock patch in order to reduce the size of the route hash 
table and not hurting big NUMA machines. If you think a single spinlock is OK, 
that's even better !

> The lock is only grabbed for updates, and the access to these locks is
> random and as such probably non-local when taken anyways.  Back before
> we used RCU for reads, this array-of-spinlock thing made a lot more
> sense.
> 
> I mean something like this patch:
> 

> +static DEFINE_SPINLOCK(rt_hash_lock);


Just one point : This should be cache_line aligned, and use one full cache 
line to avoid false sharing at least. (If a cpu takes the lock, no need to 
invalidate *rt_hash_table for all other cpus)

Eric

^ permalink raw reply

* Re: [PATCH, RFC] RCU : OOM avoidance and lower latency
From: David S. Miller @ 2006-01-07  8:36 UTC (permalink / raw)
  To: dada1; +Cc: ak, paulmck, alan, torvalds, linux-kernel, dipankar, manfred,
	netdev
In-Reply-To: <43BF7390.6050005@cosmosbay.com>

From: Eric Dumazet <dada1@cosmosbay.com>
Date: Sat, 07 Jan 2006 08:53:52 +0100

> I have no problem with this, since the biggest server I have is 4
> way, but are you sure big machines wont suffer from this single
> spinlock ?

It is the main question.

> Also I dont understand what you want to do after this single
> spinlock patch.  How is it supposed to help the 'ip route flush
> cache' problem ?  In my case, I have about 600.000 dst-entries :

I don't claim to have a solution to this problem currently.

Doing RCU and going through the whole DST GC machinery is overkill for
an active system.  So, perhaps a very simple solution will do:

1) On rt_run_flush(), do not rt_free(), instead collect all active
   routing cache entries onto a global list, begin a timer to
   fire in 10 seconds (or some sysctl configurable amount).

2) When a new routing cache entry is needed, check the global
   list appended to in #1 above first, failing that do dst_alloc()
   as is done currently.

3) If timer expires, rt_free() any entries in the global list.

The missing trick is how to ensure RCU semantics when reallocating
from the global list.

The idea is that an active system will immediately repopulate itself
with all of these entries just flushed from the table.

RCU really doesn't handle this kind of problem very well.  It truly
excels when work is generated by process context work, not interrupt
work.

^ permalink raw reply

* Re: State of the Union: Wireless
From: Denis Vlasenko @ 2006-01-07 14:49 UTC (permalink / raw)
  To: Johannes Berg; +Cc: Jeff Garzik, netdev, linux-kernel
In-Reply-To: <1136547084.4037.41.camel@localhost>

On Friday 06 January 2006 13:31, Johannes Berg wrote:
> On Fri, 2006-01-06 at 12:00 +0100, Michael Buesch wrote:
> 
> > * "master" interface as real device node
> > * Virtual interfaces (net_devices)
> 
> I didn't want to spam the netdev wiki with this (yet) so I collected
> some more structured things outside. Anyone feel free to edit:
> http://softmac.sipsolutions.net/802.11

I am confused.

There is
http://softmac.sipsolutions.net/softmac-snapshot.tar.bz2
at http://softmac.sipsolutions.net/SoftMAC,
page also says "Projects using this layer: * Broadcom 43xx driver"

but Broadcom driver page at ftp://ftp.berlios.de/pub/bcm43xx/snapshots/softmac/
has ftp://ftp.berlios.de/pub/bcm43xx/snapshots/softmac/ieee80211softmac-20060107.tar.bz2
which is not the same. For example, ieee80211softmac.h file exists in both
tarballs but is not identical.

Suppose one wants to use softmac in a project. What tarball contains
the bleeding edge of softmac?

> I'll move that content to the netdev wiki if anyone else thinks it would
> be a good way forward to start with requirements, API issues and
> similar.
> 
> Until we get there, we'll fix up softmac to make it usable for most
> people in basic station mode without any kind of virtual devices, which
> will need some slight changes to the current ieee80211.
--
vda

^ permalink raw reply

* [RFC: 2.6 patch] net/ipv4/ip_output.c: make ip_fragment() static
From: Adrian Bunk @ 2006-01-07 18:15 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

Since there's no longer any external user of ip_fragment() we can make 
it static.


Signed-off-by: Adrian Bunk <bunk@stusta.de>

---

 include/net/ip.h     |    1 -
 net/ipv4/ip_output.c |    5 +++--
 2 files changed, 3 insertions(+), 3 deletions(-)

--- linux-2.6.15-mm2-full/include/net/ip.h.old	2006-01-07 17:12:04.000000000 +0100
+++ linux-2.6.15-mm2-full/include/net/ip.h	2006-01-07 17:12:11.000000000 +0100
@@ -95,7 +95,6 @@
 extern int		ip_mr_input(struct sk_buff *skb);
 extern int		ip_output(struct sk_buff *skb);
 extern int		ip_mc_output(struct sk_buff *skb);
-extern int		ip_fragment(struct sk_buff *skb, int (*out)(struct sk_buff*));
 extern int		ip_do_nat(struct sk_buff *skb);
 extern void		ip_send_check(struct iphdr *ip);
 extern int		ip_queue_xmit(struct sk_buff *skb, int ipfragok);
--- linux-2.6.15-mm2-full/net/ipv4/ip_output.c.old	2006-01-07 17:12:21.000000000 +0100
+++ linux-2.6.15-mm2-full/net/ipv4/ip_output.c	2006-01-07 17:21:33.000000000 +0100
@@ -85,6 +85,8 @@
 
 int sysctl_ip_default_ttl = IPDEFTTL;
 
+static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*));
+
 /* Generate a checksum for an outgoing IP datagram. */
 __inline__ void ip_send_check(struct iphdr *iph)
 {
@@ -409,7 +411,7 @@
  *	single device frame, and queue such a frame for sending.
  */
 
-int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
+static int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
 {
 	struct iphdr *iph;
 	int raw = 0;
@@ -1391,7 +1393,6 @@
 #endif
 }
 
-EXPORT_SYMBOL(ip_fragment);
 EXPORT_SYMBOL(ip_generic_getfrag);
 EXPORT_SYMBOL(ip_queue_xmit);
 EXPORT_SYMBOL(ip_send_check);

^ permalink raw reply

* [2.6 patch] net/ipv6/: small cleanups
From: Adrian Bunk @ 2006-01-07 18:17 UTC (permalink / raw)
  To: netdev; +Cc: linux-kernel

This patch contains the following cleanups:
- addrconf.c: make addrconf_dad_stop() static
- inet6_connection_sock.c should #include <net/inet6_connection_sock.h>
  for getting the prototypes of it's global functions


Signed-off-by: Adrian Bunk <bunk@stusta.de>

---

 net/ipv6/addrconf.c              |    2 +-
 net/ipv6/inet6_connection_sock.c |    1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

--- linux-2.6.15-mm2-full/net/ipv6/addrconf.c.old	2006-01-07 17:30:04.000000000 +0100
+++ linux-2.6.15-mm2-full/net/ipv6/addrconf.c	2006-01-07 17:30:13.000000000 +0100
@@ -1228,7 +1228,7 @@
 
 /* Gets referenced address, destroys ifaddr */
 
-void addrconf_dad_stop(struct inet6_ifaddr *ifp)
+static void addrconf_dad_stop(struct inet6_ifaddr *ifp)
 {
 	if (ifp->flags&IFA_F_PERMANENT) {
 		spin_lock_bh(&ifp->lock);
--- linux-2.6.15-mm2-full/net/ipv6/inet6_connection_sock.c.old	2006-01-07 17:30:42.000000000 +0100
+++ linux-2.6.15-mm2-full/net/ipv6/inet6_connection_sock.c	2006-01-07 17:30:57.000000000 +0100
@@ -25,6 +25,7 @@
 #include <net/inet_hashtables.h>
 #include <net/ip6_route.h>
 #include <net/sock.h>
+#include <net/inet6_connection_sock.h>
 
 int inet6_csk_bind_conflict(const struct sock *sk,
 			    const struct inet_bind_bucket *tb)

^ permalink raw reply

* Re: [PATCH, RFC] RCU : OOM avoidance and lower latency
From: Paul E. McKenney @ 2006-01-07 20:30 UTC (permalink / raw)
  To: David S. Miller
  Cc: dada1, ak, alan, torvalds, linux-kernel, dipankar, manfred,
	netdev
In-Reply-To: <20060107.003625.50986701.davem@davemloft.net>

On Sat, Jan 07, 2006 at 12:36:25AM -0800, David S. Miller wrote:
> From: Eric Dumazet <dada1@cosmosbay.com>
> Date: Sat, 07 Jan 2006 08:53:52 +0100
> 
> > I have no problem with this, since the biggest server I have is 4
> > way, but are you sure big machines wont suffer from this single
> > spinlock ?
> 
> It is the main question.
> 
> > Also I dont understand what you want to do after this single
> > spinlock patch.  How is it supposed to help the 'ip route flush
> > cache' problem ?  In my case, I have about 600.000 dst-entries :
> 
> I don't claim to have a solution to this problem currently.
> 
> Doing RCU and going through the whole DST GC machinery is overkill for
> an active system.  So, perhaps a very simple solution will do:
> 
> 1) On rt_run_flush(), do not rt_free(), instead collect all active
>    routing cache entries onto a global list, begin a timer to
>    fire in 10 seconds (or some sysctl configurable amount).
> 
> 2) When a new routing cache entry is needed, check the global
>    list appended to in #1 above first, failing that do dst_alloc()
>    as is done currently.
> 
> 3) If timer expires, rt_free() any entries in the global list.
> 
> The missing trick is how to ensure RCU semantics when reallocating
> from the global list.

The straightforward ways of doing this require a per-entry lock in
addition to the dst_entry reference count -- lots of read-side overhead.

More complex approaches use a generation number that is incremented
when adding to or removing from the global list.  When the generation
number overflows, unconditionally rt_free() it rather than adding
to the global list again.  Then there needs to be some clever code
on the read side to detect the case when the generation number
changes while acquiring a reference.  And memory barriers.  Also
lots of read-side overhead.  Also, it is now -always- necessary to
acquire a reference on the read-side.

> The idea is that an active system will immediately repopulate itself
> with all of these entries just flushed from the table.
> 
> RCU really doesn't handle this kind of problem very well.  It truly
> excels when work is generated by process context work, not interrupt
> work.

Sounds like a challenge to me.  ;-)

Well, one possible way to attack Eric's workload might be the following:

o	Size the hash table to strike the appropriate balance between
	read-side search overhead and memory consumption.  Call the
	number of hash-chain headers N.

o	Create a hashed array of locks sized to allow the update to
	proceed sufficiently quickly.  Call the number of locks M,
	probably a power of two.  This means that M CPUs can be doing
	the update in parallel.

o	Create an array of M^2 list headers (call it xfer[][]), but
	since this is only needed during an update, it can be allocated
	and deallocated if need be.  (Me, with my big-server experience,
	would probably just create the array, since M is not likely to
	be too large.  But your mileage may vary.  And you really only
	need M*(M-1) list headers, but that makes the index calculation
	a bit more annoying.)

o	Use a two-phase update.  In the first phase, each updating
	CPU acquires the corresponding lock and removes entries from
	the corresponding partition of the hash table.  If the new
	location of a given entry falls into the same partition, it
	is added back to the appropriate hash chain of that partition.
	Otherwise, add the entry to xfer[dst][src], where "src" and 
	"dst" are indexes of the corresponding partitions.

o	When all CPUs finish removing entries from their partition,
	they check into a barrier.  Once all have checked in, they
	can start the second phase of the update.

o	In the second phase, each CPU removes the entries from the
	xfer array that are destined for its partition and adds them
	to the hash chain that they are destined for.

Some commentary and variations, in the hope that this inspires someone
to come up with an even better idea:

o	Unless M is at least three, there is no performance gain
	over a single global lock with a single CPU doing the update,
	since each element must now undergo four list operations rather
	than just two.

o	The xfer[][] array must have each entry cache-aligned, or
	you lose big on cacheline effects.  Note that it is -not-
	sufficient to simply align the rows or the columns, since
	each CPU has its own column when inserting and its own
	row when removing from xfer[][].

o	And the data-skew effects are less severe if this procedure
	runs from process context.  A spinning barrier must be used
	otherwise.  But note that the per-partition locks could remain
	spinlocks, only the barrier need involve sleeping (in case
	that helps, am getting a bit ahead of my understanding of
	this part of the kernel).

	So I half-agree with Dave -- this works better if the bulk
	update is done in process context, however, updates involving
	single entries being individually added and removed from the
	hash table can be done from interrupt context.

o	One (more complex!) way to reduce the effect of data skew to
	partition the array based on the number of elements in each
	partition rather than by the number of chains, but this would
	require a second barrier that is completed before dropping locks
	in order to avoid messing up concurrent single-element updates.

	And this only handles the phase-1 data-skew effects. It is
	possible to handle phase-2 data skew as well, but it takes an
	up-front full-table scan and so it is not clear how it could
	possibly be a performance win.

o	Note that routing slows down significantly while an update is
	in progress, because on average only half of the entries are
	in the hash table during the update.  I have no idea whether
	or not this is acceptable.  I am assuming that the same
	mechanism that prevents two concurrent route-cache misses
	from inserting two entries from the same route would also
	prevent adding a new entry when one was already in the
	xfer array.

Thoughts?

							Thanx, Paul

^ permalink raw reply

* Re: [RFC: 2.6 patch] net/ipv4/ip_output.c: make ip_fragment() static
From: David S. Miller @ 2006-01-07 21:23 UTC (permalink / raw)
  To: bunk; +Cc: netdev, linux-kernel
In-Reply-To: <20060107181533.GO3774@stusta.de>

From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 7 Jan 2006 19:15:33 +0100

> Since there's no longer any external user of ip_fragment() we can make 
> it static.
> 
> Signed-off-by: Adrian Bunk <bunk@stusta.de>

Works for me, applied.

Thanks.

^ permalink raw reply

* Re: [2.6 patch] net/ipv6/: small cleanups
From: David S. Miller @ 2006-01-07 21:24 UTC (permalink / raw)
  To: bunk; +Cc: netdev, linux-kernel
In-Reply-To: <20060107181723.GP3774@stusta.de>

From: Adrian Bunk <bunk@stusta.de>
Date: Sat, 7 Jan 2006 19:17:23 +0100

> This patch contains the following cleanups:
> - addrconf.c: make addrconf_dad_stop() static
> - inet6_connection_sock.c should #include <net/inet6_connection_sock.h>
>   for getting the prototypes of it's global functions
> 
> Signed-off-by: Adrian Bunk <bunk@stusta.de>

Also applied, thanks.

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox