All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
@ 2004-06-24  4:04 Yasuyuki Kozakai
  2004-06-24  8:13 ` Andras Kis-Szabo
  2004-06-24 11:26 ` Patrick McHardy
  0 siblings, 2 replies; 29+ messages in thread
From: Yasuyuki Kozakai @ 2004-06-24  4:04 UTC (permalink / raw)
  To: netfilter-devel; +Cc: laforge, kisza, usagi-core

[-- Attachment #1: Type: Text/Plain, Size: 1718 bytes --]


Hi, folks,

In the current kernel, skb is linearized by skb_linearize() in ip6_tables.c.
I suggest removing this, and this patch is the 1st step to do it.

To remove skb_linearize(), this patch changes the API of match() like
ip_tables.h

	int (*match)(const struct sk_buff *skb,
		     const struct net_device *in,
		     const struct net_device *out,
		     const void *matchinfo,
		     int offset,
		     unsigned int protoff,
		     int *hotdrop);

"protoff" is the offset of transport protocol header from skb->data.
match modules can get the transport protocol header without skipping IPv6
extension headers.

This patch also changes

	- ip6_packet_match(), tcp_match(), udp_match(), icmp_match() in
	  ip6_tables.c are changed to follow the above API.

	- In all match module, the only arguments of match() are changed.

This patch doesn't remove skb_linearize() yet since more changes are needed
to every match modules. After all modules are changed, we'll be able to just
remove skb_linearize().

Moreover, I optimize and ip6_packet_match() in this patch. In this current
kernel, IPv6 extension headers are skipped many times since ip6t_do_table()
calls ip6_packet_match() per filtering rule.

This patch changes this behavior so that IPv6 extension headers are skipped
at once in ip6t_do_table() unless IP6T_CONTINUE is returned from target.

I know that this optimization is not related with removing skb_linearize().
But I don't want to change ip6_packet_match() many time.

If no objections and no bugs, I want this patch to be applied to mainline
kernel.

comments ?

-----------------------------------------------------------------
Yasuyuki KOZAKAI @ USAGI Project <yasuyuki.kozakai@toshiba.co.jp>



[-- Attachment #2: ip6tables.patch --]
[-- Type: Text/Plain, Size: 21222 bytes --]

diff -Nur linux-2.6.7/include/linux/netfilter_ipv6/ip6_tables.h linux-2.6.7-ip6tables/include/linux/netfilter_ipv6/ip6_tables.h
--- linux-2.6.7/include/linux/netfilter_ipv6/ip6_tables.h	2004-06-16 14:20:04.000000000 +0900
+++ linux-2.6.7-ip6tables/include/linux/netfilter_ipv6/ip6_tables.h	2004-06-23 23:45:43.000000000 +0900
@@ -361,8 +361,7 @@
 		     const struct net_device *out,
 		     const void *matchinfo,
 		     int offset,
-		     const void *hdr,
-		     u_int16_t datalen,
+		     unsigned int protoff,
 		     int *hotdrop);
 
 	/* Called when user tries to insert an entry of this type. */
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6_tables.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6_tables.c
--- linux-2.6.7/net/ipv6/netfilter/ip6_tables.c	2004-06-16 14:19:53.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6_tables.c	2004-06-24 00:00:18.542088784 +0900
@@ -157,14 +157,16 @@
 /* Returns whether matches rule or not. */
 static inline int
 ip6_packet_match(const struct sk_buff *skb,
-		 const struct ipv6hdr *ipv6,
 		 const char *indev,
 		 const char *outdev,
 		 const struct ip6t_ip6 *ip6info,
-		 int isfrag)
+		 u8 *proto,
+		 unsigned int *protoff,
+		 int *isfrag)
 {
 	size_t i;
 	unsigned long ret;
+	const struct ipv6hdr *ipv6 = skb->nh.ipv6h;
 
 #define FWINV(bool,invflg) ((bool) ^ !!(ip6info->invflags & invflg))
 
@@ -215,9 +217,13 @@
 	/* look for the desired protocol header */
 	if((ip6info->flags & IP6T_F_PROTO)) {
 		u_int8_t currenthdr = ipv6->nexthdr;
-		struct ipv6_opt_hdr *hdrptr;
-		u_int16_t ptr;		/* Header offset in skb */
+		struct ipv6_opt_hdr hdr;
+		unsigned int ptr;	/* Header offset in skb */
 		u_int16_t hdrlen;	/* Header */
+		u_int16_t fragoff = 0;
+
+		if (*protoff != 0)
+			goto skip;
 
 		ptr = IPV6_HDR_LEN;
 
@@ -233,31 +239,47 @@
 				(currenthdr == IPPROTO_ESP))
 				return 0;
 
-	                hdrptr = (struct ipv6_opt_hdr *)(skb->data + ptr);
+			if (skb_copy_bits(skb, ptr, &hdr, sizeof(hdr)))
+				BUG();
+
 
 			/* Size calculation */
-	                if (currenthdr == IPPROTO_FRAGMENT) {
+			if (currenthdr == IPPROTO_FRAGMENT) {
+				if (skb_copy_bits(skb,
+						  ptr+offsetof(struct frag_hdr,
+							       frag_off),
+						  &fragoff, sizeof(fragoff)))
+					return 0;
+
+				fragoff = ntohs(fragoff) & ~0x7;
 	                        hdrlen = 8;
 	                } else if (currenthdr == IPPROTO_AH)
-	                        hdrlen = (hdrptr->hdrlen+2)<<2;
+	                        hdrlen = (hdr.hdrlen+2)<<2;
 	                else
-	                        hdrlen = ipv6_optlen(hdrptr);
+	                        hdrlen = ipv6_optlen(&hdr);
 
-			currenthdr = hdrptr->nexthdr;
+			currenthdr = hdr.nexthdr;
 	                ptr += hdrlen;
 			/* ptr is too large */
 	                if ( ptr > skb->len ) 
 				return 0;
+			if (fragoff)
+				break;
 		}
 
+		*proto = currenthdr;
+		*protoff = ptr;
+		*isfrag = fragoff;
+skip:
+
 		/* currenthdr contains the protocol header */
 
 		dprintf("Packet protocol %hi ?= %s%hi.\n",
-				currenthdr, 
+				*proto, 
 				ip6info->invflags & IP6T_INV_PROTO ? "!":"",
 				ip6info->proto);
 
-		if (ip6info->proto == currenthdr) {
+		if (ip6info->proto == *proto) {
 			if(ip6info->invflags & IP6T_INV_PROTO) {
 				return 0;
 			}
@@ -309,13 +331,12 @@
 	     const struct net_device *in,
 	     const struct net_device *out,
 	     int offset,
-	     const void *hdr,
-	     u_int16_t datalen,
+	     unsigned int protoff,
 	     int *hotdrop)
 {
 	/* Stop iteration if it doesn't match */
 	if (!m->u.kernel.match->match(skb, in, out, m->data,
-				      offset, hdr, datalen, hotdrop))
+				      offset, protoff, hotdrop))
 		return 1;
 	else
 		return 0;
@@ -337,10 +358,9 @@
 	      void *userdata)
 {
 	static const char nulldevname[IFNAMSIZ];
-	u_int16_t offset = 0;
-	struct ipv6hdr *ipv6;
-	void *protohdr;
-	u_int16_t datalen;
+	int offset = 0;
+	unsigned int protoff = 0;
+	u8 proto = 0;
 	int hotdrop = 0;
 	/* Initializing verdict to NF_DROP keeps gcc happy. */
 	unsigned int verdict = NF_DROP;
@@ -353,9 +373,6 @@
 		return NF_DROP;
 
 	/* Initialization */
-	ipv6 = (*pskb)->nh.ipv6h;
-	protohdr = (u_int32_t *)((char *)ipv6 + IPV6_HDR_LEN);
-	datalen = (*pskb)->len - IPV6_HDR_LEN;
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
@@ -392,17 +409,19 @@
 		IP_NF_ASSERT(e);
 		IP_NF_ASSERT(back);
 		(*pskb)->nfcache |= e->nfcache;
-		if (ip6_packet_match(*pskb, ipv6, indev, outdev, 
-			&e->ipv6, offset)) {
+		if (ip6_packet_match(*pskb, indev, outdev, 
+			&e->ipv6, &proto, &protoff, &offset)) {
 			struct ip6t_entry_target *t;
 
 			if (IP6T_MATCH_ITERATE(e, do_match,
 					       *pskb, in, out,
-					       offset, protohdr,
-					       datalen, &hotdrop) != 0)
+					       offset, protoff, &hotdrop) != 0)
 				goto no_match;
 
-			ADD_COUNTER(e->counters, ntohs(ipv6->payload_len) + IPV6_HDR_LEN, 1);
+			ADD_COUNTER(e->counters,
+				    ntohs((*pskb)->nh.ipv6h->payload_len)
+					  + IPV6_HDR_LEN,
+				    1);
 
 			t = ip6t_get_target(e);
 			IP_NF_ASSERT(t->u.kernel.target);
@@ -459,9 +478,9 @@
 					= 0x57acc001;
 #endif
 				/* Target might have changed stuff. */
-				ipv6 = (*pskb)->nh.ipv6h;
-				protohdr = (u_int32_t *)((void *)ipv6 + IPV6_HDR_LEN);
-				datalen = (*pskb)->len - IPV6_HDR_LEN;
+				offset = 0;
+				proto = 0;
+				protoff = 0;
 
 				if (verdict == IP6T_CONTINUE)
 					e = (void *)e + e->next_offset;
@@ -1534,23 +1553,25 @@
 
 static int
 tcp_find_option(u_int8_t option,
-		const struct tcphdr *tcp,
-		u_int16_t datalen,
+		const struct sk_buff *skb,
+		unsigned int tcpoff,
+		unsigned int optlen,
 		int invert,
 		int *hotdrop)
 {
-	unsigned int i = sizeof(struct tcphdr);
-	const u_int8_t *opt = (u_int8_t *)tcp;
+	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
+	char opt[60 - sizeof(struct tcphdr)];
+	unsigned int i;
 
 	duprintf("tcp_match: finding option\n");
 	/* If we don't have the whole header, drop packet. */
-	if (tcp->doff * 4 < sizeof(struct tcphdr) ||
-	    tcp->doff * 4 > datalen) {
+	if (skb_copy_bits(skb, tcpoff + sizeof(struct tcphdr),
+			  opt, optlen) < 0) {
 		*hotdrop = 1;
 		return 0;
 	}
 
-	while (i < tcp->doff * 4) {
+	for (i = 0; i < optlen; ) {
 		if (opt[i] == option) return !invert;
 		if (opt[i] < 2) i++;
 		else i += opt[i+1]?:1;
@@ -1565,27 +1586,30 @@
 	  const struct net_device *out,
 	  const void *matchinfo,
 	  int offset,
-	  const void *hdr,
-	  u_int16_t datalen,
+	  unsigned int protoff,
 	  int *hotdrop)
 {
-	const struct tcphdr *tcp;
+	struct tcphdr tcph;
 	const struct ip6t_tcp *tcpinfo = matchinfo;
-	int tcpoff;
-	u8 nexthdr = skb->nh.ipv6h->nexthdr;
-
-	/* To quote Alan:
 
-	   Don't allow a fragment of TCP 8 bytes in. Nobody normal
-	   causes this. Its a cracker trying to break in by doing a
-	   flag overwrite to pass the direction checks.
-	*/
+	if (offset) {
+		/* To quote Alan:
 
-	if (offset == 1) {
-		duprintf("Dropping evil TCP offset=1 frag.\n");
-		*hotdrop = 1;
+		   Don't allow a fragment of TCP 8 bytes in. Nobody normal
+		   causes this. Its a cracker trying to break in by doing a
+		   flag overwrite to pass the direction checks.
+		*/
+		if (offset == 1) {
+			duprintf("Dropping evil TCP offset=1 frag.\n");
+			*hotdrop = 1;
+		}
+		/* Must not be a fragment. */
 		return 0;
-	} else if (offset == 0 && datalen < sizeof(struct tcphdr)) {
+	}
+
+#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
+
+	if (skb_copy_bits(skb, protoff, &tcph, sizeof(tcph)) < 0) {
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil TCP offset=0 tinygram.\n");
@@ -1593,45 +1617,30 @@
 		return 0;
 	}
 
-	tcpoff = (u8*)(skb->nh.ipv6h + 1) - skb->data;
-	tcpoff = ipv6_skip_exthdr(skb, tcpoff, &nexthdr, skb->len - tcpoff);
-	if (tcpoff < 0 || tcpoff > skb->len) {
-		duprintf("tcp_match: cannot skip exthdr. Dropping.\n");
-		*hotdrop = 1;
-		return 0;
-	} else if (nexthdr == IPPROTO_FRAGMENT)
-		return 0;
-	else if (nexthdr != IPPROTO_TCP ||
-		 skb->len - tcpoff < sizeof(struct tcphdr)) {
-		/* cannot be occured */
-		duprintf("tcp_match: cannot get TCP header. Dropping.\n");
-		*hotdrop = 1;
-		return 0;
+	if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
+			ntohs(tcph.source),
+			!!(tcpinfo->invflags & IP6T_TCP_INV_SRCPT)))
+		return 0;
+	if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
+			ntohs(tcph.dest),
+			!!(tcpinfo->invflags & IP6T_TCP_INV_DSTPT)))
+		return 0;
+	if (!FWINVTCP((((unsigned char *)&tcph)[13] & tcpinfo->flg_mask)
+		      == tcpinfo->flg_cmp,
+		      IP6T_TCP_INV_FLAGS))
+		return 0;
+	if (tcpinfo->option) {
+		if (tcph.doff * 4 < sizeof(tcph)) {
+			*hotdrop = 1;
+			return 0;
+		}
+		if (!tcp_find_option(tcpinfo->option, skb, protoff,
+				     tcph.doff*4 - sizeof(tcph),
+				     tcpinfo->invflags & IP6T_TCP_INV_OPTION,
+				     hotdrop))
+			return 0;
 	}
-
-	tcp = (struct tcphdr *)(skb->data + tcpoff);
-
-	/* FIXME: Try tcp doff >> packet len against various stacks --RR */
-
-#define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
-
-	/* Must not be a fragment. */
-	return !offset
-		&& port_match(tcpinfo->spts[0], tcpinfo->spts[1],
-			      ntohs(tcp->source),
-			      !!(tcpinfo->invflags & IP6T_TCP_INV_SRCPT))
-		&& port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
-			      ntohs(tcp->dest),
-			      !!(tcpinfo->invflags & IP6T_TCP_INV_DSTPT))
-		&& FWINVTCP((((unsigned char *)tcp)[13]
-			     & tcpinfo->flg_mask)
-			    == tcpinfo->flg_cmp,
-			    IP6T_TCP_INV_FLAGS)
-		&& (!tcpinfo->option
-		    || tcp_find_option(tcpinfo->option, tcp, datalen,
-				       tcpinfo->invflags
-				       & IP6T_TCP_INV_OPTION,
-				       hotdrop));
+	return 1;
 }
 
 /* Called when user tries to insert an entry of this type. */
@@ -1657,16 +1666,17 @@
 	  const struct net_device *out,
 	  const void *matchinfo,
 	  int offset,
-	  const void *hdr,
-	  u_int16_t datalen,
+	  unsigned int protoff,
 	  int *hotdrop)
 {
-	const struct udphdr *udp;
+	struct udphdr udph;
 	const struct ip6t_udp *udpinfo = matchinfo;
-	int udpoff;
-	u8 nexthdr = skb->nh.ipv6h->nexthdr;
 
-	if (offset == 0 && datalen < sizeof(struct udphdr)) {
+	/* Must not be a fragment. */
+	if (offset)
+		return 0;
+
+	if (skb_copy_bits(skb, protoff, &udph, sizeof(udph)) < 0) {
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil UDP tinygram.\n");
@@ -1674,30 +1684,11 @@
 		return 0;
 	}
 
-	udpoff = (u8*)(skb->nh.ipv6h + 1) - skb->data;
-	udpoff = ipv6_skip_exthdr(skb, udpoff, &nexthdr, skb->len - udpoff);
-	if (udpoff < 0 || udpoff > skb->len) {
-		duprintf("udp_match: cannot skip exthdr. Dropping.\n");
-		*hotdrop = 1;
-		return 0;
-	} else if (nexthdr == IPPROTO_FRAGMENT)
-		return 0;
-	else if (nexthdr != IPPROTO_UDP ||
-		 skb->len - udpoff < sizeof(struct udphdr)) {
-		duprintf("udp_match: cannot get UDP header. Dropping.\n");
-		*hotdrop = 1;
-		return 0;
-	}
-
-	udp = (struct udphdr *)(skb->data + udpoff);
-
-	/* Must not be a fragment. */
-	return !offset
-		&& port_match(udpinfo->spts[0], udpinfo->spts[1],
-			      ntohs(udp->source),
-			      !!(udpinfo->invflags & IP6T_UDP_INV_SRCPT))
+	return port_match(udpinfo->spts[0], udpinfo->spts[1],
+			  ntohs(udph.source),
+			  !!(udpinfo->invflags & IP6T_UDP_INV_SRCPT))
 		&& port_match(udpinfo->dpts[0], udpinfo->dpts[1],
-			      ntohs(udp->dest),
+			      ntohs(udph.dest),
 			      !!(udpinfo->invflags & IP6T_UDP_INV_DSTPT));
 }
 
@@ -1747,14 +1738,17 @@
 	   const struct net_device *out,
 	   const void *matchinfo,
 	   int offset,
-	   const void *hdr,
-	   u_int16_t datalen,
+	   unsigned int protoff,
 	   int *hotdrop)
 {
-	const struct icmp6hdr *icmp = hdr;
+	struct icmp6hdr icmp;
 	const struct ip6t_icmp *icmpinfo = matchinfo;
 
-	if (offset == 0 && datalen < 2) {
+	/* Must not be a fragment. */
+	if (offset)
+		return 0;
+
+	if (skb_copy_bits(skb, protoff, &icmp, sizeof(icmp)) < 0) {
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil ICMP tinygram.\n");
@@ -1762,13 +1756,11 @@
 		return 0;
 	}
 
-	/* Must not be a fragment. */
-	return !offset
-		&& icmp6_type_code_match(icmpinfo->type,
-					icmpinfo->code[0],
-					icmpinfo->code[1],
-					icmp->icmp6_type, icmp->icmp6_code,
-					!!(icmpinfo->invflags&IP6T_ICMP_INV));
+	return icmp6_type_code_match(icmpinfo->type,
+				     icmpinfo->code[0],
+				     icmpinfo->code[1],
+				     icmp.icmp6_type, icmp.icmp6_code,
+				     !!(icmpinfo->invflags&IP6T_ICMP_INV));
 }
 
 /* Called when user tries to insert an entry of this type. */
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_ah.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_ah.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_ah.c	2004-06-16 14:18:58.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_ah.c	2004-06-23 23:45:43.000000000 +0900
@@ -45,8 +45,7 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *protohdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
        struct ip_auth_hdr *ah = NULL;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_dst.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_dst.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_dst.c	2004-06-16 14:18:56.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_dst.c	2004-06-23 23:45:43.000000000 +0900
@@ -60,8 +60,7 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *protohdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
        struct ipv6_opt_hdr *optsh = NULL;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_esp.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_esp.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_esp.c	2004-06-16 14:19:36.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_esp.c	2004-06-23 23:45:43.000000000 +0900
@@ -45,8 +45,7 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *protohdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
 	struct ip_esp_hdr *esp = NULL;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_eui64.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_eui64.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_eui64.c	2004-06-16 14:18:52.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_eui64.c	2004-06-23 23:45:43.000000000 +0900
@@ -24,8 +24,7 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *hdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
 
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_frag.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_frag.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_frag.c	2004-06-16 14:19:01.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_frag.c	2004-06-23 23:45:43.000000000 +0900
@@ -70,8 +70,7 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *protohdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
        struct fraghdr *frag = NULL;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_hbh.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_hbh.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_hbh.c	2004-06-16 14:19:52.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_hbh.c	2004-06-23 23:45:43.000000000 +0900
@@ -59,8 +59,7 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *protohdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
        struct ipv6_opt_hdr *optsh = NULL;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_hl.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_hl.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_hl.c	2004-06-16 14:19:42.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_hl.c	2004-06-23 23:45:43.000000000 +0900
@@ -20,7 +20,7 @@
 
 static int match(const struct sk_buff *skb, const struct net_device *in,
 		 const struct net_device *out, const void *matchinfo,
-		 int offset, const void *hdr, u_int16_t datalen,
+		 int offset, unsigned int protoff,
 		 int *hotdrop)
 {
 	const struct ip6t_hl_info *info = matchinfo;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_ipv6header.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_ipv6header.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_ipv6header.c	2004-06-16 14:20:26.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_ipv6header.c	2004-06-23 23:45:43.000000000 +0900
@@ -31,8 +31,7 @@
 		 const struct net_device *out,
 		 const void *matchinfo,
 		 int offset,
-		 const void *protohdr,
-		 u_int16_t datalen,
+		 unsigned int protoff,
 		 int *hotdrop)
 {
 	const struct ip6t_ipv6header_info *info = matchinfo;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_length.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_length.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_length.c	2004-06-16 14:20:16.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_length.c	2004-06-23 23:45:43.000000000 +0900
@@ -23,8 +23,7 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *hdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
 	const struct ip6t_length_info *info = matchinfo;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_limit.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_limit.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_limit.c	2004-06-16 14:19:02.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_limit.c	2004-06-23 23:45:43.000000000 +0900
@@ -57,8 +57,7 @@
 		const struct net_device *out,
 		const void *matchinfo,
 		int offset,
-		const void *hdr,
-		u_int16_t datalen,
+		unsigned int protoff,
 		int *hotdrop)
 {
 	struct ip6t_rateinfo *r = ((struct ip6t_rateinfo *)matchinfo)->master;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_mac.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_mac.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_mac.c	2004-06-16 14:20:03.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_mac.c	2004-06-23 23:45:43.000000000 +0900
@@ -25,8 +25,7 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *hdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
     const struct ip6t_mac_info *info = matchinfo;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_mark.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_mark.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_mark.c	2004-06-16 14:20:26.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_mark.c	2004-06-23 23:45:43.000000000 +0900
@@ -24,8 +24,7 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *hdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
 	const struct ip6t_mark_info *info = matchinfo;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_multiport.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_multiport.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_multiport.c	2004-06-16 14:20:26.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_multiport.c	2004-06-23 23:45:43.000000000 +0900
@@ -53,15 +53,14 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *hdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
-	const struct udphdr *udp = hdr;
+	const struct udphdr *udp = (const struct udphdr *)(skb->data + protoff);
 	const struct ip6t_multiport *multiinfo = matchinfo;
 
 	/* Must be big enough to read ports. */
-	if (offset == 0 && datalen < sizeof(struct udphdr)) {
+	if (offset == 0 && skb->len - protoff < sizeof(struct udphdr)) {
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 			duprintf("ip6t_multiport:"
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_owner.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_owner.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_owner.c	2004-06-16 14:19:52.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_owner.c	2004-06-23 23:45:43.000000000 +0900
@@ -92,8 +92,7 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *hdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
 	const struct ip6t_owner_info *info = matchinfo;
diff -Nur linux-2.6.7/net/ipv6/netfilter/ip6t_rt.c linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_rt.c
--- linux-2.6.7/net/ipv6/netfilter/ip6t_rt.c	2004-06-16 14:19:02.000000000 +0900
+++ linux-2.6.7-ip6tables/net/ipv6/netfilter/ip6t_rt.c	2004-06-23 23:45:43.000000000 +0900
@@ -47,8 +47,7 @@
       const struct net_device *out,
       const void *matchinfo,
       int offset,
-      const void *protohdr,
-      u_int16_t datalen,
+      unsigned int protoff,
       int *hotdrop)
 {
        struct ipv6_rt_hdr *route = NULL;

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24  4:04 [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization Yasuyuki Kozakai
@ 2004-06-24  8:13 ` Andras Kis-Szabo
  2004-06-24 10:12   ` Yasuyuki Kozakai
  2004-06-24 11:26 ` Patrick McHardy
  1 sibling, 1 reply; 29+ messages in thread
From: Andras Kis-Szabo @ 2004-06-24  8:13 UTC (permalink / raw)
  To: Yasuyuki Kozakai; +Cc: netfilter-devel, Harald Welte, usagi-core

Hi Yasuyuki,

you saved my life :)
When I tried to do the same a was not able to track behaviour of
skb->nh. (There was some grey point, like when the packet leaves the
machine over the tunnel interface.)

> "protoff" is the offset of transport protocol header from skb->data.
> match modules can get the transport protocol header without skipping IPv6
> extension headers.
(Its OK, since we have some code which uses the extension headers.)

> 	- In all match module, the only arguments of match() are changed.
I think it is not a problem since the ip6tables - as I know - not a
'core' product. We can break the compatibility / usability of these
extensions. (And it will push us to fix the problems and sync the
matches with the IPv4 parts.)
Could you add an #error string right after the definitions in the
matches?
In this case the compilation of the matches will cause strange error
messages - and someone have to answer all of them. An error message
should help a little bit. (Eg.: Known error point. This part is under
reconstruction. Please, be patient. etc.)

Many thanks again,

Best regards,

Andras

-- 
Andras Kis-Szabo <kisza@securityaudit.hu>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24  8:13 ` Andras Kis-Szabo
@ 2004-06-24 10:12   ` Yasuyuki Kozakai
  2004-06-24 10:24     ` Jozsef Kadlecsik
  0 siblings, 1 reply; 29+ messages in thread
From: Yasuyuki Kozakai @ 2004-06-24 10:12 UTC (permalink / raw)
  To: kisza; +Cc: netfilter-devel, laforge, usagi-core


Hi, Andras,

From: Andras Kis-Szabo <kisza@securityaudit.hu>
Date: Thu, 24 Jun 2004 10:13:50 +0200

> Hi Yasuyuki,
> 
> you saved my life :)

I'm glad.

> When I tried to do the same a was not able to track behaviour of
> skb->nh. (There was some grey point, like when the packet leaves the
> machine over the tunnel interface.)

Tunnel interfaces reset nh.raw to new address of network protocol header.
Otherwide kernel can be down in many parts of IPv6 stack.

BTW, if I remember correctly, you also tryed to rewrite match modules which
find extension headers. Have you already finished ? If so, you'll save my life,
too :)

> > 	- In all match module, the only arguments of match() are changed.
> I think it is not a problem since the ip6tables - as I know - not a
> 'core' product. We can break the compatibility / usability of these
> extensions. (And it will push us to fix the problems and sync the
> matches with the IPv4 parts.)
> Could you add an #error string right after the definitions in the
> matches?
>
> In this case the compilation of the matches will cause strange error
> messages - and someone have to answer all of them. An error message
> should help a little bit. (Eg.: Known error point. This part is under
> reconstruction. Please, be patient. etc.)

my patch doesn't remove skb_linearize() and I'll keep it until all IPv6
modules in main kernel has changed. and if match module uses old API,
compilation errors are occurred and user can notice and report that.
Then I think adding #error string to match module is not needed.

But I'm reminded by you that target modules doesn't cause compiling error.

OK, I try to solve this problem... But how same problem was managed when
removing skb_linearize() from "ip_tables.c" ? Maybe the order of arguments
of target() were changed ?

Anyone knows about that ?

-----------------------------------------------------------------
Yasuyuki KOZAKAI @ USAGI Project <yasuyuki.kozakai@toshiba.co.jp>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24 10:12   ` Yasuyuki Kozakai
@ 2004-06-24 10:24     ` Jozsef Kadlecsik
  2004-06-24 10:35       ` Yasuyuki Kozakai
  0 siblings, 1 reply; 29+ messages in thread
From: Jozsef Kadlecsik @ 2004-06-24 10:24 UTC (permalink / raw)
  To: Yasuyuki Kozakai; +Cc: kisza, netfilter-devel, laforge, usagi-core

On Thu, 24 Jun 2004, Yasuyuki Kozakai wrote:

> OK, I try to solve this problem... But how same problem was managed when
> removing skb_linearize() from "ip_tables.c" ? Maybe the order of arguments
> of target() were changed ?

Exactly!

/* Registration hooks for targets. */
struct ipt_target
{
[...]
        /* Returns verdict.  Argument order changed since 2.4, as this
           must now handle non-linear skbs, using skb_copy_bits and
           skb_ip_make_writable. */
        unsigned int (*target)(struct sk_buff **pskb,
[...]

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24 10:24     ` Jozsef Kadlecsik
@ 2004-06-24 10:35       ` Yasuyuki Kozakai
  0 siblings, 0 replies; 29+ messages in thread
From: Yasuyuki Kozakai @ 2004-06-24 10:35 UTC (permalink / raw)
  To: kadlec; +Cc: yasuyuki.kozakai, kisza, netfilter-devel, laforge, usagi-core


Jozsef, thank you for your information!

I'll change target() in ip6_tables.h like that.

-----------------------------------------------------------------
Yasuyuki KOZAKAI @ USAGI Project <yasuyuki.kozakai@toshiba.co.jp>

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Subject: Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
Date: Thu, 24 Jun 2004 12:24:15 +0200 (CEST)

> On Thu, 24 Jun 2004, Yasuyuki Kozakai wrote:
> 
> > OK, I try to solve this problem... But how same problem was managed when
> > removing skb_linearize() from "ip_tables.c" ? Maybe the order of arguments
> > of target() were changed ?
> 
> Exactly!
> 
> /* Registration hooks for targets. */
> struct ipt_target
> {
> [...]
>         /* Returns verdict.  Argument order changed since 2.4, as this
>            must now handle non-linear skbs, using skb_copy_bits and
>            skb_ip_make_writable. */
>         unsigned int (*target)(struct sk_buff **pskb,
> [...]
> 
> Best regards,
> Jozsef
> -
> E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
> PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
> Address : KFKI Research Institute for Particle and Nuclear Physics
>           H-1525 Budapest 114, POB. 49, Hungary
> 
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24  4:04 [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization Yasuyuki Kozakai
  2004-06-24  8:13 ` Andras Kis-Szabo
@ 2004-06-24 11:26 ` Patrick McHardy
  2004-06-24 11:50   ` Jozsef Kadlecsik
  2004-06-25  9:53   ` Harald Welte
  1 sibling, 2 replies; 29+ messages in thread
From: Patrick McHardy @ 2004-06-24 11:26 UTC (permalink / raw)
  To: Yasuyuki Kozakai; +Cc: netfilter-devel, laforge, kisza, usagi-core

Yasuyuki Kozakai wrote:
> Hi, folks,
> 
> In the current kernel, skb is linearized by skb_linearize() in ip6_tables.c.
> I suggest removing this, and this patch is the 1st step to do it.
> 
> To remove skb_linearize(), this patch changes the API of match() like
> ip_tables.h

I'm not sure the way iptables does it is really the right way. We call
skb_copy_bits for anything that needs to be matched after the ip_header.
Think of 100 rules matching "-p tcp --dport X". We copy the tcp header
100 times, for a total of 2000 bytes. One call to skb_linearize would
most likely be less expensive. I'm thinking about putting the copied
protocol header in the control buffer, this would reduce this extensive
copying a lot. We could also do some common preprocessing steps in one
place, like converting things to host byte order.

Comments anyone ?

Regards
Patrick

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24 11:26 ` Patrick McHardy
@ 2004-06-24 11:50   ` Jozsef Kadlecsik
  2004-06-24 13:04     ` Yasuyuki Kozakai
  2004-06-25  9:53   ` Harald Welte
  1 sibling, 1 reply; 29+ messages in thread
From: Jozsef Kadlecsik @ 2004-06-24 11:50 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Yasuyuki Kozakai, netfilter-devel, laforge, kisza, usagi-core

On Thu, 24 Jun 2004, Patrick McHardy wrote:

> Yasuyuki Kozakai wrote:
> > Hi, folks,
> >
> > In the current kernel, skb is linearized by skb_linearize() in ip6_tables.c.
> > I suggest removing this, and this patch is the 1st step to do it.
> >
> > To remove skb_linearize(), this patch changes the API of match() like
> > ip_tables.h
>
> I'm not sure the way iptables does it is really the right way. We call
> skb_copy_bits for anything that needs to be matched after the ip_header.
> Think of 100 rules matching "-p tcp --dport X". We copy the tcp header
> 100 times, for a total of 2000 bytes. One call to skb_linearize would
> most likely be less expensive. I'm thinking about putting the copied
> protocol header in the control buffer, this would reduce this extensive
> copying a lot. We could also do some common preprocessing steps in one
> place, like converting things to host byte order.

Wouldn't an skb_ip[6]_make_headers_writable function based on
skb_ip_make_writable more optimal?

The function would make the full IP[v6], transport protocol (and IPv6
option protocol) headers writable and could be called from nf_hook_slow.

Opinions?

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24 11:50   ` Jozsef Kadlecsik
@ 2004-06-24 13:04     ` Yasuyuki Kozakai
  2004-06-24 13:25       ` Jozsef Kadlecsik
  0 siblings, 1 reply; 29+ messages in thread
From: Yasuyuki Kozakai @ 2004-06-24 13:04 UTC (permalink / raw)
  To: kadlec; +Cc: kaber, yasuyuki.kozakai, netfilter-devel, laforge, kisza,
	usagi-core


Hi,

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 24 Jun 2004 13:50:27 +0200 (CEST)

> On Thu, 24 Jun 2004, Patrick McHardy wrote:
>
> > I'm not sure the way iptables does it is really the right way. We call
> > skb_copy_bits for anything that needs to be matched after the ip_header.
> > Think of 100 rules matching "-p tcp --dport X". We copy the tcp header
> > 100 times, for a total of 2000 bytes. One call to skb_linearize would
> > most likely be less expensive. I'm thinking about putting the copied
> > protocol header in the control buffer, this would reduce this extensive
> > copying a lot. We could also do some common preprocessing steps in one
> > place, like converting things to host byte order.
> 
> Wouldn't an skb_ip[6]_make_headers_writable function based on
> skb_ip_make_writable more optimal?

I think so in the case of IPv4.

> The function would make the full IP[v6], transport protocol (and IPv6
> option protocol) headers writable and could be called from nf_hook_slow.
> 
> Opinions?

skb is not required to be writable. if skb_headlen(skb) is greater than
the length required to be linear, copying is not needed even though skb is
shared or clone. And one issue is that the argument "skb" in match() is const.
Can we really remove it ?

And I'm not sure about IPv6. At least, I don't like to skip IPv6 extension
header in skb_ip6_make_writable() to figure out that skb should be copied
or pulled.

Regards,
-----------------------------------------------------------------
Yasuyuki KOZAKAI @ USAGI Project <yasuyuki.kozakai@toshiba.co.jp>

> 
> Best regards,
> Jozsef
> -
> E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
> PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
> Address : KFKI Research Institute for Particle and Nuclear Physics
>           H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24 13:04     ` Yasuyuki Kozakai
@ 2004-06-24 13:25       ` Jozsef Kadlecsik
  2004-06-24 13:48         ` (usagi-core 18584) " YOSHIFUJI Hideaki / 吉藤英明
  2004-06-24 15:06         ` Yasuyuki Kozakai
  0 siblings, 2 replies; 29+ messages in thread
From: Jozsef Kadlecsik @ 2004-06-24 13:25 UTC (permalink / raw)
  To: Yasuyuki Kozakai
  Cc: Patrick McHardy, netfilter-devel, Harald Welte, kisza, usagi-core

Hi,

On Thu, 24 Jun 2004, Yasuyuki Kozakai wrote:

> From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
> Date: Thu, 24 Jun 2004 13:50:27 +0200 (CEST)
>
> > On Thu, 24 Jun 2004, Patrick McHardy wrote:
> >
> > > I'm not sure the way iptables does it is really the right way. We call
> > > skb_copy_bits for anything that needs to be matched after the ip_header.
> > > Think of 100 rules matching "-p tcp --dport X". We copy the tcp header
> > > 100 times, for a total of 2000 bytes. One call to skb_linearize would
> > > most likely be less expensive. I'm thinking about putting the copied
> > > protocol header in the control buffer, this would reduce this extensive
> > > copying a lot. We could also do some common preprocessing steps in one
> > > place, like converting things to host byte order.
> >
> > Wouldn't an skb_ip[6]_make_headers_writable function based on
> > skb_ip_make_writable more optimal?
>
> I think so in the case of IPv4.
>
> > The function would make the full IP[v6], transport protocol (and IPv6
> > option protocol) headers writable and could be called from nf_hook_slow.
>
> skb is not required to be writable. if skb_headlen(skb) is greater than
> the length required to be linear, copying is not needed even though skb is
> shared or clone.

Your're of course right. For matching we don't really need the headers to
be writable, linear is just enough. However mangle and NAT requires
writable headers because typically we modify the headers only (let's not
consider the conntrack/NAT protocol helpers).

> And one issue is that the argument "skb" in match() is const. Can we
> really remove it ?

I'm confused: why should we remove it? The function would be called
directly from nf_hook_slow and not from the matches.

> And I'm not sure about IPv6. At least, I don't like to skip IPv6 extension
> header in skb_ip6_make_writable() to figure out that skb should be copied
> or pulled.

I'm unfamiliar with the deep magic behind IPv6: I simply assumed that
linear headers up to the transport protocol headers in IPv6 would just be
fine and enough for the matches. That'd require an ugly processing header
by header in skb_ip6_make_headers_linear ;-), I think.

I'll try to cook some code to see how it'd come out.

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: (usagi-core 18584) Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24 13:25       ` Jozsef Kadlecsik
@ 2004-06-24 13:48         ` YOSHIFUJI Hideaki / 吉藤英明
  2004-06-24 15:06         ` Yasuyuki Kozakai
  1 sibling, 0 replies; 29+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2004-06-24 13:48 UTC (permalink / raw)
  To: usagi-core, kadlec
  Cc: yasuyuki.kozakai, kaber, netfilter-devel, laforge, kisza

In article <Pine.LNX.4.33.0406241510140.8190-100000@blackhole.kfki.hu> (at Thu, 24 Jun 2004 15:25:54 +0200 (CEST)), Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> says:

> I'm unfamiliar with the deep magic behind IPv6: I simply assumed that
> linear headers up to the transport protocol headers in IPv6 would just be
> fine and enough for the matches. That'd require an ugly processing header
> by header in skb_ip6_make_headers_linear ;-), I think.

It is full of beauty to process header by header.
Skip processing extension header is BAD idea.
You cannot (and shouldn't) see next header (or payload)
until you really process the current header.

--yoshfuji

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24 13:25       ` Jozsef Kadlecsik
  2004-06-24 13:48         ` (usagi-core 18584) " YOSHIFUJI Hideaki / 吉藤英明
@ 2004-06-24 15:06         ` Yasuyuki Kozakai
  2004-06-24 16:50           ` Patrick McHardy
  1 sibling, 1 reply; 29+ messages in thread
From: Yasuyuki Kozakai @ 2004-06-24 15:06 UTC (permalink / raw)
  To: kadlec; +Cc: yasuyuki.kozakai, kaber, netfilter-devel, laforge, kisza,
	usagi-core


Hi,

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Thu, 24 Jun 2004 15:25:54 +0200 (CEST)

> > > The function would make the full IP[v6], transport protocol (and IPv6
> > > option protocol) headers writable and could be called from nf_hook_slow.
> >
> > skb is not required to be writable. if skb_headlen(skb) is greater than
> > the length required to be linear, copying is not needed even though skb is
> > shared or clone.
> 
> Your're of course right. For matching we don't really need the headers to
> be writable, linear is just enough. However mangle and NAT requires
> writable headers because typically we modify the headers only (let's not
> consider the conntrack/NAT protocol helpers).
> 
> > And one issue is that the argument "skb" in match() is const. Can we
> > really remove it ?
> 
> I'm confused: why should we remove it? The function would be called
> directly from nf_hook_slow and not from the matches.

Sorry, I misstook. But why nf_hook_slow() ? I think users expect that
skb is not linearized if ip_tables.ko is not loaded.
Then how about ipt_do_table() or hooks in iptable_filter.c ?

> > And I'm not sure about IPv6. At least, I don't like to skip IPv6 extension
> > header in skb_ip6_make_writable() to figure out that skb should be copied
> > or pulled.
> 
> I'm unfamiliar with the deep magic behind IPv6: I simply assumed that
> linear headers up to the transport protocol headers in IPv6 would just be
> fine and enough for the matches. That'd require an ugly processing header
> by header in skb_ip6_make_headers_linear ;-), I think.

In the first place, I don't understand why skb_ip_make_writable() checks
required length is up to the transport protocol header or not.
If skb_ip6_make_headers_linear() will check that too, extension headers are
skipped many times. That's why I don't like it. I want to reduce the number of
times of skipping extension headers as possible.

Regards,

> I'll try to cook some code to see how it'd come out.
> 
> Best regards,
> Jozsef
> -
> E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
> PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
> Address : KFKI Research Institute for Particle and Nuclear Physics
>           H-1525 Budapest 114, POB. 49, Hungary

-----------------------------------------------------------------
Yasuyuki KOZAKAI @ USAGI Project <yasuyuki.kozakai@toshiba.co.jp>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24 15:06         ` Yasuyuki Kozakai
@ 2004-06-24 16:50           ` Patrick McHardy
  2004-06-25  4:57             ` Yasuyuki Kozakai
  0 siblings, 1 reply; 29+ messages in thread
From: Patrick McHardy @ 2004-06-24 16:50 UTC (permalink / raw)
  To: Yasuyuki Kozakai; +Cc: kadlec, netfilter-devel, laforge, kisza, usagi-core

Yasuyuki Kozakai wrote:

>>I'm confused: why should we remove it? The function would be called
>>directly from nf_hook_slow and not from the matches.
>>
>
>Sorry, I misstook. But why nf_hook_slow() ? I think users expect that
>skb is not linearized if ip_tables.ko is not loaded.
>Then how about ipt_do_table() or hooks in iptable_filter.c ?
>
It would be nice if the skb_copy_bits calls in conntrack and nat copying
only the transport protocol headers could be removed as well. We could
register for the PREROUTING hook with highest priority and linearize the
headers when a global flag is set. ip_conntrack and ip_tables would both
set this flag.

>>>And I'm not sure about IPv6. At least, I don't like to skip IPv6 extension
>>>header in skb_ip6_make_writable() to figure out that skb should be copied
>>>or pulled.
>>>
>>I'm unfamiliar with the deep magic behind IPv6: I simply assumed that
>>linear headers up to the transport protocol headers in IPv6 would just be
>>fine and enough for the matches. That'd require an ugly processing header
>>by header in skb_ip6_make_headers_linear ;-), I think.
>>
>
>In the first place, I don't understand why skb_ip_make_writable() checks
>required length is up to the transport protocol header or not.
>If skb_ip6_make_headers_linear() will check that too, extension headers are
>skipped many times. That's why I don't like it. I want to reduce the number of
>times of skipping extension headers as possible.
>
I also don't understand this, I thought the linear part of an skb is
skb_headlen(skb), why the distinction between different protocols ?

Regards
Patrick

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24 16:50           ` Patrick McHardy
@ 2004-06-25  4:57             ` Yasuyuki Kozakai
  2004-06-25 10:01               ` Jozsef Kadlecsik
  0 siblings, 1 reply; 29+ messages in thread
From: Yasuyuki Kozakai @ 2004-06-25  4:57 UTC (permalink / raw)
  To: kaber; +Cc: kadlec, netfilter-devel, laforge, kisza, usagi-core


Hi, 

From: Patrick McHardy <kaber@trash.net>
Date: Thu, 24 Jun 2004 18:50:32 +0200

> Yasuyuki Kozakai wrote:
> >
> >Sorry, I misstook. But why nf_hook_slow() ? I think users expect that
> >skb is not linearized if ip_tables.ko is not loaded.
> >Then how about ipt_do_table() or hooks in iptable_filter.c ?
> >
> It would be nice if the skb_copy_bits calls in conntrack and nat copying
> only the transport protocol headers could be removed as well. We could
> register for the PREROUTING hook with highest priority and linearize the
> headers when a global flag is set. ip_conntrack and ip_tables would both
> set this flag.

flag is good idea. But in the case of IPv6, the fact remains that
skipping IPv6 ext headers are needed to find transport protocol.

> >In the first place, I don't understand why skb_ip_make_writable() checks
> >required length is up to the transport protocol header or not.
> >If skb_ip6_make_headers_linear() will check that too, extension headers are
> >skipped many times. That's why I don't like it. I want to reduce the number of
> >times of skipping extension headers as possible.
> >
> I also don't understand this, I thought the linear part of an skb is
> skb_headlen(skb), why the distinction between different protocols ?

It looks like this function was introduced in 2.5.70 by Rusty Russel.

Anyway, I think the function which requires to skip IPv6 extension headers
should not be called in nf_hook_slow() since skipping is occurred 2 times
- nf_hook_slow() and ip6_do_table(). It's better to linearize header by header
while finding transport protocol header in ip6_do_table().

> Regards
> Patrick

Regards,
-----------------------------------------------------------------
Yasuyuki KOZAKAI @ USAGI Project <yasuyuki.kozakai@toshiba.co.jp>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-24 11:26 ` Patrick McHardy
  2004-06-24 11:50   ` Jozsef Kadlecsik
@ 2004-06-25  9:53   ` Harald Welte
  2004-06-28 20:31     ` Patrick McHardy
  2004-07-06 10:20     ` Patrick McHardy
  1 sibling, 2 replies; 29+ messages in thread
From: Harald Welte @ 2004-06-25  9:53 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Yasuyuki Kozakai, netfilter-devel, kisza, usagi-core

On Thu, Jun 24, 2004 at 01:26:10PM +0200, Patrick McHardy wrote:
> Yasuyuki Kozakai wrote:
> >Hi, folks,
> >
> >In the current kernel, skb is linearized by skb_linearize() in 
> >ip6_tables.c.
> >I suggest removing this, and this patch is the 1st step to do it.
> >
> >To remove skb_linearize(), this patch changes the API of match() like
> >ip_tables.h
> 
> I'm not sure the way iptables does it is really the right way. We call
> skb_copy_bits for anything that needs to be matched after the ip_header.

yes.

> Think of 100 rules matching "-p tcp --dport X". We copy the tcp header
> 100 times, for a total of 2000 bytes. 

but we're talking about local copies to the stack.  those copies are 20
byte copies to the cache, and most likely would never have to hit memory
at all.  

> One call to skb_linearize would most likely be less expensive. I'm
> thinking about putting the copied protocol header in the control
> buffer, this would reduce this extensive copying a lot. We could also
> do some common preprocessing steps in one place, like converting
> things to host byte order.

yes, it would be only one copy... but to real memory..

> Comments anyone ?

Maybe we need some benchmarking on this.

> Regards
> Patrick

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-25  4:57             ` Yasuyuki Kozakai
@ 2004-06-25 10:01               ` Jozsef Kadlecsik
  2004-06-26  7:25                 ` Yasuyuki Kozakai
  2004-07-21 21:36                 ` Harald Welte
  0 siblings, 2 replies; 29+ messages in thread
From: Jozsef Kadlecsik @ 2004-06-25 10:01 UTC (permalink / raw)
  To: Yasuyuki Kozakai
  Cc: Patrick McHardy, netfilter-devel, Harald Welte, kisza, usagi-core

Hi,

On Fri, 25 Jun 2004, Yasuyuki Kozakai wrote:

> > >Sorry, I misstook. But why nf_hook_slow() ? I think users expect that
> > >skb is not linearized if ip_tables.ko is not loaded.
> > >Then how about ipt_do_table() or hooks in iptable_filter.c ?
> > >
> > It would be nice if the skb_copy_bits calls in conntrack and nat copying
> > only the transport protocol headers could be removed as well. We could
> > register for the PREROUTING hook with highest priority and linearize the
> > headers when a global flag is set. ip_conntrack and ip_tables would both
> > set this flag.
>
> flag is good idea. But in the case of IPv6, the fact remains that
> skipping IPv6 ext headers are needed to find transport protocol.

- conntrack requires linearized protocol headers
	- with the TCP window tracking code it means the complete
	  TCP header, including the options due to the SACK support
- nat requires writable protocol headers, including the TCP options
  due to the SACK support
- raw/filter tables require linearized protocol headers in general (we can
  safely assume port matching rules :-)
- mark table requires writable protocol headers if we mangle the packet
  headers

So I think when any component of netfilter is enabled, we can assume that
at least linearized protocol headers are required. Therefore I suggested
to add such a function to nf_hook_slow.

We cannot add (back) a general skb_linearize: that was a constant complain
against netfilter from performance reason and was therefore removed.

In order to spare skipping again and again the extension headers in IPv6,
wouldn't it make sense to add a new argument to the hook functions which
argument would point then to the protocol header?

Best regards,
Jozsef
-
E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
Address : KFKI Research Institute for Particle and Nuclear Physics
          H-1525 Budapest 114, POB. 49, Hungary

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-25 10:01               ` Jozsef Kadlecsik
@ 2004-06-26  7:25                 ` Yasuyuki Kozakai
  2004-07-21 21:36                 ` Harald Welte
  1 sibling, 0 replies; 29+ messages in thread
From: Yasuyuki Kozakai @ 2004-06-26  7:25 UTC (permalink / raw)
  To: kadlec; +Cc: yasuyuki.kozakai, kaber, netfilter-devel, laforge, kisza,
	usagi-core


Hi,

From: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu>
Date: Fri, 25 Jun 2004 12:01:37 +0200 (CEST)

> > > >Sorry, I misstook. But why nf_hook_slow() ? I think users expect that
> > > >skb is not linearized if ip_tables.ko is not loaded.
> > > >Then how about ipt_do_table() or hooks in iptable_filter.c ?
> > > >
> > > It would be nice if the skb_copy_bits calls in conntrack and nat copying
> > > only the transport protocol headers could be removed as well. We could
> > > register for the PREROUTING hook with highest priority and linearize the
> > > headers when a global flag is set. ip_conntrack and ip_tables would both
> > > set this flag.
> >
> > flag is good idea. But in the case of IPv6, the fact remains that
> > skipping IPv6 ext headers are needed to find transport protocol.
> 
> - conntrack requires linearized protocol headers
> 	- with the TCP window tracking code it means the complete
> 	  TCP header, including the options due to the SACK support
> - nat requires writable protocol headers, including the TCP options
>   due to the SACK support
> - raw/filter tables require linearized protocol headers in general (we can
>   safely assume port matching rules :-)
> - mark table requires writable protocol headers if we mangle the packet
>   headers

That's right at this point.

> So I think when any component of netfilter is enabled, we can assume that
> at least linearized protocol headers are required. Therefore I suggested
> to add such a function to nf_hook_slow.

As you said, I think nf_hook_slow is best point for IPv4 modules.
And I found that the issue of IPv6 can be solved by adding flag to the
argument nf_hook_register(). This flag tell whether request to linearize
or not.

And I have a (maybe) last issue. Are we allowed to add protocol dependent
codes to nf_hook_slow() ?

> We cannot add (back) a general skb_linearize: that was a constant complain
> against netfilter from performance reason and was therefore removed.

I think so, too.

> In order to spare skipping again and again the extension headers in IPv6,
> wouldn't it make sense to add a new argument to the hook functions which
> argument would point then to the protocol header?

If so, the transport protocol number is needed, too. You can find the
transport protocol number at Next Header field in previous header.

> Best regards,
> Jozsef
> -
> E-mail  : kadlec@blackhole.kfki.hu, kadlec@sunserv.kfki.hu
> PGP key : http://www.kfki.hu/~kadlec/pgp_public_key.txt
> Address : KFKI Research Institute for Particle and Nuclear Physics
>           H-1525 Budapest 114, POB. 49, Hungary

Best regards,
-----------------------------------------------------------------
Yasuyuki KOZAKAI @ USAGI Project <yasuyuki.kozakai@toshiba.co.jp>

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-25  9:53   ` Harald Welte
@ 2004-06-28 20:31     ` Patrick McHardy
  2004-07-06 10:20     ` Patrick McHardy
  1 sibling, 0 replies; 29+ messages in thread
From: Patrick McHardy @ 2004-06-28 20:31 UTC (permalink / raw)
  To: Harald Welte; +Cc: Yasuyuki Kozakai, netfilter-devel, kisza, usagi-core

Harald Welte wrote:
> On Thu, Jun 24, 2004 at 01:26:10PM +0200, Patrick McHardy wrote:
> 
>>One call to skb_linearize would most likely be less expensive. I'm
>>thinking about putting the copied protocol header in the control
>>buffer, this would reduce this extensive copying a lot. We could also
>>do some common preprocessing steps in one place, like converting
>>things to host byte order.
> 
> 
> yes, it would be only one copy... but to real memory..
> Maybe we need some benchmarking on this.

It may be only to cache, but in case the skb is non-linear skb_copy_bits
calls local_bh_disable and kmap_atomic for the paged part, and walks
frag_list. I don't expect skb_ip_make_writable to be much faster in
all cases, but a new function which would only linearize the transport
headers, without copying the entire skb, probably would be.
Unfortunately I don't have time to implement and benchmark it right now.

>>Regards
>>Patrick
> 
> 

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-25  9:53   ` Harald Welte
  2004-06-28 20:31     ` Patrick McHardy
@ 2004-07-06 10:20     ` Patrick McHardy
  2004-07-06 10:35       ` Harald Welte
  2004-07-06 22:59       ` Pablo Neira
  1 sibling, 2 replies; 29+ messages in thread
From: Patrick McHardy @ 2004-07-06 10:20 UTC (permalink / raw)
  To: Harald Welte; +Cc: Yasuyuki Kozakai, netfilter-devel, kisza, usagi-core

Harald Welte wrote:
> On Thu, Jun 24, 2004 at 01:26:10PM +0200, Patrick McHardy wrote:
> 
>>Think of 100 rules matching "-p tcp --dport X". We copy the tcp header
>>100 times, for a total of 2000 bytes. 
> 
> 
> but we're talking about local copies to the stack.  those copies are 20
> byte copies to the cache, and most likely would never have to hit memory
> at all.  

I did some profiling:

1000 non-matching rules for tcp-sport, ~40mbit tcp transfer, about 10
minutes

The top three entries are:

CPU: Athlon, speed 1667.51 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a 
unit mask of 0x00 (No unit mask) count 100000
samples  %        linenr info                 image name 
app name                 symbol name
3489310  50.0932  (no location information)   ip_tables.ko 
ip_tables                ipt_do_table
1405809  20.1821  skbuff.c:858                vmlinux 
vmlinux                  skb_copy_bits
755945   10.8525  ip_tables.c:1488            ip_tables.ko 
ip_tables                tcp_match

skb_copy_bits is taking about twice as much cycles as tcp_match
and 20% of total cycles.

Regards
Patrick

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-07-06 10:20     ` Patrick McHardy
@ 2004-07-06 10:35       ` Harald Welte
  2004-07-06 22:59       ` Pablo Neira
  1 sibling, 0 replies; 29+ messages in thread
From: Harald Welte @ 2004-07-06 10:35 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: Yasuyuki Kozakai, netfilter-devel, kisza, usagi-core

[-- Attachment #1: Type: text/plain, Size: 585 bytes --]

On Tue, Jul 06, 2004 at 12:20:23PM +0200, Patrick McHardy wrote:
> skb_copy_bits is taking about twice as much cycles as tcp_match
> and 20% of total cycles.

Well, I am convinced ;)

> Regards
> Patrick

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-07-06 10:20     ` Patrick McHardy
  2004-07-06 10:35       ` Harald Welte
@ 2004-07-06 22:59       ` Pablo Neira
  2004-07-06 23:33         ` Patrick McHardy
  1 sibling, 1 reply; 29+ messages in thread
From: Pablo Neira @ 2004-07-06 22:59 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Harald Welte, Yasuyuki Kozakai, netfilter-devel, kisza,
	usagi-core

Hi Patrick,

Patrick McHardy wrote:

> skb_copy_bits is taking about twice as much cycles as tcp_match
> and 20% of total cycles.


AFAIK we are using skb_copy_bits because it walks through fragment list 
in a safe way, I read an old thread (Harald and DaveM discussion) about 
this. If I'm not wrong, that's why this was the method selected when 
conntrack started handling fragments in 2.6 in a different way.

Well, I wonder if we could check if current skb is fragmented (something 
like skb->data_len!=0), in that case use skb_copy_bits, if not walk 
through skb payload directly as we use to do in 2.4 and save that memcpy.

regards,
Pablo

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-07-06 22:59       ` Pablo Neira
@ 2004-07-06 23:33         ` Patrick McHardy
  0 siblings, 0 replies; 29+ messages in thread
From: Patrick McHardy @ 2004-07-06 23:33 UTC (permalink / raw)
  To: Pablo Neira
  Cc: Harald Welte, Yasuyuki Kozakai, netfilter-devel, kisza,
	usagi-core

Pablo Neira wrote:
> Hi Patrick,
> 
> Patrick McHardy wrote:
> 
>> skb_copy_bits is taking about twice as much cycles as tcp_match
>> and 20% of total cycles.
> 
> AFAIK we are using skb_copy_bits because it walks through fragment list 
> in a safe way, I read an old thread (Harald and DaveM discussion) about 
> this. If I'm not wrong, that's why this was the method selected when 
> conntrack started handling fragments in 2.6 in a different way.

skb_copy_bits does three things: it copies memory from the linear part
between skb->head and skb->tail, it copies memory from the paged part
in skb->frags[], and it walks skb->frag_list for defragmented packets.

> 
> Well, I wonder if we could check if current skb is fragmented (something 
> like skb->data_len!=0), in that case use skb_copy_bits, if not walk 
> through skb payload directly as we use to do in 2.4 and save that memcpy.

I don't think we should do that, the number of copies would still be
dependant on the ruleset. I see two possibilities: We could linearize
the transport protocol headers or we could copy it to skb->cb.

Linearizing has three cases:
- data is already linear: nothing to do
- parts of data are in paged part in skb->frags[]: append data of frags
   to skb->end until we have enough
- parts of data are in frag_list: walk frag_list until enough linear
   data is in first skb, copy linear data, handle paged data like
   described above

I don't know if this turns out to be much more efficient than the old
call to skb_linearize. Copying to skb->cb is simpler, but the frequent
case of already linear data is handled less efficient than with
linearizing.

Regards
Patrick

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-06-25 10:01               ` Jozsef Kadlecsik
  2004-06-26  7:25                 ` Yasuyuki Kozakai
@ 2004-07-21 21:36                 ` Harald Welte
  2004-07-29  6:09                   ` Yasuyuki Kozakai
  1 sibling, 1 reply; 29+ messages in thread
From: Harald Welte @ 2004-07-21 21:36 UTC (permalink / raw)
  To: Jozsef Kadlecsik
  Cc: Yasuyuki Kozakai, Patrick McHardy, netfilter-devel, Harald Welte,
	kisza, usagi-core

[-- Attachment #1: Type: text/plain, Size: 1688 bytes --]

On Fri, Jun 25, 2004 at 12:01:37PM +0200, Jozsef Kadlecsik wrote:

> - conntrack requires linearized protocol headers
> 	- with the TCP window tracking code it means the complete
> 	  TCP header, including the options due to the SACK support
> - nat requires writable protocol headers, including the TCP options
>   due to the SACK support
> - raw/filter tables require linearized protocol headers in general (we can
>   safely assume port matching rules :-)
> - mark table requires writable protocol headers if we mangle the packet
>   headers

yes, it's time to implement my old idea of a function like
	skb_linearize_partial(skb, len)

This could then be called by a skb_linearize_l4() function, that would
linearize IP header, ip options, tcp header.

Ideally a hook-registering function should specify what it needs to have
linearized. netfilter.c would then calculate the per-hook maximum and
call the linearize function with that maximum.  

> So I think when any component of netfilter is enabled, we can assume that
> at least linearized protocol headers are required. Therefore I suggested
> to add such a function to nf_hook_slow.

yes, I agree.  Does anybody want to hack up a patch?  The most
interesting part will be testing of that feature...

> Best regards,
> Jozsef

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-07-21 21:36                 ` Harald Welte
@ 2004-07-29  6:09                   ` Yasuyuki Kozakai
  2004-08-01 16:46                     ` Harald Welte
  0 siblings, 1 reply; 29+ messages in thread
From: Yasuyuki Kozakai @ 2004-07-29  6:09 UTC (permalink / raw)
  To: laforge; +Cc: kadlec, yasuyuki.kozakai, kaber, netfilter-devel, kisza,
	usagi-core

[-- Attachment #1: Type: Text/Plain, Size: 2668 bytes --]


Hi,

I got time to implement your idea. How about this ? (not tested)

BTW this code copies skb in the worst case. While implementing I got a idea
about this issue. How about introducing the function like that

	struct tcphdr hdr;
	struct tcphdr *tcph

	tcph = skb_get_bits(skb, &hdr, skb->nh.iph->ihl*4, sizeof(hdr));

If skb is neither shared nor cloned, this function linearize up to tcp header
and returns the pointer to tcp header in skb.

Otherwise, copies tcp header to "hdr" and return the pointer to it.
If error, return NULL.

This function will also eliminate the needs to linearization at nf_hook_slow()
but it is needed to modify all modules which read contents (but not write).

Do you think which is better ?

-----------------------------------------------------------------
Yasuyuki KOZAKAI @ USAGI Project <yasuyuki.kozakai@toshiba.co.jp>


From: Harald Welte <laforge@netfilter.org>
Date: Wed, 21 Jul 2004 17:36:54 -0400

> On Fri, Jun 25, 2004 at 12:01:37PM +0200, Jozsef Kadlecsik wrote:
> 
> > - conntrack requires linearized protocol headers
> > 	- with the TCP window tracking code it means the complete
> > 	  TCP header, including the options due to the SACK support
> > - nat requires writable protocol headers, including the TCP options
> >   due to the SACK support
> > - raw/filter tables require linearized protocol headers in general (we can
> >   safely assume port matching rules :-)
> > - mark table requires writable protocol headers if we mangle the packet
> >   headers
> 
> yes, it's time to implement my old idea of a function like
> 	skb_linearize_partial(skb, len)
> 
> This could then be called by a skb_linearize_l4() function, that would
> linearize IP header, ip options, tcp header.
> 
> Ideally a hook-registering function should specify what it needs to have
> linearized. netfilter.c would then calculate the per-hook maximum and
> call the linearize function with that maximum.  
> 
> > So I think when any component of netfilter is enabled, we can assume that
> > at least linearized protocol headers are required. Therefore I suggested
> > to add such a function to nf_hook_slow.
> 
> yes, I agree.  Does anybody want to hack up a patch?  The most
> interesting part will be testing of that feature...
> 
> > Best regards,
> > Jozsef
> 
> -- 
> - Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
> ============================================================================
>   "Fragmentation is like classful addressing -- an interesting early
>    architectural error that shows how much experimentation was going
>    on while IP was being designed."                    -- Paul Vixie

[-- Attachment #2: linearize.patch --]
[-- Type: Text/Plain, Size: 7692 bytes --]

diff -Nurp -X dontdiff linux-2.6.7/include/linux/netfilter.h linux-2.6.7-linearize/include/linux/netfilter.h
--- linux-2.6.7/include/linux/netfilter.h	2004-06-16 14:19:23.000000000 +0900
+++ linux-2.6.7-linearize/include/linux/netfilter.h	2004-07-29 10:58:55.000000000 +0900
@@ -53,8 +53,12 @@ struct nf_hook_ops
 	int hooknum;
 	/* Hooks are ordered in ascending priority. */
 	int priority;
+	int linearize_mode;
+	int linearize_layer;
 };
 
+#define NF_LINEARIZE_NONE	0
+
 struct nf_sockopt_ops
 {
 	struct list_head list;
diff -Nurp -X dontdiff linux-2.6.7/include/linux/netfilter_ipv4.h linux-2.6.7-linearize/include/linux/netfilter_ipv4.h
--- linux-2.6.7/include/linux/netfilter_ipv4.h	2004-06-16 14:19:52.000000000 +0900
+++ linux-2.6.7-linearize/include/linux/netfilter_ipv4.h	2004-07-29 14:48:11.073677304 +0900
@@ -78,6 +78,29 @@ void nf_debug_ip_loopback_xmit(struct sk
 void nf_debug_ip_finish_output2(struct sk_buff *skb);
 #endif /*CONFIG_NETFILTER_DEBUG*/
 
+/* linearize skb up to required header */
+extern int ip_linearize_headers(struct sk_buff **pskb, int mode, int layer);
+
+/* mode */
+
+/* not linearize */
+#define NF_IP_LINEARIZE_NONE			NF_LINEARIZE_NONE
+/* make skb readable without copying if possible */
+#define NF_IP_LINEARIZE_READABLE_NO_COPY	100
+/* make skb readable. skb is copied if needed */
+#define NF_IP_LINEARIZE_READABLE		200
+/* make skb readable. skb is copied if needed */
+#define NF_IP_LINEARIZE_WRITABLE		300
+/* linearize skb */
+#define NF_IP_LINEARIZE_ALL			65534
+
+/* layer */
+/* up to IP header */
+#define NF_IP_LINEARIZE_IPHDR			100
+/* up to transport header */
+#define NF_IP_LINEARIZE_TRANSPORT		200
+
+
 extern int ip_route_me_harder(struct sk_buff **pskb);
 
 /* Call this before modifying an existing IP packet: ensures it is
diff -Nurp -X dontdiff linux-2.6.7/net/core/netfilter.c linux-2.6.7-linearize/net/core/netfilter.c
--- linux-2.6.7/net/core/netfilter.c	2004-06-16 14:19:22.000000000 +0900
+++ linux-2.6.7-linearize/net/core/netfilter.c	2004-07-29 11:17:45.000000000 +0900
@@ -46,6 +46,18 @@
 static DECLARE_MUTEX(nf_sockopt_mutex);
 
 struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
+
+static struct {
+	struct {
+		int mode;	/* readable, writable, ... */
+		int layer;	/* protocol layer required to linearize */
+	} conf[NF_MAX_HOOKS];
+
+	/* linearization function per network protocol */
+	int (*fn)(struct sk_buff **pskb, int mode, int layer);
+
+} nf_linearize[NPROTO];
+
 static LIST_HEAD(nf_sockopts);
 static spinlock_t nf_hook_lock = SPIN_LOCK_UNLOCKED;
 
@@ -70,6 +82,21 @@ int nf_register_hook(struct nf_hook_ops 
 			break;
 	}
 	list_add_rcu(&reg->list, i->prev);
+
+	if (reg->linearize_mode != NF_LINEARIZE_NONE &&
+	    nf_linearize[reg->pf].fn == NULL) {
+		spin_unlock_bh(&nf_hook_lock);
+		return -1;
+	}
+
+	if (nf_linearize[reg->pf].conf[reg->hooknum].mode < reg->linearize_mode)
+		nf_linearize[reg->pf].conf[reg->hooknum].mode
+			= reg->linearize_mode;
+	if (nf_linearize[reg->pf].conf[reg->hooknum].layer
+	    < reg->linearize_layer)
+		nf_linearize[reg->pf].conf[reg->hooknum].layer
+			= reg->linearize_layer;
+
 	spin_unlock_bh(&nf_hook_lock);
 
 	synchronize_net();
@@ -80,6 +107,24 @@ void nf_unregister_hook(struct nf_hook_o
 {
 	spin_lock_bh(&nf_hook_lock);
 	list_del_rcu(&reg->list);
+
+	if (nf_linearize[reg->pf].fn) {
+		struct list_head *i;
+
+		int linearize_mode = NF_LINEARIZE_NONE;
+		int linearize_layer = 0;
+
+		list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
+			if (linearize_mode < reg->linearize_mode)
+				linearize_mode = reg->linearize_mode;
+			if (linearize_layer < reg->linearize_layer)
+				linearize_layer = reg->linearize_layer;
+		}
+
+		nf_linearize[reg->pf].conf[reg->hooknum].mode = linearize_mode;
+		nf_linearize[reg->pf].conf[reg->hooknum].layer = linearize_layer;
+	}
+
 	spin_unlock_bh(&nf_hook_lock);
 
 	synchronize_net();
@@ -515,6 +560,21 @@ int nf_hook_slow(int pf, unsigned int ho
 	skb->nf_debug |= (1 << hook);
 #endif
 
+	if (nf_linearize[pf].conf[hook].mode > NF_LINEARIZE_NONE &&
+	    nf_linearize[pf].fn != NULL) {
+		ret = nf_linearize[pf].fn(&skb,
+					  nf_linearize[pf].conf[hook].mode,
+					  nf_linearize[pf].conf[hook].layer);
+
+		if (ret < 0) {
+			if (net_ratelimit())
+				printk("can't linearize. pf = %d, hook = %d\n",
+				       pf, hook);
+
+			goto out;
+		}
+	}
+
 	elem = &nf_hooks[pf][hook];
  next_hook:
 	verdict = nf_iterate(&nf_hooks[pf][hook], &skb, hook, indev,
@@ -536,6 +596,7 @@ int nf_hook_slow(int pf, unsigned int ho
 		break;
 	}
 
+out:
 	rcu_read_unlock();
 	return ret;
 }
@@ -808,10 +869,16 @@ EXPORT_SYMBOL(nf_log_packet);
    with it. */
 void (*ip_ct_attach)(struct sk_buff *, struct nf_ct_info *);
 
+#ifdef CONFIG_INET
+extern int ip_linearize_headers(struct sk_buff **pskb, int mode, int layer);
+#endif
 void __init netfilter_init(void)
 {
 	int i, h;
 
+#ifdef CONFIG_INET
+	nf_linearize[PF_INET].fn = ip_linearize_headers;
+#endif
 	for (i = 0; i < NPROTO; i++) {
 		for (h = 0; h < NF_MAX_HOOKS; h++)
 			INIT_LIST_HEAD(&nf_hooks[i][h]);
diff -Nurp -X dontdiff linux-2.6.7/net/ipv4/netfilter/Makefile linux-2.6.7-linearize/net/ipv4/netfilter/Makefile
--- linux-2.6.7/net/ipv4/netfilter/Makefile	2004-06-16 14:19:01.000000000 +0900
+++ linux-2.6.7-linearize/net/ipv4/netfilter/Makefile	2004-07-29 10:58:55.000000000 +0900
@@ -96,3 +96,5 @@ obj-$(CONFIG_IP_NF_COMPAT_IPCHAINS) += i
 obj-$(CONFIG_IP_NF_COMPAT_IPFWADM) += ipfwadm.o
 
 obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
+
+obj-y += ip_linearize.o
diff -Nurp -X dontdiff linux-2.6.7/net/ipv4/netfilter/ip_linearize.c linux-2.6.7-linearize/net/ipv4/netfilter/ip_linearize.c
--- linux-2.6.7/net/ipv4/netfilter/ip_linearize.c	1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.7-linearize/net/ipv4/netfilter/ip_linearize.c	2004-07-29 11:02:16.000000000 +0900
@@ -0,0 +1,87 @@
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/errno.h>
+
+#include <asm/bug.h>
+
+/* linearize skb up to specified layer. */
+
+int
+ip_linearize_headers(struct sk_buff **pskb, int mode, int layer)
+{
+	struct sk_buff *nskb;
+	unsigned int totlen;
+
+	if (mode == NF_IP_LINEARIZE_WRITABLE &&
+	    (skb_shared(*pskb) || skb_cloned(*pskb)))
+		goto copy_skb;
+
+	if (layer == NF_IP_LINEARIZE_IPHDR)
+		return 0;
+
+	totlen = (*pskb)->nh.iph->ihl*4;
+
+	if (layer == NF_IP_LINEARIZE_TRANSPORT) {
+		switch ((*pskb)->nh.iph->protocol) {
+		case IPPROTO_TCP: {
+			struct tcphdr hdr;
+			if (skb_copy_bits(*pskb, (*pskb)->nh.iph->ihl*4,
+					  &hdr, sizeof(hdr)) != 0) {
+				if (mode == NF_IP_LINEARIZE_READABLE_NO_COPY)
+					return 0;
+				else
+					goto copy_skb;
+			}
+			totlen += hdr.doff*4;
+			break;
+		}
+		case IPPROTO_UDP:
+			totlen += sizeof(struct udphdr);
+			break;
+		case IPPROTO_ICMP:
+			totlen += sizeof(struct icmphdr);
+			break;
+		/* Insert other cases here as desired */
+		}
+	} else if (layer == NF_IP_LINEARIZE_ALL)
+		totlen = (*pskb)->len;
+
+	if (totlen > (*pskb)->len)
+		totlen = (*pskb)->len;
+
+	if (totlen <= skb_headlen(*pskb))
+		return 0;
+
+	if (skb_shared(*pskb) || skb_cloned(*pskb)) {
+		if (mode == NF_IP_LINEARIZE_READABLE_NO_COPY)
+			return 0;
+		else
+			goto copy_skb;
+	}
+
+	if (pskb_may_pull(*pskb, totlen))
+		return -ENOMEM;
+	else
+		return 0;
+
+copy_skb:
+	nskb = skb_copy(*pskb, GFP_ATOMIC);
+	/* How should be handled ? */
+	if (!nskb)
+		return -ENOMEM;
+
+	BUG_ON(skb_is_nonlinear(nskb));
+
+	if ((*pskb)->sk)
+		skb_set_owner_w(nskb, (*pskb)->sk);
+	kfree_skb(*pskb);
+	*pskb = nskb;
+	return 0;
+}
+
+EXPORT_SYMBOL(ip_linearize_headers);

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-07-29  6:09                   ` Yasuyuki Kozakai
@ 2004-08-01 16:46                     ` Harald Welte
  2004-08-01 17:08                       ` Patrick McHardy
  0 siblings, 1 reply; 29+ messages in thread
From: Harald Welte @ 2004-08-01 16:46 UTC (permalink / raw)
  To: Yasuyuki Kozakai; +Cc: kadlec, kaber, netfilter-devel, kisza, usagi-core

[-- Attachment #1: Type: text/plain, Size: 1130 bytes --]

On Thu, Jul 29, 2004 at 03:09:02PM +0900, Yasuyuki Kozakai wrote:
> I got time to implement your idea. How about this ? (not tested)
>
> 	struct tcphdr hdr;
> 	struct tcphdr *tcph
> 
> 	tcph = skb_get_bits(skb, &hdr, skb->nh.iph->ihl*4, sizeof(hdr));
> 
> If skb is neither shared nor cloned, this function linearize up to tcp header
> and returns the pointer to tcp header in skb.
> 
> Otherwise, copies tcp header to "hdr" and return the pointer to it.
> If error, return NULL.

This sounds like a very sane approach to me.  

Since we don't have stable API's in 2.6.x anymore (kernel summit
decision), we could even put this in our pending queue of patches for
something like 2.6.10/2.6.11

What does eveybody else think?  Comments?

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-08-01 16:46                     ` Harald Welte
@ 2004-08-01 17:08                       ` Patrick McHardy
  2004-08-01 18:11                         ` Harald Welte
  0 siblings, 1 reply; 29+ messages in thread
From: Patrick McHardy @ 2004-08-01 17:08 UTC (permalink / raw)
  To: Harald Welte; +Cc: Yasuyuki Kozakai, kadlec, netfilter-devel, kisza, usagi-core

Harald Welte wrote:

>On Thu, Jul 29, 2004 at 03:09:02PM +0900, Yasuyuki Kozakai wrote:
>  
>
>>I got time to implement your idea. How about this ? (not tested)
>>
>>	struct tcphdr hdr;
>>	struct tcphdr *tcph
>>
>>	tcph = skb_get_bits(skb, &hdr, skb->nh.iph->ihl*4, sizeof(hdr));
>>
>>If skb is neither shared nor cloned, this function linearize up to tcp header
>>and returns the pointer to tcp header in skb.
>>
>>Otherwise, copies tcp header to "hdr" and return the pointer to it.
>>If error, return NULL.
>>    
>>
>
>This sounds like a very sane approach to me.  
>
>Since we don't have stable API's in 2.6.x anymore (kernel summit
>decision), we could even put this in our pending queue of patches for
>something like 2.6.10/2.6.11
>
>What does eveybody else think?  Comments?
>  
>
The number of copies will still depend on the ruleset with non-linear skbs.
skb_linearize_partial sounds like a much better idea to me.

Regards
Patrick

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-08-01 17:08                       ` Patrick McHardy
@ 2004-08-01 18:11                         ` Harald Welte
  2004-08-02  4:05                           ` Yasuyuki Kozakai
  0 siblings, 1 reply; 29+ messages in thread
From: Harald Welte @ 2004-08-01 18:11 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Yasuyuki Kozakai, kadlec, netfilter-devel, kisza, usagi-core

[-- Attachment #1: Type: text/plain, Size: 1159 bytes --]

On Sun, Aug 01, 2004 at 07:08:59PM +0200, Patrick McHardy wrote:

> >>	struct tcphdr hdr;
> >>	struct tcphdr *tcph
> >>
> >>	tcph = skb_get_bits(skb, &hdr, skb->nh.iph->ihl*4, sizeof(hdr));
> >>
> >>If skb is neither shared nor cloned, this function linearize up to tcp 
> >>header and returns the pointer to tcp header in skb.
> >>Otherwise, copies tcp header to "hdr" and return the pointer to it.
> >>If error, return NULL.
>
> The number of copies will still depend on the ruleset with non-linear skbs.
> skb_linearize_partial sounds like a much better idea to me.

Oh yes, indeed.  I somehow mis-interpreted Yasuyiki's approach.  We
should linearize with the first call, so that every 2nd, 3rd, ... call
do nothing but immediately return.

> Regards
> Patrick

-- 
- Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
============================================================================
  "Fragmentation is like classful addressing -- an interesting early
   architectural error that shows how much experimentation was going
   on while IP was being designed."                    -- Paul Vixie

[-- Attachment #2: Digital signature --]
[-- Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-08-01 18:11                         ` Harald Welte
@ 2004-08-02  4:05                           ` Yasuyuki Kozakai
  2004-08-07 21:05                             ` Yasuyuki Kozakai
  0 siblings, 1 reply; 29+ messages in thread
From: Yasuyuki Kozakai @ 2004-08-02  4:05 UTC (permalink / raw)
  To: laforge; +Cc: kaber, yasuyuki.kozakai, kadlec, netfilter-devel, kisza,
	usagi-core


Hi,

From: Harald Welte <laforge@netfilter.org>
Date: Sun, 1 Aug 2004 20:11:50 +0200

> On Sun, Aug 01, 2004 at 07:08:59PM +0200, Patrick McHardy wrote:
> 
> > >>	struct tcphdr hdr;
> > >>	struct tcphdr *tcph
> > >>
> > >>	tcph = skb_get_bits(skb, &hdr, skb->nh.iph->ihl*4, sizeof(hdr));
> > >>
> > >>If skb is neither shared nor cloned, this function linearize up to tcp 
> > >>header and returns the pointer to tcp header in skb.
> > >>Otherwise, copies tcp header to "hdr" and return the pointer to it.
> > >>If error, return NULL.
> >
> > The number of copies will still depend on the ruleset with non-linear skbs.
> > skb_linearize_partial sounds like a much better idea to me.
> 
> Oh yes, indeed.  I somehow mis-interpreted Yasuyiki's approach.  We
> should linearize with the first call, so that every 2nd, 3rd, ... call
> do nothing but immediately return.

Oh, sorry. I misstook about pskb_may_pull(). And please forget my patch
because it's inefficient.


> 
> > Regards
> > Patrick
> 
> -- 
> - Harald Welte <laforge@netfilter.org>             http://www.netfilter.org/
> ============================================================================
>   "Fragmentation is like classful addressing -- an interesting early
>    architectural error that shows how much experimentation was going
>    on while IP was being designed."                    -- Paul Vixie

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-08-02  4:05                           ` Yasuyuki Kozakai
@ 2004-08-07 21:05                             ` Yasuyuki Kozakai
  2004-08-09  1:40                               ` Yasuyuki Kozakai
  0 siblings, 1 reply; 29+ messages in thread
From: Yasuyuki Kozakai @ 2004-08-07 21:05 UTC (permalink / raw)
  To: yasuyuki.kozakai
  Cc: laforge, kaber, kadlec, netfilter-devel, kisza, usagi-core

[-- Attachment #1: Type: Text/Plain, Size: 4383 bytes --]


Hi, all

I rewrote the codes which partially linearize skb before executing hook
functions. This solves some new issues. And I got good results with ordinary
situations.

The details are following. sorry for long message.

1. Changes

The changes are following.
	- If hook function rearranges skb, skb may be fragmented. 
	  (e.g. ip_conntrack_defrag()) Then I introduce the flag
	  NF_LIN_MAY_FRAG. If this flag is set on nf_hook_ops->lin_flag,
	  check and linearization are tried once more.
	- ip_linearize_headers(), which partially linearizes skb up to the
	  transport header, doesn't linearize if IPv4 packet is fragmented
	  and isn't 1st fragment.
	- deleted writable mode because I'm not sure it's efficient or not.
	- My understanding about pskb_may_pull() was wrong in the previous
	  patch. In this patch, skb is copied if skb_shared() is true.
	  Otherwise, pskb_may_pull() is used.

2. Tests
 I tested in 2 situations with 1st and 2nd patches attached to this mail.

2.1 On router
 I applied the patches to kernel on a router, inserted 2000 non-matching rules
for tcp destination port to FORWARD chain on it, and pass 500MB stream from
one side to another. The result is

$ iperf -c 192.168.1.2 -n 500M
------------------------------------------------------------
Client connecting to 192.168.1.2, TCP port 5001
TCP window size: 16.0 KByte (default)
------------------------------------------------------------
[  3] local 192.168.2.2 port 32830 connected with 192.168.1.2 port 5001
[ ID] Interval       Transfer     Bandwidth
[  3]  0.0-130.9 sec   500 MBytes  32.0 Mbits/sec

In the case of vanilla kernel, 

$ iperf -c 192.168.1.2 -n 500M
------------------------------------------------------------
Client connecting to 192.168.1.2, TCP port 5001
TCP window size: 16.0 KByte (default)
------------------------------------------------------------
[  3] local 192.168.2.2 port 32834 connected with 192.168.1.2 port 5001
[ ID] Interval       Transfer     Bandwidth
[  3]  0.0-199.1 sec   500 MBytes  21.1 Mbits/sec

The parformance is improbed in this case.


2.2 On host
 I applied the patches to kernel on a host, inserted 1000 non-matching rules
for tcp destination port to INPUT chain on it, and pass 500MB data from one
host to another. These hosts are on same link. The result is

$ iperf -c 192.168.1.1 -n 500M
------------------------------------------------------------
Client connecting to 192.168.1.1, TCP port 5001
TCP window size: 16.0 KByte (default)
------------------------------------------------------------
[  3] local 192.168.1.2 port 32770 connected with 192.168.1.1 port 5001
[ ID] Interval       Transfer     Bandwidth
[  3]  0.0-117.6 sec   500 MBytes  35.7 Mbits/sec

In the case of vanilla kernel,

$ iperf -c 192.168.1.1 -n 500M
------------------------------------------------------------
Client connecting to 192.168.1.1, TCP port 5001
TCP window size: 16.0 KByte (default)
------------------------------------------------------------
[  3] local 192.168.1.2 port 32773 connected with 192.168.1.1 port 5001
[ ID] Interval       Transfer     Bandwidth
[  3]  0.0-132.2 sec   500 MBytes  31.7 Mbits/sec

The parformance is improbed in this case, too.

3. Debugging

At least, I saw this patch works well with skb which is ...
	- fragmented at TCP header.
	- already linearized up to required header (and may be cloned or
	  shared).
	- fragmented IPv4 packet.

and __calc_lin_max_layer(), which calculates the highest layer header required
to linearize, works well in the case that ip_conntrack.ko and iptable_filter.ko
are inserted to kernel. But I didn't test with SMP machine.

4. Issues

4.1 some elimination
In the current, the header required to linearize is only transport header.
How about the other layer ? If we need only transport header, we can eliminate
	- some "if" from ip_linearize_headers()
	- calculation of the highest header to linearize

4.2 writable mode
"writable mode" is useful ? In the current, the linearization assures that
skb is readable but not writable. If nat/mangle modules decide to mangle
packet but skb isn't writable, skb will be copied by skb_make_writable().
But I don't know part of skb may be copied twice or not.
Various situations are needed to test.


Regards,

-----------------------------------------------------------------
Yasuyuki KOZAKAI @ USAGI Project <yasuyuki.kozakai@toshiba.co.jp>

[-- Attachment #2: linearize.patch --]
[-- Type: Text/Plain, Size: 10015 bytes --]

diff -X dontdiff -Nurp linux-2.6.8-rc3/include/linux/netfilter.h linux-2.6.8-rc3-linearize/include/linux/netfilter.h
--- linux-2.6.8-rc3/include/linux/netfilter.h	2004-08-04 20:25:13.000000000 +0900
+++ linux-2.6.8-rc3-linearize/include/linux/netfilter.h	2004-08-08 02:01:07.973534136 +0900
@@ -46,6 +46,9 @@ typedef unsigned int nf_hookfn(unsigned 
 struct nf_hook_ops
 {
 	struct list_head list;
+	/* max protocol header required to linearize before executing hook
+	   functions. */
+	unsigned int max_lin_layer;
 
 	/* User fills in from here down. */
 	nf_hookfn *hook;
@@ -54,8 +57,14 @@ struct nf_hook_ops
 	int hooknum;
 	/* Hooks are ordered in ascending priority. */
 	int priority;
+	/* protocol header required to linearize */
+	unsigned int lin_layer;
+	unsigned int lin_flags;
 };
 
+/* for lin_flags */
+#define NF_LIN_MAY_FRAG	0x0001 /* skb may be fragmented after executing hook */
+
 struct nf_sockopt_ops
 {
 	struct list_head list;
@@ -187,6 +196,8 @@ extern void nf_dump_skb(int pf, struct s
 /* FIXME: Before cache is ever used, this must be implemented for real. */
 extern void nf_invalidate_cache(int pf);
 
+extern int skb_make_readable(struct sk_buff **pskb, unsigned int readable_len);
+
 #else /* !CONFIG_NETFILTER */
 #define NF_HOOK(pf, hook, skb, indev, outdev, okfn) (okfn)(skb)
 #endif /*CONFIG_NETFILTER*/
diff -X dontdiff -Nurp linux-2.6.8-rc3/include/linux/netfilter_ipv4.h linux-2.6.8-rc3-linearize/include/linux/netfilter_ipv4.h
--- linux-2.6.8-rc3/include/linux/netfilter_ipv4.h	2004-06-16 14:19:52.000000000 +0900
+++ linux-2.6.8-rc3-linearize/include/linux/netfilter_ipv4.h	2004-08-08 02:01:07.973534136 +0900
@@ -85,6 +85,15 @@ extern int ip_route_me_harder(struct sk_
    Returns true or false. */
 extern int skb_ip_make_writable(struct sk_buff **pskb,
 				unsigned int writable_len);
+
+/* Header required to linearize */
+/* Network protocol header */
+#define NF_IP_LIN_NET	100
+/* Transport protocol header */
+#define NF_IP_LIN_TRANS	200
+/* Whole of packet */
+#define NF_IP_LIN_ALL	UINT_MAX
+
 #endif /*__KERNEL__*/
 
 #endif /*__LINUX_IP_NETFILTER_H*/
diff -X dontdiff -Nurp linux-2.6.8-rc3/net/core/netfilter.c linux-2.6.8-rc3-linearize/net/core/netfilter.c
--- linux-2.6.8-rc3/net/core/netfilter.c	2004-06-16 14:19:22.000000000 +0900
+++ linux-2.6.8-rc3-linearize/net/core/netfilter.c	2004-08-08 02:01:07.974533984 +0900
@@ -49,6 +49,8 @@ struct list_head nf_hooks[NPROTO][NF_MAX
 static LIST_HEAD(nf_sockopts);
 static spinlock_t nf_hook_lock = SPIN_LOCK_UNLOCKED;
 
+int (*nf_linearize[NPROTO])(struct sk_buff **pskb, unsigned int layer);
+
 /* 
  * A queue handler may be registered for each protocol.  Each is protected by
  * long term mutex.  The handler must provide an an outfn() to accept packets
@@ -60,16 +62,42 @@ static struct nf_queue_handler_t {
 } queue_handler[NPROTO];
 static rwlock_t queue_handler_lock = RW_LOCK_UNLOCKED;
 
+/* Calculate highest protocol header required to linearize before executing
+   hook functions. locking is needed. */
+static void __calc_lin_layer(int pf, int hooknum)
+{
+	struct nf_hook_ops *elem = NULL;
+	unsigned int max_lin_layer = 0;
+
+	list_for_each_entry_reverse(elem, &nf_hooks[pf][hooknum], list) {
+		/* The 1st condition means that skb may be rearranged by this
+		   element. In this case, linearizing is needed one more after
+		   this. Then it's not needed to linearize the higher layer
+		   this element doesn't require */
+		if ((elem->lin_flags & NF_LIN_MAY_FRAG) ||
+		    (max_lin_layer < elem->lin_layer))
+			max_lin_layer = elem->lin_layer;
+
+		elem->max_lin_layer = max_lin_layer;
+	}
+}
+
 int nf_register_hook(struct nf_hook_ops *reg)
 {
 	struct list_head *i;
 
+	if (reg->lin_layer != 0 && nf_linearize[reg->pf] == NULL)
+		return -1;
+
+	reg->max_lin_layer = reg->lin_layer;
+
 	spin_lock_bh(&nf_hook_lock);
 	list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
 		if (reg->priority < ((struct nf_hook_ops *)i)->priority)
 			break;
 	}
 	list_add_rcu(&reg->list, i->prev);
+	__calc_lin_layer(reg->pf, reg->hooknum);
 	spin_unlock_bh(&nf_hook_lock);
 
 	synchronize_net();
@@ -80,6 +108,7 @@ void nf_unregister_hook(struct nf_hook_o
 {
 	spin_lock_bh(&nf_hook_lock);
 	list_del_rcu(&reg->list);
+	__calc_lin_layer(reg->pf, reg->hooknum);
 	spin_unlock_bh(&nf_hook_lock);
 
 	synchronize_net();
@@ -349,6 +378,8 @@ static unsigned int nf_iterate(struct li
 			       int (*okfn)(struct sk_buff *),
 			       int hook_thresh)
 {
+	unsigned int max_lin_layer = 0;
+
 	/*
 	 * The caller must not block between calls to this
 	 * function because of risk of continuing from deleted element.
@@ -359,6 +390,23 @@ static unsigned int nf_iterate(struct li
 		if (hook_thresh > elem->priority)
 			continue;
 
+		/* Ordinarily linearizing is required only once. But may be
+		   required if a element is added/deleted during the iteration
+		   or the previous element rearranges skb. */
+		if (max_lin_layer < elem->max_lin_layer) {
+			max_lin_layer = elem->max_lin_layer;
+			if(!nf_linearize[elem->pf](skb, max_lin_layer)) {
+				if(net_ratelimit())
+					printk("failed to partially linearize "
+					       "skb. dropping...\n");
+
+				return NF_DROP;
+			}
+
+			if (elem->lin_flags & NF_LIN_MAY_FRAG)
+				max_lin_layer = 0;
+		}
+
 		/* Optimization: we don't need to hold module
                    reference here, since function can't sleep. --RR */
 		switch (elem->hook(hook, skb, indev, outdev, okfn)) {
@@ -735,6 +783,27 @@ pull_skb:
 EXPORT_SYMBOL(skb_ip_make_writable);
 #endif /*CONFIG_INET*/
 
+int skb_make_readable(struct sk_buff **pskb, unsigned int readable_len)
+{
+	if (likely(readable_len <= skb_headlen(*pskb)))
+		return 1;
+
+	if (unlikely(readable_len > (*pskb)->len))
+		return 0;
+
+	if (skb_shared(*pskb)) {
+		struct sk_buff *n;
+
+		n = skb_copy(*pskb, GFP_ATOMIC);
+		if (!n)
+			return 0;
+		*pskb = n;
+	} else if (!pskb_may_pull(*pskb, readable_len))
+		return 0;
+
+	return 1;
+}
+
 /* Internal logging interface, which relies on the real 
    LOG target modules */
 
@@ -808,10 +877,17 @@ EXPORT_SYMBOL(nf_log_packet);
    with it. */
 void (*ip_ct_attach)(struct sk_buff *, struct nf_ct_info *);
 
+#ifdef CONFIG_INET
+extern int ip_linearize_headers(struct sk_buff **pskb, unsigned int layer);
+#endif
+
 void __init netfilter_init(void)
 {
 	int i, h;
 
+#ifdef CONFIG_INET
+	nf_linearize[PF_INET] = ip_linearize_headers;
+#endif
 	for (i = 0; i < NPROTO; i++) {
 		for (h = 0; h < NF_MAX_HOOKS; h++)
 			INIT_LIST_HEAD(&nf_hooks[i][h]);
diff -X dontdiff -Nurp linux-2.6.8-rc3/net/ipv4/netfilter/Makefile linux-2.6.8-rc3-linearize/net/ipv4/netfilter/Makefile
--- linux-2.6.8-rc3/net/ipv4/netfilter/Makefile	2004-08-04 20:25:16.000000000 +0900
+++ linux-2.6.8-rc3-linearize/net/ipv4/netfilter/Makefile	2004-08-08 02:01:07.974533984 +0900
@@ -98,3 +98,5 @@ obj-$(CONFIG_IP_NF_COMPAT_IPCHAINS) += i
 obj-$(CONFIG_IP_NF_COMPAT_IPFWADM) += ipfwadm.o
 
 obj-$(CONFIG_IP_NF_QUEUE) += ip_queue.o
+
+obj-y += ip_linearize.o
diff -X dontdiff -Nurp linux-2.6.8-rc3/net/ipv4/netfilter/ip_linearize.c linux-2.6.8-rc3-linearize/net/ipv4/netfilter/ip_linearize.c
--- linux-2.6.8-rc3/net/ipv4/netfilter/ip_linearize.c	1970-01-01 09:00:00.000000000 +0900
+++ linux-2.6.8-rc3-linearize/net/ipv4/netfilter/ip_linearize.c	2004-08-08 02:01:07.975533832 +0900
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C)2004 USAGI/WIDE Project
+ * 
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ * Authors:
+ *	Yasuyuki Kozakai	<yasuyuki.kozakai@toshiba.co.jp>
+ */
+
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/icmp.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <net/ip.h>
+
+#if 0
+#define DEBUGP printk
+#else
+#define DEBUGP(format, args...)
+#endif
+
+/*
+ * linearize skb up to specified layer. If packet is too short, whole of skb is 
+ * linearized. NOTICE: skb is readable but may not writable because of being
+ * shared or cloned. If you want to mangle the contents of skb, please use
+ * ip_make_writable().
+ */
+int ip_linearize_headers(struct sk_buff **pskb, unsigned int layer)
+{
+	unsigned int totlen;
+
+	if (layer <= NF_IP_LIN_NET)
+		return 1;
+
+	totlen = (*pskb)->nh.iph->ihl*4;
+
+	if (layer == NF_IP_LIN_TRANS) {
+		if (ntohs((*pskb)->nh.iph->frag_off) & IP_OFFSET)
+			return 1;
+
+		switch ((*pskb)->nh.iph->protocol) {
+		case IPPROTO_TCP: {
+			struct tcphdr hdr;
+			int ret;
+
+			/* truncated */
+			if ((*pskb)->len - totlen < sizeof(hdr)) {
+				totlen = (*pskb)->len;
+				break;
+			}
+
+			ret = skb_copy_bits(*pskb, (*pskb)->nh.iph->ihl*4,
+					    &hdr, sizeof(hdr));
+			if (ret) {
+				DEBUGP("ip_linearize: failed to copy bits.\n");
+				return 0;
+			}
+
+			totlen += max_t(unsigned int, sizeof(hdr), hdr.doff*4);
+			break;
+		}
+		case IPPROTO_UDP:
+			totlen += sizeof(struct udphdr);
+			break;
+		case IPPROTO_ICMP:
+			totlen += sizeof(struct icmphdr);
+			break;
+		/* Insert other cases here as desired */
+		}
+	} else if (layer == NF_IP_LIN_ALL)
+		totlen = (*pskb)->len;
+	else {
+		/* unknown layer */
+		DEBUGP("ip_linearize: unknown layer\n");
+		return 0;
+	}
+
+	if (totlen > (*pskb)->len)
+		totlen = (*pskb)->len;
+
+	return skb_make_readable(pskb, totlen);
+}
+
+EXPORT_SYMBOL(ip_linearize_headers);

[-- Attachment #3: linearize-filter.patch --]
[-- Type: Text/Plain, Size: 5525 bytes --]

diff -X dontdiff -Nurp linux-2.6.8-rc3/net/ipv4/netfilter/ip_tables.c linux-2.6.8-rc3-linearize/net/ipv4/netfilter/ip_tables.c
--- linux-2.6.8-rc3/net/ipv4/netfilter/ip_tables.c	2004-08-04 20:25:16.000000000 +0900
+++ linux-2.6.8-rc3-linearize/net/ipv4/netfilter/ip_tables.c	2004-08-08 02:01:07.975533832 +0900
@@ -1458,17 +1458,19 @@ tcp_find_option(u_int8_t option,
 		int *hotdrop)
 {
 	/* tcp.doff is only 4 bits, ie. max 15 * 4 bytes */
-	u_int8_t opt[60 - sizeof(struct tcphdr)];
+	u_int8_t *opt;
 	unsigned int i;
+	unsigned int optoff = skb->nh.iph->ihl*4 + sizeof(struct tcphdr);
 
 	duprintf("tcp_match: finding option\n");
 	/* If we don't have the whole header, drop packet. */
-	if (skb_copy_bits(skb, skb->nh.iph->ihl*4 + sizeof(struct tcphdr),
-			  opt, optlen) < 0) {
+	if (skb->len < optoff + optlen) {
 		*hotdrop = 1;
 		return 0;
 	}
 
+	opt = skb->data + optoff;
+
 	for (i = 0; i < optlen; ) {
 		if (opt[i] == option) return !invert;
 		if (opt[i] < 2) i++;
@@ -1486,7 +1488,7 @@ tcp_match(const struct sk_buff *skb,
 	  int offset,
 	  int *hotdrop)
 {
-	struct tcphdr tcph;
+	struct tcphdr *tcp;
 	const struct ipt_tcp *tcpinfo = matchinfo;
 
 	if (offset) {
@@ -1506,7 +1508,7 @@ tcp_match(const struct sk_buff *skb,
 
 #define FWINVTCP(bool,invflg) ((bool) ^ !!(tcpinfo->invflags & invflg))
 
-	if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) < 0) {
+	if (skb->len < skb->nh.iph->ihl*4 + sizeof(struct tcphdr)) {
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil TCP offset=0 tinygram.\n");
@@ -1514,24 +1516,26 @@ tcp_match(const struct sk_buff *skb,
 		return 0;
 	}
 
+	tcp = (struct tcphdr *)(skb->data + skb->nh.iph->ihl*4);
+
 	if (!port_match(tcpinfo->spts[0], tcpinfo->spts[1],
-			ntohs(tcph.source),
+			ntohs(tcp->source),
 			!!(tcpinfo->invflags & IPT_TCP_INV_SRCPT)))
 		return 0;
 	if (!port_match(tcpinfo->dpts[0], tcpinfo->dpts[1],
-			ntohs(tcph.dest),
+			ntohs(tcp->dest),
 			!!(tcpinfo->invflags & IPT_TCP_INV_DSTPT)))
 		return 0;
-	if (!FWINVTCP((((unsigned char *)&tcph)[13] & tcpinfo->flg_mask)
+	if (!FWINVTCP((((unsigned char *)tcp)[13] & tcpinfo->flg_mask)
 		      == tcpinfo->flg_cmp,
 		      IPT_TCP_INV_FLAGS))
 		return 0;
 	if (tcpinfo->option) {
-		if (tcph.doff * 4 < sizeof(tcph)) {
+		if (tcp->doff * 4 < sizeof(struct tcphdr)) {
 			*hotdrop = 1;
 			return 0;
 		}
-		if (!tcp_find_option(tcpinfo->option, skb, tcph.doff*4 - sizeof(tcph),
+		if (!tcp_find_option(tcpinfo->option, skb, tcp->doff*4 - sizeof(struct tcphdr),
 				     tcpinfo->invflags & IPT_TCP_INV_OPTION,
 				     hotdrop))
 			return 0;
@@ -1564,14 +1568,14 @@ udp_match(const struct sk_buff *skb,
 	  int offset,
 	  int *hotdrop)
 {
-	struct udphdr udph;
+	struct udphdr *udp;
 	const struct ipt_udp *udpinfo = matchinfo;
 
 	/* Must not be a fragment. */
 	if (offset)
 		return 0;
 
-	if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &udph, sizeof(udph)) < 0) {
+	if (skb->len < skb->nh.iph->ihl*4 + sizeof(struct udphdr)) {
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil UDP tinygram.\n");
@@ -1579,11 +1583,13 @@ udp_match(const struct sk_buff *skb,
 		return 0;
 	}
 
+	udp = (struct udphdr *)(skb->data + skb->nh.iph->ihl*4);
+
 	return port_match(udpinfo->spts[0], udpinfo->spts[1],
-			  ntohs(udph.source),
+			  ntohs(udp->source),
 			  !!(udpinfo->invflags & IPT_UDP_INV_SRCPT))
 		&& port_match(udpinfo->dpts[0], udpinfo->dpts[1],
-			      ntohs(udph.dest),
+			      ntohs(udp->dest),
 			      !!(udpinfo->invflags & IPT_UDP_INV_DSTPT));
 }
 
@@ -1635,14 +1641,14 @@ icmp_match(const struct sk_buff *skb,
 	   int offset,
 	   int *hotdrop)
 {
-	struct icmphdr icmph;
+	struct icmphdr *icmp;
 	const struct ipt_icmp *icmpinfo = matchinfo;
 
 	/* Must not be a fragment. */
 	if (offset)
 		return 0;
 
-	if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &icmph, sizeof(icmph)) < 0){
+	if (skb->len < skb->nh.iph->ihl*4 + sizeof(struct icmphdr) < 0) {
 		/* We've been asked to examine this packet, and we
 		   can't.  Hence, no choice but to drop. */
 		duprintf("Dropping evil ICMP tinygram.\n");
@@ -1650,10 +1656,12 @@ icmp_match(const struct sk_buff *skb,
 		return 0;
 	}
 
+	icmp = (struct icmphdr *)(skb->data + sizeof(struct icmphdr));
+
 	return icmp_type_code_match(icmpinfo->type,
 				    icmpinfo->code[0],
 				    icmpinfo->code[1],
-				    icmph.type, icmph.code,
+				    icmp->type, icmp->code,
 				    !!(icmpinfo->invflags&IPT_ICMP_INV));
 }
 
diff -X dontdiff -Nurp linux-2.6.8-rc3/net/ipv4/netfilter/iptable_filter.c linux-2.6.8-rc3-linearize/net/ipv4/netfilter/iptable_filter.c
--- linux-2.6.8-rc3/net/ipv4/netfilter/iptable_filter.c	2004-06-16 14:19:13.000000000 +0900
+++ linux-2.6.8-rc3-linearize/net/ipv4/netfilter/iptable_filter.c	2004-08-08 02:01:07.975533832 +0900
@@ -136,6 +136,7 @@ static struct nf_hook_ops ipt_ops[] = {
 		.pf		= PF_INET,
 		.hooknum	= NF_IP_LOCAL_IN,
 		.priority	= NF_IP_PRI_FILTER,
+		.lin_layer	= NF_IP_LIN_TRANS,
 	},
 	{
 		.hook		= ipt_hook,
@@ -143,6 +144,7 @@ static struct nf_hook_ops ipt_ops[] = {
 		.pf		= PF_INET,
 		.hooknum	= NF_IP_FORWARD,
 		.priority	= NF_IP_PRI_FILTER,
+		.lin_layer	= NF_IP_LIN_TRANS,
 	},
 	{
 		.hook		= ipt_local_out_hook,
@@ -150,6 +152,7 @@ static struct nf_hook_ops ipt_ops[] = {
 		.pf		= PF_INET,
 		.hooknum	= NF_IP_LOCAL_OUT,
 		.priority	= NF_IP_PRI_FILTER,
+		.lin_layer	= NF_IP_LIN_TRANS,
 	},
 };
 

[-- Attachment #4: linearize-conntrack.patch --]
[-- Type: Text/Plain, Size: 1198 bytes --]

--- linux-2.6.8-rc3/net/ipv4/netfilter/ip_conntrack_standalone.c	2004-08-08 02:06:18.000000000 +0900
+++ linux-2.6.8-rc3-linearize/net/ipv4/netfilter/ip_conntrack_standalone.c	2004-08-08 05:28:23.768084752 +0900
@@ -263,6 +263,7 @@
 	.pf		= PF_INET,
 	.hooknum	= NF_IP_PRE_ROUTING,
 	.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
+	.lin_flags	= NF_LIN_MAY_FRAG,
 };
 
 static struct nf_hook_ops ip_conntrack_in_ops = {
@@ -271,6 +272,7 @@
 	.pf		= PF_INET,
 	.hooknum	= NF_IP_PRE_ROUTING,
 	.priority	= NF_IP_PRI_CONNTRACK,
+	.lin_layer	= NF_IP_LIN_TRANS,
 };
 
 static struct nf_hook_ops ip_conntrack_defrag_local_out_ops = {
@@ -279,6 +281,7 @@
 	.pf		= PF_INET,
 	.hooknum	= NF_IP_LOCAL_OUT,
 	.priority	= NF_IP_PRI_CONNTRACK_DEFRAG,
+	.lin_flags	= NF_LIN_MAY_FRAG,
 };
 
 static struct nf_hook_ops ip_conntrack_local_out_ops = {
@@ -287,6 +290,7 @@
 	.pf		= PF_INET,
 	.hooknum	= NF_IP_LOCAL_OUT,
 	.priority	= NF_IP_PRI_CONNTRACK,
+	.lin_layer	= NF_IP_LIN_TRANS,
 };
 
 /* Refragmenter; last chance. */
@@ -296,6 +300,7 @@
 	.pf		= PF_INET,
 	.hooknum	= NF_IP_POST_ROUTING,
 	.priority	= NF_IP_PRI_LAST,
+	.lin_flags	= NF_LIN_MAY_FRAG,
 };
 
 static struct nf_hook_ops ip_conntrack_local_in_ops = {

^ permalink raw reply	[flat|nested] 29+ messages in thread

* Re: [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization
  2004-08-07 21:05                             ` Yasuyuki Kozakai
@ 2004-08-09  1:40                               ` Yasuyuki Kozakai
  0 siblings, 0 replies; 29+ messages in thread
From: Yasuyuki Kozakai @ 2004-08-09  1:40 UTC (permalink / raw)
  To: yasuyuki.kozakai
  Cc: laforge, kaber, kadlec, netfilter-devel, kisza, usagi-core


Sorry for my misstakes ...

From: Yasuyuki Kozakai <yasuyuki.kozakai@toshiba.co.jp>
Date: Sun, 08 Aug 2004 06:05:35 +0900 (JST)

> 2.1 On router
> I applied the patches to kernel on a router, inserted 2000 non-matching rules
                                                        ^^^^
This is 1000.

> 2.2 On host
> I applied the patches to kernel on a host, inserted 1000 non-matching rules
                                                      ^^^^
This is 2000.

-----------------------------------------------------------------
Yasuyuki KOZAKAI @ USAGI Project <yasuyuki.kozakai@toshiba.co.jp>

^ permalink raw reply	[flat|nested] 29+ messages in thread

end of thread, other threads:[~2004-08-09  1:40 UTC | newest]

Thread overview: 29+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2004-06-24  4:04 [PATCH]: 1st step to remove skb_linearize() in ip6_tables.c and optimization Yasuyuki Kozakai
2004-06-24  8:13 ` Andras Kis-Szabo
2004-06-24 10:12   ` Yasuyuki Kozakai
2004-06-24 10:24     ` Jozsef Kadlecsik
2004-06-24 10:35       ` Yasuyuki Kozakai
2004-06-24 11:26 ` Patrick McHardy
2004-06-24 11:50   ` Jozsef Kadlecsik
2004-06-24 13:04     ` Yasuyuki Kozakai
2004-06-24 13:25       ` Jozsef Kadlecsik
2004-06-24 13:48         ` (usagi-core 18584) " YOSHIFUJI Hideaki / 吉藤英明
2004-06-24 15:06         ` Yasuyuki Kozakai
2004-06-24 16:50           ` Patrick McHardy
2004-06-25  4:57             ` Yasuyuki Kozakai
2004-06-25 10:01               ` Jozsef Kadlecsik
2004-06-26  7:25                 ` Yasuyuki Kozakai
2004-07-21 21:36                 ` Harald Welte
2004-07-29  6:09                   ` Yasuyuki Kozakai
2004-08-01 16:46                     ` Harald Welte
2004-08-01 17:08                       ` Patrick McHardy
2004-08-01 18:11                         ` Harald Welte
2004-08-02  4:05                           ` Yasuyuki Kozakai
2004-08-07 21:05                             ` Yasuyuki Kozakai
2004-08-09  1:40                               ` Yasuyuki Kozakai
2004-06-25  9:53   ` Harald Welte
2004-06-28 20:31     ` Patrick McHardy
2004-07-06 10:20     ` Patrick McHardy
2004-07-06 10:35       ` Harald Welte
2004-07-06 22:59       ` Pablo Neira
2004-07-06 23:33         ` Patrick McHardy

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.