* [PATCH 1/6] socket: sk_filter minor cleanups
[not found] <20080401004708.009204033@vyatta.com>
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-10 8:39 ` David Miller
2008-04-01 0:47 ` [PATCH 2/6] socket: sk_filter deinline Stephen Hemminger
` (4 subsequent siblings)
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: sk-filter-cleanup --]
[-- Type: text/plain, Size: 2328 bytes --]
Some minor style cleanups:
* Move __KERNEL__ definitions to one place in filter.h
* Use const for sk_filter_len
* Line wrapping
* Put EXPORT_SYMBOL next to function definition
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/include/linux/filter.h 2008-03-31 10:23:28.000000000 -0700
+++ b/include/linux/filter.h 2008-03-31 10:24:40.000000000 -0700
@@ -37,21 +37,6 @@ struct sock_fprog /* Required for SO_ATT
struct sock_filter __user *filter;
};
-#ifdef __KERNEL__
-struct sk_filter
-{
- atomic_t refcnt;
- unsigned int len; /* Number of filter blocks */
- struct rcu_head rcu;
- struct sock_filter insns[0];
-};
-
-static inline unsigned int sk_filter_len(struct sk_filter *fp)
-{
- return fp->len*sizeof(struct sock_filter) + sizeof(*fp);
-}
-#endif
-
/*
* Instruction classes
*/
@@ -141,10 +126,24 @@ static inline unsigned int sk_filter_len
#define SKF_LL_OFF (-0x200000)
#ifdef __KERNEL__
+struct sk_filter
+{
+ atomic_t refcnt;
+ unsigned int len; /* Number of filter blocks */
+ struct rcu_head rcu;
+ struct sock_filter insns[0];
+};
+
+static inline unsigned int sk_filter_len(const struct sk_filter *fp)
+{
+ return fp->len * sizeof(struct sock_filter) + sizeof(*fp);
+}
+
struct sk_buff;
struct sock;
-extern unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen);
+extern unsigned int sk_run_filter(struct sk_buff *skb,
+ struct sock_filter *filter, int flen);
extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
extern int sk_detach_filter(struct sock *sk);
extern int sk_chk_filter(struct sock_filter *filter, int flen);
--- a/net/core/filter.c 2008-03-31 10:24:56.000000000 -0700
+++ b/net/core/filter.c 2008-03-31 10:25:29.000000000 -0700
@@ -275,6 +275,7 @@ load_b:
return 0;
}
+EXPORT_SYMBOL(sk_run_filter);
/**
* sk_chk_filter - verify socket filter code
@@ -385,6 +386,7 @@ int sk_chk_filter(struct sock_filter *fi
return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL;
}
+EXPORT_SYMBOL(sk_chk_filter);
/**
* sk_filter_rcu_release: Release a socket filter by rcu_head
@@ -467,6 +469,3 @@ int sk_detach_filter(struct sock *sk)
rcu_read_unlock_bh();
return ret;
}
-
-EXPORT_SYMBOL(sk_chk_filter);
-EXPORT_SYMBOL(sk_run_filter);
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 2/6] socket: sk_filter deinline
[not found] <20080401004708.009204033@vyatta.com>
2008-04-01 0:47 ` [PATCH 1/6] socket: sk_filter minor cleanups Stephen Hemminger
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-10 8:49 ` David Miller
2008-04-01 0:47 ` [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare Stephen Hemminger
` (3 subsequent siblings)
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: sk-filter-deinline --]
[-- Type: text/plain, Size: 3243 bytes --]
The sk_filter function is too big to be inlined. This saves 2296 bytes
of text on allyesconfig.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/include/linux/filter.h 2008-03-31 10:24:40.000000000 -0700
+++ b/include/linux/filter.h 2008-03-31 10:26:49.000000000 -0700
@@ -142,6 +142,7 @@ static inline unsigned int sk_filter_len
struct sk_buff;
struct sock;
+extern int sk_filter(struct sock *sk, struct sk_buff *skb);
extern unsigned int sk_run_filter(struct sk_buff *skb,
struct sock_filter *filter, int flen);
extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
--- a/include/net/sock.h 2008-03-31 10:23:12.000000000 -0700
+++ b/include/net/sock.h 2008-03-31 10:25:57.000000000 -0700
@@ -927,41 +927,6 @@ extern void sk_common_release(struct soc
extern void sock_init_data(struct socket *sock, struct sock *sk);
/**
- * sk_filter - run a packet through a socket filter
- * @sk: sock associated with &sk_buff
- * @skb: buffer to filter
- * @needlock: set to 1 if the sock is not locked by caller.
- *
- * Run the filter code and then cut skb->data to correct size returned by
- * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
- * than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to sk_run_filter. It returns 0 if the packet should
- * be accepted or -EPERM if the packet should be tossed.
- *
- */
-
-static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
-{
- int err;
- struct sk_filter *filter;
-
- err = security_sock_rcv_skb(sk, skb);
- if (err)
- return err;
-
- rcu_read_lock_bh();
- filter = rcu_dereference(sk->sk_filter);
- if (filter) {
- unsigned int pkt_len = sk_run_filter(skb, filter->insns,
- filter->len);
- err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
- }
- rcu_read_unlock_bh();
-
- return err;
-}
-
-/**
* sk_filter_release: Release a socket filter
* @sk: socket
* @fp: filter to remove
--- a/net/core/filter.c 2008-03-31 10:25:29.000000000 -0700
+++ b/net/core/filter.c 2008-03-31 10:27:43.000000000 -0700
@@ -64,6 +64,41 @@ static inline void *load_pointer(struct
}
/**
+ * sk_filter - run a packet through a socket filter
+ * @sk: sock associated with &sk_buff
+ * @skb: buffer to filter
+ * @needlock: set to 1 if the sock is not locked by caller.
+ *
+ * Run the filter code and then cut skb->data to correct size returned by
+ * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * than pkt_len we keep whole skb->data. This is the socket level
+ * wrapper to sk_run_filter. It returns 0 if the packet should
+ * be accepted or -EPERM if the packet should be tossed.
+ *
+ */
+int sk_filter(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+ struct sk_filter *filter;
+
+ err = security_sock_rcv_skb(sk, skb);
+ if (err)
+ return err;
+
+ rcu_read_lock_bh();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter) {
+ unsigned int pkt_len = sk_run_filter(skb, filter->insns,
+ filter->len);
+ err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
+ }
+ rcu_read_unlock_bh();
+
+ return err;
+}
+EXPORT_SYMBOL(sk_filter);
+
+/**
* sk_run_filter - run a filter on a socket
* @skb: buffer to run the filter on
* @filter: filter to apply
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 2/6] socket: sk_filter deinline
2008-04-01 0:47 ` [PATCH 2/6] socket: sk_filter deinline Stephen Hemminger
@ 2008-04-10 8:49 ` David Miller
0 siblings, 0 replies; 19+ messages in thread
From: David Miller @ 2008-04-10 8:49 UTC (permalink / raw)
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 31 Mar 2008 17:47:10 -0700
> The sk_filter function is too big to be inlined. This saves 2296 bytes
> of text on allyesconfig.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Because of the bloat, I'll apply this to net-2.6.26.
But I think we can do better at many of the call sites.
Most of them are in the packet receive path, already have BHs disabled, and
already have a lot of the relevant variables laid out in registers.
I would like to see a new inline created that caters to these cases,
which does something like:
	if (rcu_dereference(sk->sk_filter))
		return sk_filter(sk, skb);
	else
		return 0;
The security check muddles things here, but that should be inlined in
such situations too I think.
It's all of that "pkt_len = this; pskb_trim(that);" that makes for
most of the bloat. And that can stay in the un-inlined sk_filter() we
have now.
Having a socket filter enabled on a TCP socket is rare, yet
now everyone pays for the function call unconditionally.
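For illustration only, the fast-path wrapper being described might look
roughly like the sketch below. The helper name sk_filter_fast() is invented
for this sketch and is not part of the posted series; it assumes the caller
is already in softirq context with BHs disabled, as in the receive paths
mentioned above.

static inline int sk_filter_fast(struct sock *sk, struct sk_buff *skb)
{
	/* Caller is assumed to have BHs disabled (receive path), so the
	 * RCU read side is already protected. */
	if (rcu_dereference(sk->sk_filter))
		return sk_filter(sk, skb);	/* un-inlined slow path */

	/* No filter attached: only the security hook needs to run. */
	return security_sock_rcv_skb(sk, skb);
}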
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
[not found] <20080401004708.009204033@vyatta.com>
2008-04-01 0:47 ` [PATCH 1/6] socket: sk_filter minor cleanups Stephen Hemminger
2008-04-01 0:47 ` [PATCH 2/6] socket: sk_filter deinline Stephen Hemminger
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-01 5:52 ` Eric Dumazet
2008-04-01 0:47 ` [PATCH 4/6] IPV4: route inline changes Stephen Hemminger
` (2 subsequent siblings)
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: route-compare --]
[-- Type: text/plain, Size: 1275 bytes --]
The comparison in ip_route_input is a hot path. By recoding the C
"&&" chain as bit operations, fewer conditional branches are generated,
so the code should be faster. Maybe someday GCC will be smart
enough to do this?
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/route.c 2008-03-31 10:57:30.000000000 -0700
+++ b/net/ipv4/route.c 2008-03-31 11:10:44.000000000 -0700
@@ -2079,14 +2079,14 @@ int ip_route_input(struct sk_buff *skb,
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.dst.rt_next)) {
- if (rth->fl.fl4_dst == daddr &&
- rth->fl.fl4_src == saddr &&
- rth->fl.iif == iif &&
- rth->fl.oif == 0 &&
- rth->fl.mark == skb->mark &&
- rth->fl.fl4_tos == tos &&
- net_eq(dev_net(rth->u.dst.dev), net) &&
- rth->rt_genid == atomic_read(&rt_genid)) {
+ if (((rth->fl.fl4_dst ^ daddr) |
+ (rth->fl.fl4_src ^ saddr) |
+ (rth->fl.iif ^ iif) |
+ rth->fl.oif |
+ (rth->fl.mark ^ skb->mark) |
+ (rth->fl.fl4_tos ^ tos) |
+ (rth->rt_genid ^ atomic_read(&rt_genid))) == 0 &&
+ net_eq(dev_net(rth->u.dst.dev), net)) {
dst_use(&rth->u.dst, jiffies);
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-01 0:47 ` [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare Stephen Hemminger
@ 2008-04-01 5:52 ` Eric Dumazet
2008-04-01 20:08 ` Stephen Hemminger
0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2008-04-01 5:52 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David S. Miller, netdev
Stephen Hemminger wrote:
> The comparison in ip_route_input is a hot path, by recoding the C
> "and" as bit operations, fewer conditional branches get generated
> so the code should be faster. Maybe someday Gcc will be smart
> enough to do this?
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> --- a/net/ipv4/route.c 2008-03-31 10:57:30.000000000 -0700
> +++ b/net/ipv4/route.c 2008-03-31 11:10:44.000000000 -0700
> @@ -2079,14 +2079,14 @@ int ip_route_input(struct sk_buff *skb,
> rcu_read_lock();
> for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
> rth = rcu_dereference(rth->u.dst.rt_next)) {
> - if (rth->fl.fl4_dst == daddr &&
> - rth->fl.fl4_src == saddr &&
> - rth->fl.iif == iif &&
> - rth->fl.oif == 0 &&
> - rth->fl.mark == skb->mark &&
> - rth->fl.fl4_tos == tos &&
> - net_eq(dev_net(rth->u.dst.dev), net) &&
> - rth->rt_genid == atomic_read(&rt_genid)) {
> + if (((rth->fl.fl4_dst ^ daddr) |
> + (rth->fl.fl4_src ^ saddr) |
> + (rth->fl.iif ^ iif) |
> + rth->fl.oif |
> + (rth->fl.mark ^ skb->mark) |
> + (rth->fl.fl4_tos ^ tos) |
> + (rth->rt_genid ^ atomic_read(&rt_genid))) == 0 &&
> + net_eq(dev_net(rth->u.dst.dev), net)) {
> dst_use(&rth->u.dst, jiffies);
> RT_CACHE_STAT_INC(in_hit);
> rcu_read_unlock();
>
Are you sure all the fields share the same cache line, on 32-bit and 64-bit arches?
I prefer having some conditional branches instead of cache misses, given that
the first two branches are usually the discriminating ones.
Maybe we could let one test on (daddr, saddr) do a fast segregation of
candidates (touching one cache line at most), then one remaining compare on the other keys?
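As a reading aid (not a patch posted in this thread), Eric's two-stage idea
might be shaped roughly like this: a cheap xor test on the two addresses
first, then the remaining keys with the original short-circuiting compares.

	if (((rth->fl.fl4_dst ^ daddr) |
	     (rth->fl.fl4_src ^ saddr)) == 0 &&	/* addresses: one cache line */
	    rth->fl.iif == iif &&
	    rth->fl.oif == 0 &&
	    rth->fl.mark == skb->mark &&
	    rth->fl.fl4_tos == tos &&
	    net_eq(dev_net(rth->u.dst.dev), net) &&
	    rth->rt_genid == atomic_read(&rt_genid)) {
		/* cache hit handling unchanged */
	}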
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-01 5:52 ` Eric Dumazet
@ 2008-04-01 20:08 ` Stephen Hemminger
2008-04-10 8:51 ` David Miller
0 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 20:08 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
On Tue, 01 Apr 2008 07:52:03 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:
> Stephen Hemminger a écrit :
> > The comparison in ip_route_input is a hot path, by recoding the C
> > "and" as bit operations, fewer conditional branches get generated
> > so the code should be faster. Maybe someday Gcc will be smart
> > enough to do this?
> >
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> >
> > --- a/net/ipv4/route.c 2008-03-31 10:57:30.000000000 -0700
> > +++ b/net/ipv4/route.c 2008-03-31 11:10:44.000000000 -0700
> > @@ -2079,14 +2079,14 @@ int ip_route_input(struct sk_buff *skb,
> > rcu_read_lock();
> > for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
> > rth = rcu_dereference(rth->u.dst.rt_next)) {
> > - if (rth->fl.fl4_dst == daddr &&
> > - rth->fl.fl4_src == saddr &&
> > - rth->fl.iif == iif &&
> > - rth->fl.oif == 0 &&
> > - rth->fl.mark == skb->mark &&
> > - rth->fl.fl4_tos == tos &&
> > - net_eq(dev_net(rth->u.dst.dev), net) &&
> > - rth->rt_genid == atomic_read(&rt_genid)) {
> > + if (((rth->fl.fl4_dst ^ daddr) |
> > + (rth->fl.fl4_src ^ saddr) |
> > + (rth->fl.iif ^ iif) |
> > + rth->fl.oif |
> > + (rth->fl.mark ^ skb->mark) |
> > + (rth->fl.fl4_tos ^ tos) |
> > + (rth->rt_genid ^ atomic_read(&rt_genid))) == 0 &&
> > + net_eq(dev_net(rth->u.dst.dev), net)) {
> > dst_use(&rth->u.dst, jiffies);
> > RT_CACHE_STAT_INC(in_hit);
> > rcu_read_unlock();
> >
>
> Are you sure all fields share same cache lines, on 32bit and 64bit arches ?
The flow fields are all together, and the other parameters are local variables
in registers so that compare should be in one cache line.
--- a/net/ipv4/route.c 2008-03-31 17:12:30.000000000 -0700
+++ b/net/ipv4/route.c 2008-04-01 13:05:46.000000000 -0700
@@ -2079,12 +2079,12 @@ int ip_route_input(struct sk_buff *skb,
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.dst.rt_next)) {
- if (rth->fl.fl4_dst == daddr &&
- rth->fl.fl4_src == saddr &&
- rth->fl.iif == iif &&
- rth->fl.oif == 0 &&
+ if (((rth->fl.fl4_dst ^ daddr) |
+ (rth->fl.fl4_src ^ saddr) |
+ (rth->fl.iif ^ iif) |
+ rth->fl.oif |
+ (rth->fl.fl4_tos ^ tos)) == 0 &&
rth->fl.mark == skb->mark &&
- rth->fl.fl4_tos == tos &&
net_eq(dev_net(rth->u.dst.dev), net) &&
rth->rt_genid == atomic_read(&rt_genid)) {
dst_use(&rth->u.dst, jiffies);
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-01 20:08 ` Stephen Hemminger
@ 2008-04-10 8:51 ` David Miller
2008-04-10 9:01 ` YOSHIFUJI Hideaki / 吉藤英明
2008-04-10 9:26 ` Eric Dumazet
0 siblings, 2 replies; 19+ messages in thread
From: David Miller @ 2008-04-10 8:51 UTC (permalink / raw)
To: shemminger; +Cc: dada1, netdev
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 1 Apr 2008 13:08:42 -0700
> The flow fields are all together, and the other parameters are local variables
> in registers so that compare should be in one cache line.
>
> --- a/net/ipv4/route.c 2008-03-31 17:12:30.000000000 -0700
> +++ b/net/ipv4/route.c 2008-04-01 13:05:46.000000000 -0700
> @@ -2079,12 +2079,12 @@ int ip_route_input(struct sk_buff *skb,
> rcu_read_lock();
> for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
> rth = rcu_dereference(rth->u.dst.rt_next)) {
> - if (rth->fl.fl4_dst == daddr &&
> - rth->fl.fl4_src == saddr &&
> - rth->fl.iif == iif &&
> - rth->fl.oif == 0 &&
> + if (((rth->fl.fl4_dst ^ daddr) |
> + (rth->fl.fl4_src ^ saddr) |
> + (rth->fl.iif ^ iif) |
> + rth->fl.oif |
> + (rth->fl.fl4_tos ^ tos)) == 0 &&
> rth->fl.mark == skb->mark &&
> - rth->fl.fl4_tos == tos &&
> net_eq(dev_net(rth->u.dst.dev), net) &&
> rth->rt_genid == atomic_read(&rt_genid)) {
> dst_use(&rth->u.dst, jiffies);
Eric, any objections to this version?
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-10 8:51 ` David Miller
@ 2008-04-10 9:01 ` YOSHIFUJI Hideaki / 吉藤英明
2008-04-10 10:56 ` David Miller
2008-04-10 9:26 ` Eric Dumazet
1 sibling, 1 reply; 19+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2008-04-10 9:01 UTC (permalink / raw)
To: davem; +Cc: shemminger, dada1, netdev, yoshfuji
In article <20080410.015118.103465510.davem@davemloft.net> (at Thu, 10 Apr 2008 01:51:18 -0700 (PDT)), David Miller <davem@davemloft.net> says:
> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Tue, 1 Apr 2008 13:08:42 -0700
>
> > The flow fields are all together, and the other parameters are local variables
> > in registers so that compare should be in one cache line.
> >
> > --- a/net/ipv4/route.c 2008-03-31 17:12:30.000000000 -0700
> > +++ b/net/ipv4/route.c 2008-04-01 13:05:46.000000000 -0700
> > @@ -2079,12 +2079,12 @@ int ip_route_input(struct sk_buff *skb,
> > rcu_read_lock();
> > for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
> > rth = rcu_dereference(rth->u.dst.rt_next)) {
> > - if (rth->fl.fl4_dst == daddr &&
> > - rth->fl.fl4_src == saddr &&
> > - rth->fl.iif == iif &&
> > - rth->fl.oif == 0 &&
> > + if (((rth->fl.fl4_dst ^ daddr) |
> > + (rth->fl.fl4_src ^ saddr) |
> > + (rth->fl.iif ^ iif) |
> > + rth->fl.oif |
> > + (rth->fl.fl4_tos ^ tos)) == 0 &&
> > rth->fl.mark == skb->mark &&
> > - rth->fl.fl4_tos == tos &&
> > net_eq(dev_net(rth->u.dst.dev), net) &&
> > rth->rt_genid == atomic_read(&rt_genid)) {
> > dst_use(&rth->u.dst, jiffies);
>
> Eric, any objections to this version?
I'm not Eric, but well, I'm now doubting if this is really good.
If the comparison chain is long and it is unlikely to pass all the tests,
it would be better to cut the chain early.
If we use "or", we need to run through all the tests in any case.
--yoshfuji
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-10 9:01 ` YOSHIFUJI Hideaki / 吉藤英明
@ 2008-04-10 10:56 ` David Miller
2008-04-10 12:17 ` YOSHIFUJI Hideaki / 吉藤英明
0 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2008-04-10 10:56 UTC (permalink / raw)
To: yoshfuji; +Cc: shemminger, dada1, netdev
From: YOSHIFUJI Hideaki / 吉藤英明 <yoshfuji@linux-ipv6.org>
Date: Thu, 10 Apr 2008 18:01:48 +0900 (JST)
> In article <20080410.015118.103465510.davem@davemloft.net> (at Thu, 10 Apr 2008 01:51:18 -0700 (PDT)), David Miller <davem@davemloft.net> says:
>
> > From: Stephen Hemminger <shemminger@vyatta.com>
> > Date: Tue, 1 Apr 2008 13:08:42 -0700
> >
> > > The flow fields are all together, and the other parameters are local variables
> > > in registers so that compare should be in one cache line.
> > >
> > > --- a/net/ipv4/route.c 2008-03-31 17:12:30.000000000 -0700
> > > +++ b/net/ipv4/route.c 2008-04-01 13:05:46.000000000 -0700
> > > @@ -2079,12 +2079,12 @@ int ip_route_input(struct sk_buff *skb,
> > > rcu_read_lock();
> > > for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
> > > rth = rcu_dereference(rth->u.dst.rt_next)) {
> > > - if (rth->fl.fl4_dst == daddr &&
> > > - rth->fl.fl4_src == saddr &&
> > > - rth->fl.iif == iif &&
> > > - rth->fl.oif == 0 &&
> > > + if (((rth->fl.fl4_dst ^ daddr) |
> > > + (rth->fl.fl4_src ^ saddr) |
> > > + (rth->fl.iif ^ iif) |
> > > + rth->fl.oif |
> > > + (rth->fl.fl4_tos ^ tos)) == 0 &&
> > > rth->fl.mark == skb->mark &&
> > > - rth->fl.fl4_tos == tos &&
> > > net_eq(dev_net(rth->u.dst.dev), net) &&
> > > rth->rt_genid == atomic_read(&rt_genid)) {
> > > dst_use(&rth->u.dst, jiffies);
> >
> > Eric, any objections to this version?
>
> I'm not Eric, but well, I'm now doubting if this is really good.
> If the comparision chain is long and it is unlikely to pass all the tests,
> it would be better to cut the line.
> If we use "or", we need to run through the test, in ayn case.
Actually, the case you mention is part of the incentive for
this change.
Branch prediction fares very poorly in such cases, and
therefore it is better to mispredict a single branch covering
all the data items in the same cache line than to risk
mispredicting any one of several such branches. The new
sequence above gets emitted by the compiler as several integer
operations and one branch. As long as all the data items are
in the same cache line, this is optimal.
We made such a change for ethernet address comparisons a
few years ago. At the time Eric showed that it mattered
a lot for Athlon processors.
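As a stand-alone illustration of that point (not code from this thread),
the two comparison styles can be put side by side; compiling with
gcc -O2 -S typically shows one conditional branch per field for the first
form and a single branch for the second.

struct key {
	unsigned int dst, src, iif, oif, mark, tos;
};

/* Chained "&&": short-circuits, so the compiler usually emits one
 * conditional branch per field. */
int match_branchy(const struct key *a, const struct key *b)
{
	return a->dst == b->dst && a->src == b->src &&
	       a->iif == b->iif && a->oif == b->oif &&
	       a->mark == b->mark && a->tos == b->tos;
}

/* xor/or form: pure integer operations folded into one test, so only
 * a single branch is needed. */
int match_branchless(const struct key *a, const struct key *b)
{
	return ((a->dst ^ b->dst) | (a->src ^ b->src) |
		(a->iif ^ b->iif) | (a->oif ^ b->oif) |
		(a->mark ^ b->mark) | (a->tos ^ b->tos)) == 0;
}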
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-10 10:56 ` David Miller
@ 2008-04-10 12:17 ` YOSHIFUJI Hideaki / 吉藤英明
0 siblings, 0 replies; 19+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2008-04-10 12:17 UTC (permalink / raw)
To: davem; +Cc: shemminger, dada1, netdev, yoshfuji
In article <20080410.035618.217931997.davem@davemloft.net> (at Thu, 10 Apr 2008 03:56:18 -0700 (PDT)), David Miller <davem@davemloft.net> says:
> Branch prediction fares very poorly in such cases, and
> therefore it is better to mispredict one branch over
> all the data items in the same cache line than any one
> of several such branches. The above new sequence gets
> emitted by the compiler as several integer operations and
> one branch. As long as all the data items are in the
> same cacheline, this is optimal.
Okay, thanks.
--yoshfuji
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-10 8:51 ` David Miller
2008-04-10 9:01 ` YOSHIFUJI Hideaki / 吉藤英明
@ 2008-04-10 9:26 ` Eric Dumazet
2008-04-10 11:00 ` David Miller
1 sibling, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2008-04-10 9:26 UTC (permalink / raw)
To: David Miller; +Cc: shemminger, netdev
David Miller wrote:
> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Tue, 1 Apr 2008 13:08:42 -0700
>
>
>> The flow fields are all together, and the other parameters are local variables
>> in registers so that compare should be in one cache line.
>>
>> --- a/net/ipv4/route.c 2008-03-31 17:12:30.000000000 -0700
>> +++ b/net/ipv4/route.c 2008-04-01 13:05:46.000000000 -0700
>> @@ -2079,12 +2079,12 @@ int ip_route_input(struct sk_buff *skb,
>> rcu_read_lock();
>> for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
>> rth = rcu_dereference(rth->u.dst.rt_next)) {
>> - if (rth->fl.fl4_dst == daddr &&
>> - rth->fl.fl4_src == saddr &&
>> - rth->fl.iif == iif &&
>> - rth->fl.oif == 0 &&
>> + if (((rth->fl.fl4_dst ^ daddr) |
>> + (rth->fl.fl4_src ^ saddr) |
>> + (rth->fl.iif ^ iif) |
>> + rth->fl.oif |
>> + (rth->fl.fl4_tos ^ tos)) == 0 &&
>> rth->fl.mark == skb->mark &&
>> - rth->fl.fl4_tos == tos &&
>> net_eq(dev_net(rth->u.dst.dev), net) &&
>> rth->rt_genid == atomic_read(&rt_genid)) {
>> dst_use(&rth->u.dst, jiffies);
>>
>
> Eric, any objections to this version?
>
No objections from me, thank you.
Acked-by: Eric Dumazet <dada1@cosmosbay.com>
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 4/6] IPV4: route inline changes
[not found] <20080401004708.009204033@vyatta.com>
` (2 preceding siblings ...)
2008-04-01 0:47 ` [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare Stephen Hemminger
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-10 8:53 ` David Miller
2008-04-01 0:47 ` [PATCH 5/6] IPV4: route use jhash3 Stephen Hemminger
2008-04-01 0:47 ` [PATCH 6/6] IPV4: route rekey timer can be deferrable Stephen Hemminger
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: route-inline --]
[-- Type: text/plain, Size: 3790 bytes --]
Don't mark large functions as inline; let the compiler decide.
Also, use inline rather than __inline__.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/route.c 2008-03-31 11:10:44.000000000 -0700
+++ b/net/ipv4/route.c 2008-03-31 11:14:50.000000000 -0700
@@ -600,18 +600,18 @@ static inline int ip_rt_proc_init(void)
}
#endif /* CONFIG_PROC_FS */
-static __inline__ void rt_free(struct rtable *rt)
+static inline void rt_free(struct rtable *rt)
{
call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
-static __inline__ void rt_drop(struct rtable *rt)
+static inline void rt_drop(struct rtable *rt)
{
ip_rt_put(rt);
call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
-static __inline__ int rt_fast_clean(struct rtable *rth)
+static inline int rt_fast_clean(struct rtable *rth)
{
/* Kill broadcast/multicast entries very aggresively, if they
collide in hash table with more useful entries */
@@ -619,7 +619,7 @@ static __inline__ int rt_fast_clean(stru
rth->fl.iif && rth->u.dst.rt_next;
}
-static __inline__ int rt_valuable(struct rtable *rth)
+static inline int rt_valuable(struct rtable *rth)
{
return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
rth->u.dst.expires;
@@ -1420,7 +1420,7 @@ out: kfree_skb(skb);
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
-static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
+static inline unsigned short guess_mtu(unsigned short old_mtu)
{
int i;
@@ -1750,11 +1750,11 @@ static void ip_handle_martian_source(str
#endif
}
-static inline int __mkroute_input(struct sk_buff *skb,
- struct fib_result* res,
- struct in_device *in_dev,
- __be32 daddr, __be32 saddr, u32 tos,
- struct rtable **result)
+static int __mkroute_input(struct sk_buff *skb,
+ struct fib_result *res,
+ struct in_device *in_dev,
+ __be32 daddr, __be32 saddr, u32 tos,
+ struct rtable **result)
{
struct rtable *rth;
@@ -1846,11 +1846,11 @@ static inline int __mkroute_input(struct
return err;
}
-static inline int ip_mkroute_input(struct sk_buff *skb,
- struct fib_result* res,
- const struct flowi *fl,
- struct in_device *in_dev,
- __be32 daddr, __be32 saddr, u32 tos)
+static int ip_mkroute_input(struct sk_buff *skb,
+ struct fib_result *res,
+ const struct flowi *fl,
+ struct in_device *in_dev,
+ __be32 daddr, __be32 saddr, u32 tos)
{
struct rtable* rth = NULL;
int err;
@@ -2132,12 +2132,12 @@ int ip_route_input(struct sk_buff *skb,
return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
-static inline int __mkroute_output(struct rtable **result,
- struct fib_result* res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
+static int __mkroute_output(struct rtable **result,
+ struct fib_result *res,
+ const struct flowi *fl,
+ const struct flowi *oldflp,
+ struct net_device *dev_out,
+ unsigned flags)
{
struct rtable *rth;
struct in_device *in_dev;
@@ -2252,12 +2252,12 @@ static inline int __mkroute_output(struc
return err;
}
-static inline int ip_mkroute_output(struct rtable **rp,
- struct fib_result* res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
+static int ip_mkroute_output(struct rtable **rp,
+ struct fib_result *res,
+ const struct flowi *fl,
+ const struct flowi *oldflp,
+ struct net_device *dev_out,
+ unsigned flags)
{
struct rtable *rth = NULL;
int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 5/6] IPV4: route use jhash3
[not found] <20080401004708.009204033@vyatta.com>
` (3 preceding siblings ...)
2008-04-01 0:47 ` [PATCH 4/6] IPV4: route inline changes Stephen Hemminger
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-10 8:54 ` David Miller
2008-04-01 0:47 ` [PATCH 6/6] IPV4: route rekey timer can be deferrable Stephen Hemminger
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: route-jhash3 --]
[-- Type: text/plain, Size: 1047 bytes --]
Since the route hash key is a triple, use jhash_3words rather than doing
the mixing directly. This should be as fast and give better distribution.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/route.c 2008-03-31 11:14:50.000000000 -0700
+++ b/net/ipv4/route.c 2008-03-31 11:15:20.000000000 -0700
@@ -259,16 +259,14 @@ static DEFINE_PER_CPU(struct rt_cache_st
#define RT_CACHE_STAT_INC(field) \
(__raw_get_cpu_var(rt_cache_stat).field++)
-static unsigned int rt_hash_code(u32 daddr, u32 saddr)
+static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
{
- return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
- & rt_hash_mask;
+ return jhash_3words((__force u32)(__be32)(daddr),
+ (__force u32)(__be32)(saddr),
+ idx, atomic_read(&rt_genid))
+ & rt_hash_mask;
}
-#define rt_hash(daddr, saddr, idx) \
- rt_hash_code((__force u32)(__be32)(daddr),\
- (__force u32)(__be32)(saddr) ^ ((idx) << 5))
-
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
struct seq_net_private p;
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 5/6] IPV4: route use jhash3
2008-04-01 0:47 ` [PATCH 5/6] IPV4: route use jhash3 Stephen Hemminger
@ 2008-04-10 8:54 ` David Miller
0 siblings, 0 replies; 19+ messages in thread
From: David Miller @ 2008-04-10 8:54 UTC (permalink / raw)
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 31 Mar 2008 17:47:13 -0700
> Since route hash is a triple, use jhash_3words rather doing the mixing
> directly. This should be as fast and give better distribution.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Applied to net-2.6.26, but:
> -static unsigned int rt_hash_code(u32 daddr, u32 saddr)
> +static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
Are you sure you want to inline that? jhash expands
to a lot of instructions.
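For reference, the non-inlined shape being hinted at would simply drop the
hint and keep a single out-of-line copy of the hash expansion; a sketch,
not a posted follow-up:

static unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
{
	/* One out-of-line copy of the jhash expansion for all callers. */
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, atomic_read(&rt_genid)) & rt_hash_mask;
}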
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 6/6] IPV4: route rekey timer can be deferrable
[not found] <20080401004708.009204033@vyatta.com>
` (4 preceding siblings ...)
2008-04-01 0:47 ` [PATCH 5/6] IPV4: route use jhash3 Stephen Hemminger
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-10 8:55 ` David Miller
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: route-defer --]
[-- Type: text/plain, Size: 604 bytes --]
No urgency on the rehash interval timer, so mark it as deferrable.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/route.c 2008-03-31 11:22:17.000000000 -0700
+++ b/net/ipv4/route.c 2008-03-31 11:22:19.000000000 -0700
@@ -3058,7 +3058,9 @@ int __init ip_rt_init(void)
devinet_init();
ip_fib_init();
- setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
+ rt_secret_timer.function = rt_secret_rebuild;
+ rt_secret_timer.data = 0;
+ init_timer_deferrable(&rt_secret_timer);
/* All the timers, started at system startup tend
to synchronize. Perturb it a bit.
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 6/6] IPV4: route rekey timer can be deferrable
2008-04-01 0:47 ` [PATCH 6/6] IPV4: route rekey timer can be deferrable Stephen Hemminger
@ 2008-04-10 8:55 ` David Miller
0 siblings, 0 replies; 19+ messages in thread
From: David Miller @ 2008-04-10 8:55 UTC (permalink / raw)
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 31 Mar 2008 17:47:14 -0700
> No urgency on the rehash interval timer, so mark it as deferrable.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Applied to net-2.6.26, thanks.
> - setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
> + rt_secret_timer.function = rt_secret_rebuild;
> + rt_secret_timer.data = 0;
> + init_timer_deferrable(&rt_secret_timer);
I've seen enough of these that a setup_timer_deferrable()
might be appropriate.
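A possible shape for such a helper, mirroring the existing setup_timer()
and init_timer_deferrable() pair; purely a sketch, since no
setup_timer_deferrable() existed in the tree at the time of this thread.

static inline void setup_timer_deferrable(struct timer_list *timer,
					  void (*function)(unsigned long),
					  unsigned long data)
{
	/* Same fields as setup_timer(), but marks the timer deferrable. */
	timer->function = function;
	timer->data = data;
	init_timer_deferrable(timer);
}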
^ permalink raw reply [flat|nested] 19+ messages in thread