* [PATCH 1/6] socket: sk_filter minor cleanups
[not found] <20080401004708.009204033@vyatta.com>
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-10 8:39 ` David Miller
2008-04-01 0:47 ` [PATCH 2/6] socket: sk_filter deinline Stephen Hemminger
` (4 subsequent siblings)
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: sk-filter-cleanup --]
[-- Type: text/plain, Size: 2328 bytes --]
Some minor style cleanups:
* Move __KERNEL__ definitions to one place in filter.h
* Use const for sk_filter_len
* Line wrapping
* Put EXPORT_SYMBOL next to function definition
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/include/linux/filter.h 2008-03-31 10:23:28.000000000 -0700
+++ b/include/linux/filter.h 2008-03-31 10:24:40.000000000 -0700
@@ -37,21 +37,6 @@ struct sock_fprog /* Required for SO_ATT
struct sock_filter __user *filter;
};
-#ifdef __KERNEL__
-struct sk_filter
-{
- atomic_t refcnt;
- unsigned int len; /* Number of filter blocks */
- struct rcu_head rcu;
- struct sock_filter insns[0];
-};
-
-static inline unsigned int sk_filter_len(struct sk_filter *fp)
-{
- return fp->len*sizeof(struct sock_filter) + sizeof(*fp);
-}
-#endif
-
/*
* Instruction classes
*/
@@ -141,10 +126,24 @@ static inline unsigned int sk_filter_len
#define SKF_LL_OFF (-0x200000)
#ifdef __KERNEL__
+struct sk_filter
+{
+ atomic_t refcnt;
+ unsigned int len; /* Number of filter blocks */
+ struct rcu_head rcu;
+ struct sock_filter insns[0];
+};
+
+static inline unsigned int sk_filter_len(const struct sk_filter *fp)
+{
+ return fp->len * sizeof(struct sock_filter) + sizeof(*fp);
+}
+
struct sk_buff;
struct sock;
-extern unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen);
+extern unsigned int sk_run_filter(struct sk_buff *skb,
+ struct sock_filter *filter, int flen);
extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
extern int sk_detach_filter(struct sock *sk);
extern int sk_chk_filter(struct sock_filter *filter, int flen);
--- a/net/core/filter.c 2008-03-31 10:24:56.000000000 -0700
+++ b/net/core/filter.c 2008-03-31 10:25:29.000000000 -0700
@@ -275,6 +275,7 @@ load_b:
return 0;
}
+EXPORT_SYMBOL(sk_run_filter);
/**
* sk_chk_filter - verify socket filter code
@@ -385,6 +386,7 @@ int sk_chk_filter(struct sock_filter *fi
return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL;
}
+EXPORT_SYMBOL(sk_chk_filter);
/**
* sk_filter_rcu_release: Release a socket filter by rcu_head
@@ -467,6 +469,3 @@ int sk_detach_filter(struct sock *sk)
rcu_read_unlock_bh();
return ret;
}
-
-EXPORT_SYMBOL(sk_chk_filter);
-EXPORT_SYMBOL(sk_run_filter);
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 2/6] socket: sk_filter deinline
[not found] <20080401004708.009204033@vyatta.com>
2008-04-01 0:47 ` [PATCH 1/6] socket: sk_filter minor cleanups Stephen Hemminger
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-10 8:49 ` David Miller
2008-04-01 0:47 ` [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare Stephen Hemminger
` (3 subsequent siblings)
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: sk-filter-deinline --]
[-- Type: text/plain, Size: 3243 bytes --]
The sk_filter function is too big to be inlined. This saves 2296 bytes
of text on allyesconfig.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/include/linux/filter.h 2008-03-31 10:24:40.000000000 -0700
+++ b/include/linux/filter.h 2008-03-31 10:26:49.000000000 -0700
@@ -142,6 +142,7 @@ static inline unsigned int sk_filter_len
struct sk_buff;
struct sock;
+extern int sk_filter(struct sock *sk, struct sk_buff *skb);
extern unsigned int sk_run_filter(struct sk_buff *skb,
struct sock_filter *filter, int flen);
extern int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk);
--- a/include/net/sock.h 2008-03-31 10:23:12.000000000 -0700
+++ b/include/net/sock.h 2008-03-31 10:25:57.000000000 -0700
@@ -927,41 +927,6 @@ extern void sk_common_release(struct soc
extern void sock_init_data(struct socket *sock, struct sock *sk);
/**
- * sk_filter - run a packet through a socket filter
- * @sk: sock associated with &sk_buff
- * @skb: buffer to filter
- * @needlock: set to 1 if the sock is not locked by caller.
- *
- * Run the filter code and then cut skb->data to correct size returned by
- * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
- * than pkt_len we keep whole skb->data. This is the socket level
- * wrapper to sk_run_filter. It returns 0 if the packet should
- * be accepted or -EPERM if the packet should be tossed.
- *
- */
-
-static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
-{
- int err;
- struct sk_filter *filter;
-
- err = security_sock_rcv_skb(sk, skb);
- if (err)
- return err;
-
- rcu_read_lock_bh();
- filter = rcu_dereference(sk->sk_filter);
- if (filter) {
- unsigned int pkt_len = sk_run_filter(skb, filter->insns,
- filter->len);
- err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
- }
- rcu_read_unlock_bh();
-
- return err;
-}
-
-/**
* sk_filter_release: Release a socket filter
* @sk: socket
* @fp: filter to remove
--- a/net/core/filter.c 2008-03-31 10:25:29.000000000 -0700
+++ b/net/core/filter.c 2008-03-31 10:27:43.000000000 -0700
@@ -64,6 +64,41 @@ static inline void *load_pointer(struct
}
/**
+ * sk_filter - run a packet through a socket filter
+ * @sk: sock associated with &sk_buff
+ * @skb: buffer to filter
+ * @needlock: set to 1 if the sock is not locked by caller.
+ *
+ * Run the filter code and then cut skb->data to correct size returned by
+ * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * than pkt_len we keep whole skb->data. This is the socket level
+ * wrapper to sk_run_filter. It returns 0 if the packet should
+ * be accepted or -EPERM if the packet should be tossed.
+ *
+ */
+int sk_filter(struct sock *sk, struct sk_buff *skb)
+{
+ int err;
+ struct sk_filter *filter;
+
+ err = security_sock_rcv_skb(sk, skb);
+ if (err)
+ return err;
+
+ rcu_read_lock_bh();
+ filter = rcu_dereference(sk->sk_filter);
+ if (filter) {
+ unsigned int pkt_len = sk_run_filter(skb, filter->insns,
+ filter->len);
+ err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
+ }
+ rcu_read_unlock_bh();
+
+ return err;
+}
+EXPORT_SYMBOL(sk_filter);
+
+/**
* sk_run_filter - run a filter on a socket
* @skb: buffer to run the filter on
* @filter: filter to apply
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 2/6] socket: sk_filter deinline
2008-04-01 0:47 ` [PATCH 2/6] socket: sk_filter deinline Stephen Hemminger
@ 2008-04-10 8:49 ` David Miller
0 siblings, 0 replies; 19+ messages in thread
From: David Miller @ 2008-04-10 8:49 UTC (permalink / raw)
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 31 Mar 2008 17:47:10 -0700
> The sk_filter function is too big to be inlined. This saves 2296 bytes
> of text on allyesconfig.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Because of the bloat, I'll apply this to net-2.6.26.
But I think we can do better at many of the call sites.
Most of them are in the packet receive path, already have BHs disabled, and
already have a lot of the relevant variables laid out in registers.
I would like to see a new inline created that caters to these cases,
which does something like:
	if (rcu_dereference(sk->sk_filter))
		return sk_filter(sk, skb);
	else
		return 0;
The security check muddles things here, but that should be inlined in
such situations too I think.
It's all of that "pkt_len = this; pskb_trim(that);" that makes for
most of the bloat. And that can stay in the un-inlined sk_filter() we
have now.
Having a socket filter enabled on a TCP socket is rare, yet
now everyone pays for the function call unconditionally.
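For illustration only, the fast-path wrapper being described might look
roughly like the sketch below. The helper name sk_filter_fast() is invented
for this sketch and is not part of the posted series; it assumes the caller
is already in softirq context with BHs disabled, as in the receive paths
mentioned above.

static inline int sk_filter_fast(struct sock *sk, struct sk_buff *skb)
{
	/* Caller is assumed to have BHs disabled (receive path), so the
	 * RCU read side is already protected. */
	if (rcu_dereference(sk->sk_filter))
		return sk_filter(sk, skb);	/* un-inlined slow path */

	/* No filter attached: only the security hook needs to run. */
	return security_sock_rcv_skb(sk, skb);
}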
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
[not found] <20080401004708.009204033@vyatta.com>
2008-04-01 0:47 ` [PATCH 1/6] socket: sk_filter minor cleanups Stephen Hemminger
2008-04-01 0:47 ` [PATCH 2/6] socket: sk_filter deinline Stephen Hemminger
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-01 5:52 ` Eric Dumazet
2008-04-01 0:47 ` [PATCH 4/6] IPV4: route inline changes Stephen Hemminger
` (2 subsequent siblings)
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: route-compare --]
[-- Type: text/plain, Size: 1275 bytes --]
The comparison in ip_route_input is a hot path. By recoding the C
"&&" chain as bit operations, fewer conditional branches are generated,
so the code should be faster. Maybe someday GCC will be smart
enough to do this?
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/route.c 2008-03-31 10:57:30.000000000 -0700
+++ b/net/ipv4/route.c 2008-03-31 11:10:44.000000000 -0700
@@ -2079,14 +2079,14 @@ int ip_route_input(struct sk_buff *skb,
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.dst.rt_next)) {
- if (rth->fl.fl4_dst == daddr &&
- rth->fl.fl4_src == saddr &&
- rth->fl.iif == iif &&
- rth->fl.oif == 0 &&
- rth->fl.mark == skb->mark &&
- rth->fl.fl4_tos == tos &&
- net_eq(dev_net(rth->u.dst.dev), net) &&
- rth->rt_genid == atomic_read(&rt_genid)) {
+ if (((rth->fl.fl4_dst ^ daddr) |
+ (rth->fl.fl4_src ^ saddr) |
+ (rth->fl.iif ^ iif) |
+ rth->fl.oif |
+ (rth->fl.mark ^ skb->mark) |
+ (rth->fl.fl4_tos ^ tos) |
+ (rth->rt_genid ^ atomic_read(&rt_genid))) == 0 &&
+ net_eq(dev_net(rth->u.dst.dev), net)) {
dst_use(&rth->u.dst, jiffies);
RT_CACHE_STAT_INC(in_hit);
rcu_read_unlock();
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-01 0:47 ` [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare Stephen Hemminger
@ 2008-04-01 5:52 ` Eric Dumazet
2008-04-01 20:08 ` Stephen Hemminger
0 siblings, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2008-04-01 5:52 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David S. Miller, netdev
Stephen Hemminger wrote:
> The comparison in ip_route_input is a hot path, by recoding the C
> "and" as bit operations, fewer conditional branches get generated
> so the code should be faster. Maybe someday Gcc will be smart
> enough to do this?
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> --- a/net/ipv4/route.c 2008-03-31 10:57:30.000000000 -0700
> +++ b/net/ipv4/route.c 2008-03-31 11:10:44.000000000 -0700
> @@ -2079,14 +2079,14 @@ int ip_route_input(struct sk_buff *skb,
> rcu_read_lock();
> for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
> rth = rcu_dereference(rth->u.dst.rt_next)) {
> - if (rth->fl.fl4_dst == daddr &&
> - rth->fl.fl4_src == saddr &&
> - rth->fl.iif == iif &&
> - rth->fl.oif == 0 &&
> - rth->fl.mark == skb->mark &&
> - rth->fl.fl4_tos == tos &&
> - net_eq(dev_net(rth->u.dst.dev), net) &&
> - rth->rt_genid == atomic_read(&rt_genid)) {
> + if (((rth->fl.fl4_dst ^ daddr) |
> + (rth->fl.fl4_src ^ saddr) |
> + (rth->fl.iif ^ iif) |
> + rth->fl.oif |
> + (rth->fl.mark ^ skb->mark) |
> + (rth->fl.fl4_tos ^ tos) |
> + (rth->rt_genid ^ atomic_read(&rt_genid))) == 0 &&
> + net_eq(dev_net(rth->u.dst.dev), net)) {
> dst_use(&rth->u.dst, jiffies);
> RT_CACHE_STAT_INC(in_hit);
> rcu_read_unlock();
>
Are you sure all the fields share the same cache line, on 32-bit and 64-bit arches?
I prefer having some conditional branches instead of cache misses, given that
the first two branches are usually the discriminating ones.
Maybe we could let one test on (daddr, saddr) do a fast segregation of
candidates (touching one cache line at most), then one remaining compare on the other keys?
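As a reading aid (not a patch posted in this thread), Eric's two-stage idea
might be shaped roughly like this: a cheap xor test on the two addresses
first, then the remaining keys with the original short-circuiting compares.

	if (((rth->fl.fl4_dst ^ daddr) |
	     (rth->fl.fl4_src ^ saddr)) == 0 &&	/* addresses: one cache line */
	    rth->fl.iif == iif &&
	    rth->fl.oif == 0 &&
	    rth->fl.mark == skb->mark &&
	    rth->fl.fl4_tos == tos &&
	    net_eq(dev_net(rth->u.dst.dev), net) &&
	    rth->rt_genid == atomic_read(&rt_genid)) {
		/* cache hit handling unchanged */
	}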
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-01 5:52 ` Eric Dumazet
@ 2008-04-01 20:08 ` Stephen Hemminger
2008-04-10 8:51 ` David Miller
0 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 20:08 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David S. Miller, netdev
On Tue, 01 Apr 2008 07:52:03 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:
> Stephen Hemminger a écrit :
> > The comparison in ip_route_input is a hot path, by recoding the C
> > "and" as bit operations, fewer conditional branches get generated
> > so the code should be faster. Maybe someday Gcc will be smart
> > enough to do this?
> >
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> >
> > --- a/net/ipv4/route.c 2008-03-31 10:57:30.000000000 -0700
> > +++ b/net/ipv4/route.c 2008-03-31 11:10:44.000000000 -0700
> > @@ -2079,14 +2079,14 @@ int ip_route_input(struct sk_buff *skb,
> > rcu_read_lock();
> > for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
> > rth = rcu_dereference(rth->u.dst.rt_next)) {
> > - if (rth->fl.fl4_dst == daddr &&
> > - rth->fl.fl4_src == saddr &&
> > - rth->fl.iif == iif &&
> > - rth->fl.oif == 0 &&
> > - rth->fl.mark == skb->mark &&
> > - rth->fl.fl4_tos == tos &&
> > - net_eq(dev_net(rth->u.dst.dev), net) &&
> > - rth->rt_genid == atomic_read(&rt_genid)) {
> > + if (((rth->fl.fl4_dst ^ daddr) |
> > + (rth->fl.fl4_src ^ saddr) |
> > + (rth->fl.iif ^ iif) |
> > + rth->fl.oif |
> > + (rth->fl.mark ^ skb->mark) |
> > + (rth->fl.fl4_tos ^ tos) |
> > + (rth->rt_genid ^ atomic_read(&rt_genid))) == 0 &&
> > + net_eq(dev_net(rth->u.dst.dev), net)) {
> > dst_use(&rth->u.dst, jiffies);
> > RT_CACHE_STAT_INC(in_hit);
> > rcu_read_unlock();
> >
>
> Are you sure all fields share same cache lines, on 32bit and 64bit arches ?
The flow fields are all together, and the other parameters are local variables
in registers so that compare should be in one cache line.
--- a/net/ipv4/route.c 2008-03-31 17:12:30.000000000 -0700
+++ b/net/ipv4/route.c 2008-04-01 13:05:46.000000000 -0700
@@ -2079,12 +2079,12 @@ int ip_route_input(struct sk_buff *skb,
rcu_read_lock();
for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
rth = rcu_dereference(rth->u.dst.rt_next)) {
- if (rth->fl.fl4_dst == daddr &&
- rth->fl.fl4_src == saddr &&
- rth->fl.iif == iif &&
- rth->fl.oif == 0 &&
+ if (((rth->fl.fl4_dst ^ daddr) |
+ (rth->fl.fl4_src ^ saddr) |
+ (rth->fl.iif ^ iif) |
+ rth->fl.oif |
+ (rth->fl.fl4_tos ^ tos)) == 0 &&
rth->fl.mark == skb->mark &&
- rth->fl.fl4_tos == tos &&
net_eq(dev_net(rth->u.dst.dev), net) &&
rth->rt_genid == atomic_read(&rt_genid)) {
dst_use(&rth->u.dst, jiffies);
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-01 20:08 ` Stephen Hemminger
@ 2008-04-10 8:51 ` David Miller
2008-04-10 9:01 ` YOSHIFUJI Hideaki / 吉藤英明
2008-04-10 9:26 ` Eric Dumazet
0 siblings, 2 replies; 19+ messages in thread
From: David Miller @ 2008-04-10 8:51 UTC (permalink / raw)
To: shemminger; +Cc: dada1, netdev
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Tue, 1 Apr 2008 13:08:42 -0700
> The flow fields are all together, and the other parameters are local variables
> in registers so that compare should be in one cache line.
>
> --- a/net/ipv4/route.c 2008-03-31 17:12:30.000000000 -0700
> +++ b/net/ipv4/route.c 2008-04-01 13:05:46.000000000 -0700
> @@ -2079,12 +2079,12 @@ int ip_route_input(struct sk_buff *skb,
> rcu_read_lock();
> for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
> rth = rcu_dereference(rth->u.dst.rt_next)) {
> - if (rth->fl.fl4_dst == daddr &&
> - rth->fl.fl4_src == saddr &&
> - rth->fl.iif == iif &&
> - rth->fl.oif == 0 &&
> + if (((rth->fl.fl4_dst ^ daddr) |
> + (rth->fl.fl4_src ^ saddr) |
> + (rth->fl.iif ^ iif) |
> + rth->fl.oif |
> + (rth->fl.fl4_tos ^ tos)) == 0 &&
> rth->fl.mark == skb->mark &&
> - rth->fl.fl4_tos == tos &&
> net_eq(dev_net(rth->u.dst.dev), net) &&
> rth->rt_genid == atomic_read(&rt_genid)) {
> dst_use(&rth->u.dst, jiffies);
Eric, any objections to this version?
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-10 8:51 ` David Miller
@ 2008-04-10 9:01 ` YOSHIFUJI Hideaki / 吉藤英明
2008-04-10 10:56 ` David Miller
2008-04-10 9:26 ` Eric Dumazet
1 sibling, 1 reply; 19+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2008-04-10 9:01 UTC (permalink / raw)
To: davem; +Cc: shemminger, dada1, netdev, yoshfuji
In article <20080410.015118.103465510.davem@davemloft.net> (at Thu, 10 Apr 2008 01:51:18 -0700 (PDT)), David Miller <davem@davemloft.net> says:
> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Tue, 1 Apr 2008 13:08:42 -0700
>
> > The flow fields are all together, and the other parameters are local variables
> > in registers so that compare should be in one cache line.
> >
> > --- a/net/ipv4/route.c 2008-03-31 17:12:30.000000000 -0700
> > +++ b/net/ipv4/route.c 2008-04-01 13:05:46.000000000 -0700
> > @@ -2079,12 +2079,12 @@ int ip_route_input(struct sk_buff *skb,
> > rcu_read_lock();
> > for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
> > rth = rcu_dereference(rth->u.dst.rt_next)) {
> > - if (rth->fl.fl4_dst == daddr &&
> > - rth->fl.fl4_src == saddr &&
> > - rth->fl.iif == iif &&
> > - rth->fl.oif == 0 &&
> > + if (((rth->fl.fl4_dst ^ daddr) |
> > + (rth->fl.fl4_src ^ saddr) |
> > + (rth->fl.iif ^ iif) |
> > + rth->fl.oif |
> > + (rth->fl.fl4_tos ^ tos)) == 0 &&
> > rth->fl.mark == skb->mark &&
> > - rth->fl.fl4_tos == tos &&
> > net_eq(dev_net(rth->u.dst.dev), net) &&
> > rth->rt_genid == atomic_read(&rt_genid)) {
> > dst_use(&rth->u.dst, jiffies);
>
> Eric, any objections to this version?
I'm not Eric, but well, I'm now doubting if this is really good.
If the comparison chain is long and it is unlikely to pass all the tests,
it would be better to cut the chain early.
If we use "or", we need to run through all the tests in any case.
--yoshfuji
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-10 9:01 ` YOSHIFUJI Hideaki / 吉藤英明
@ 2008-04-10 10:56 ` David Miller
2008-04-10 12:17 ` YOSHIFUJI Hideaki / 吉藤英明
0 siblings, 1 reply; 19+ messages in thread
From: David Miller @ 2008-04-10 10:56 UTC (permalink / raw)
To: yoshfuji; +Cc: shemminger, dada1, netdev
From: YOSHIFUJI Hideaki / 吉藤英明 <yoshfuji@linux-ipv6.org>
Date: Thu, 10 Apr 2008 18:01:48 +0900 (JST)
> In article <20080410.015118.103465510.davem@davemloft.net> (at Thu, 10 Apr 2008 01:51:18 -0700 (PDT)), David Miller <davem@davemloft.net> says:
>
> > From: Stephen Hemminger <shemminger@vyatta.com>
> > Date: Tue, 1 Apr 2008 13:08:42 -0700
> >
> > > The flow fields are all together, and the other parameters are local variables
> > > in registers so that compare should be in one cache line.
> > >
> > > --- a/net/ipv4/route.c 2008-03-31 17:12:30.000000000 -0700
> > > +++ b/net/ipv4/route.c 2008-04-01 13:05:46.000000000 -0700
> > > @@ -2079,12 +2079,12 @@ int ip_route_input(struct sk_buff *skb,
> > > rcu_read_lock();
> > > for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
> > > rth = rcu_dereference(rth->u.dst.rt_next)) {
> > > - if (rth->fl.fl4_dst == daddr &&
> > > - rth->fl.fl4_src == saddr &&
> > > - rth->fl.iif == iif &&
> > > - rth->fl.oif == 0 &&
> > > + if (((rth->fl.fl4_dst ^ daddr) |
> > > + (rth->fl.fl4_src ^ saddr) |
> > > + (rth->fl.iif ^ iif) |
> > > + rth->fl.oif |
> > > + (rth->fl.fl4_tos ^ tos)) == 0 &&
> > > rth->fl.mark == skb->mark &&
> > > - rth->fl.fl4_tos == tos &&
> > > net_eq(dev_net(rth->u.dst.dev), net) &&
> > > rth->rt_genid == atomic_read(&rt_genid)) {
> > > dst_use(&rth->u.dst, jiffies);
> >
> > Eric, any objections to this version?
>
> I'm not Eric, but well, I'm now doubting if this is really good.
> If the comparision chain is long and it is unlikely to pass all the tests,
> it would be better to cut the line.
> If we use "or", we need to run through the test, in ayn case.
Actually, the case you mention is part of the incentive for
this change.
Branch prediction fares very poorly in such cases, and
therefore it is better to mispredict a single branch covering
all the data items in the same cache line than to risk
mispredicting any one of several such branches. The new
sequence above gets emitted by the compiler as several integer
operations and one branch. As long as all the data items are
in the same cache line, this is optimal.
We made such a change for ethernet address comparisons a
few years ago. At the time Eric showed that it mattered
a lot for Athlon processors.
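As a stand-alone illustration of that point (not code from this thread),
the two comparison styles can be put side by side; compiling with
gcc -O2 -S typically shows one conditional branch per field for the first
form and a single branch for the second.

struct key {
	unsigned int dst, src, iif, oif, mark, tos;
};

/* Chained "&&": short-circuits, so the compiler usually emits one
 * conditional branch per field. */
int match_branchy(const struct key *a, const struct key *b)
{
	return a->dst == b->dst && a->src == b->src &&
	       a->iif == b->iif && a->oif == b->oif &&
	       a->mark == b->mark && a->tos == b->tos;
}

/* xor/or form: pure integer operations folded into one test, so only
 * a single branch is needed. */
int match_branchless(const struct key *a, const struct key *b)
{
	return ((a->dst ^ b->dst) | (a->src ^ b->src) |
		(a->iif ^ b->iif) | (a->oif ^ b->oif) |
		(a->mark ^ b->mark) | (a->tos ^ b->tos)) == 0;
}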
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-10 10:56 ` David Miller
@ 2008-04-10 12:17 ` YOSHIFUJI Hideaki / 吉藤英明
0 siblings, 0 replies; 19+ messages in thread
From: YOSHIFUJI Hideaki / 吉藤英明 @ 2008-04-10 12:17 UTC (permalink / raw)
To: davem; +Cc: shemminger, dada1, netdev, yoshfuji
In article <20080410.035618.217931997.davem@davemloft.net> (at Thu, 10 Apr 2008 03:56:18 -0700 (PDT)), David Miller <davem@davemloft.net> says:
> Branch prediction fares very poorly in such cases, and
> therefore it is better to mispredict one branch over
> all the data items in the same cache line than any one
> of several such branches. The above new sequence gets
> emitted by the compiler as several integer operations and
> one branch. As long as all the data items are in the
> same cacheline, this is optimal.
Okay, thanks.
--yoshfuji
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare
2008-04-10 8:51 ` David Miller
2008-04-10 9:01 ` YOSHIFUJI Hideaki / 吉藤英明
@ 2008-04-10 9:26 ` Eric Dumazet
2008-04-10 11:00 ` David Miller
1 sibling, 1 reply; 19+ messages in thread
From: Eric Dumazet @ 2008-04-10 9:26 UTC (permalink / raw)
To: David Miller; +Cc: shemminger, netdev
David Miller wrote:
> From: Stephen Hemminger <shemminger@vyatta.com>
> Date: Tue, 1 Apr 2008 13:08:42 -0700
>
>
>> The flow fields are all together, and the other parameters are local variables
>> in registers so that compare should be in one cache line.
>>
>> --- a/net/ipv4/route.c 2008-03-31 17:12:30.000000000 -0700
>> +++ b/net/ipv4/route.c 2008-04-01 13:05:46.000000000 -0700
>> @@ -2079,12 +2079,12 @@ int ip_route_input(struct sk_buff *skb,
>> rcu_read_lock();
>> for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
>> rth = rcu_dereference(rth->u.dst.rt_next)) {
>> - if (rth->fl.fl4_dst == daddr &&
>> - rth->fl.fl4_src == saddr &&
>> - rth->fl.iif == iif &&
>> - rth->fl.oif == 0 &&
>> + if (((rth->fl.fl4_dst ^ daddr) |
>> + (rth->fl.fl4_src ^ saddr) |
>> + (rth->fl.iif ^ iif) |
>> + rth->fl.oif |
>> + (rth->fl.fl4_tos ^ tos)) == 0 &&
>> rth->fl.mark == skb->mark &&
>> - rth->fl.fl4_tos == tos &&
>> net_eq(dev_net(rth->u.dst.dev), net) &&
>> rth->rt_genid == atomic_read(&rt_genid)) {
>> dst_use(&rth->u.dst, jiffies);
>>
>
> Eric, any objections to this version?
>
No objections from me, thank you.
Acked-by: Eric Dumazet <dada1@cosmosbay.com>
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 4/6] IPV4: route inline changes
[not found] <20080401004708.009204033@vyatta.com>
` (2 preceding siblings ...)
2008-04-01 0:47 ` [PATCH 3/6] IPV4 : use xor rather than multiple ands for route compare Stephen Hemminger
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-10 8:53 ` David Miller
2008-04-01 0:47 ` [PATCH 5/6] IPV4: route use jhash3 Stephen Hemminger
2008-04-01 0:47 ` [PATCH 6/6] IPV4: route rekey timer can be deferrable Stephen Hemminger
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: route-inline --]
[-- Type: text/plain, Size: 3790 bytes --]
Don't mark large functions as inline; let the compiler decide.
Also, use inline rather than __inline__.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/route.c 2008-03-31 11:10:44.000000000 -0700
+++ b/net/ipv4/route.c 2008-03-31 11:14:50.000000000 -0700
@@ -600,18 +600,18 @@ static inline int ip_rt_proc_init(void)
}
#endif /* CONFIG_PROC_FS */
-static __inline__ void rt_free(struct rtable *rt)
+static inline void rt_free(struct rtable *rt)
{
call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
-static __inline__ void rt_drop(struct rtable *rt)
+static inline void rt_drop(struct rtable *rt)
{
ip_rt_put(rt);
call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
}
-static __inline__ int rt_fast_clean(struct rtable *rth)
+static inline int rt_fast_clean(struct rtable *rth)
{
/* Kill broadcast/multicast entries very aggresively, if they
collide in hash table with more useful entries */
@@ -619,7 +619,7 @@ static __inline__ int rt_fast_clean(stru
rth->fl.iif && rth->u.dst.rt_next;
}
-static __inline__ int rt_valuable(struct rtable *rth)
+static inline int rt_valuable(struct rtable *rth)
{
return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
rth->u.dst.expires;
@@ -1420,7 +1420,7 @@ out: kfree_skb(skb);
static const unsigned short mtu_plateau[] =
{32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
-static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
+static inline unsigned short guess_mtu(unsigned short old_mtu)
{
int i;
@@ -1750,11 +1750,11 @@ static void ip_handle_martian_source(str
#endif
}
-static inline int __mkroute_input(struct sk_buff *skb,
- struct fib_result* res,
- struct in_device *in_dev,
- __be32 daddr, __be32 saddr, u32 tos,
- struct rtable **result)
+static int __mkroute_input(struct sk_buff *skb,
+ struct fib_result *res,
+ struct in_device *in_dev,
+ __be32 daddr, __be32 saddr, u32 tos,
+ struct rtable **result)
{
struct rtable *rth;
@@ -1846,11 +1846,11 @@ static inline int __mkroute_input(struct
return err;
}
-static inline int ip_mkroute_input(struct sk_buff *skb,
- struct fib_result* res,
- const struct flowi *fl,
- struct in_device *in_dev,
- __be32 daddr, __be32 saddr, u32 tos)
+static int ip_mkroute_input(struct sk_buff *skb,
+ struct fib_result *res,
+ const struct flowi *fl,
+ struct in_device *in_dev,
+ __be32 daddr, __be32 saddr, u32 tos)
{
struct rtable* rth = NULL;
int err;
@@ -2132,12 +2132,12 @@ int ip_route_input(struct sk_buff *skb,
return ip_route_input_slow(skb, daddr, saddr, tos, dev);
}
-static inline int __mkroute_output(struct rtable **result,
- struct fib_result* res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
+static int __mkroute_output(struct rtable **result,
+ struct fib_result *res,
+ const struct flowi *fl,
+ const struct flowi *oldflp,
+ struct net_device *dev_out,
+ unsigned flags)
{
struct rtable *rth;
struct in_device *in_dev;
@@ -2252,12 +2252,12 @@ static inline int __mkroute_output(struc
return err;
}
-static inline int ip_mkroute_output(struct rtable **rp,
- struct fib_result* res,
- const struct flowi *fl,
- const struct flowi *oldflp,
- struct net_device *dev_out,
- unsigned flags)
+static int ip_mkroute_output(struct rtable **rp,
+ struct fib_result *res,
+ const struct flowi *fl,
+ const struct flowi *oldflp,
+ struct net_device *dev_out,
+ unsigned flags)
{
struct rtable *rth = NULL;
int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 5/6] IPV4: route use jhash3
[not found] <20080401004708.009204033@vyatta.com>
` (3 preceding siblings ...)
2008-04-01 0:47 ` [PATCH 4/6] IPV4: route inline changes Stephen Hemminger
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-10 8:54 ` David Miller
2008-04-01 0:47 ` [PATCH 6/6] IPV4: route rekey timer can be deferrable Stephen Hemminger
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: route-jhash3 --]
[-- Type: text/plain, Size: 1047 bytes --]
Since the route hash key is a triple, use jhash_3words rather than doing
the mixing directly. This should be as fast and give better distribution.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/route.c 2008-03-31 11:14:50.000000000 -0700
+++ b/net/ipv4/route.c 2008-03-31 11:15:20.000000000 -0700
@@ -259,16 +259,14 @@ static DEFINE_PER_CPU(struct rt_cache_st
#define RT_CACHE_STAT_INC(field) \
(__raw_get_cpu_var(rt_cache_stat).field++)
-static unsigned int rt_hash_code(u32 daddr, u32 saddr)
+static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
{
- return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
- & rt_hash_mask;
+ return jhash_3words((__force u32)(__be32)(daddr),
+ (__force u32)(__be32)(saddr),
+ idx, atomic_read(&rt_genid))
+ & rt_hash_mask;
}
-#define rt_hash(daddr, saddr, idx) \
- rt_hash_code((__force u32)(__be32)(daddr),\
- (__force u32)(__be32)(saddr) ^ ((idx) << 5))
-
#ifdef CONFIG_PROC_FS
struct rt_cache_iter_state {
struct seq_net_private p;
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 5/6] IPV4: route use jhash3
2008-04-01 0:47 ` [PATCH 5/6] IPV4: route use jhash3 Stephen Hemminger
@ 2008-04-10 8:54 ` David Miller
0 siblings, 0 replies; 19+ messages in thread
From: David Miller @ 2008-04-10 8:54 UTC (permalink / raw)
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 31 Mar 2008 17:47:13 -0700
> Since route hash is a triple, use jhash_3words rather doing the mixing
> directly. This should be as fast and give better distribution.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Applied to net-2.6.26, but:
> -static unsigned int rt_hash_code(u32 daddr, u32 saddr)
> +static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
Are you sure you want to inline that? jhash expands
to a lot of instructions.
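For reference, the non-inlined shape being hinted at would simply drop the
hint and keep a single out-of-line copy of the hash expansion; a sketch,
not a posted follow-up:

static unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx)
{
	/* One out-of-line copy of the jhash expansion for all callers. */
	return jhash_3words((__force u32)daddr, (__force u32)saddr,
			    idx, atomic_read(&rt_genid)) & rt_hash_mask;
}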
^ permalink raw reply [flat|nested] 19+ messages in thread
* [PATCH 6/6] IPV4: route rekey timer can be deferrable
[not found] <20080401004708.009204033@vyatta.com>
` (4 preceding siblings ...)
2008-04-01 0:47 ` [PATCH 5/6] IPV4: route use jhash3 Stephen Hemminger
@ 2008-04-01 0:47 ` Stephen Hemminger
2008-04-10 8:55 ` David Miller
5 siblings, 1 reply; 19+ messages in thread
From: Stephen Hemminger @ 2008-04-01 0:47 UTC (permalink / raw)
To: David S. Miller; +Cc: netdev
[-- Attachment #1: route-defer --]
[-- Type: text/plain, Size: 604 bytes --]
No urgency on the rehash interval timer, so mark it as deferrable.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/ipv4/route.c 2008-03-31 11:22:17.000000000 -0700
+++ b/net/ipv4/route.c 2008-03-31 11:22:19.000000000 -0700
@@ -3058,7 +3058,9 @@ int __init ip_rt_init(void)
devinet_init();
ip_fib_init();
- setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
+ rt_secret_timer.function = rt_secret_rebuild;
+ rt_secret_timer.data = 0;
+ init_timer_deferrable(&rt_secret_timer);
/* All the timers, started at system startup tend
to synchronize. Perturb it a bit.
--
^ permalink raw reply [flat|nested] 19+ messages in thread
* Re: [PATCH 6/6] IPV4: route rekey timer can be deferrable
2008-04-01 0:47 ` [PATCH 6/6] IPV4: route rekey timer can be deferrable Stephen Hemminger
@ 2008-04-10 8:55 ` David Miller
0 siblings, 0 replies; 19+ messages in thread
From: David Miller @ 2008-04-10 8:55 UTC (permalink / raw)
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@vyatta.com>
Date: Mon, 31 Mar 2008 17:47:14 -0700
> No urgency on the rehash interval timer, so mark it as deferrable.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Applied to net-2.6.26, thanks.
> - setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
> + rt_secret_timer.function = rt_secret_rebuild;
> + rt_secret_timer.data = 0;
> + init_timer_deferrable(&rt_secret_timer);
I've seen enough of these that a setup_timer_deferrable()
might be appropriate.
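A possible shape for such a helper, mirroring the existing setup_timer()
and init_timer_deferrable() pair; purely a sketch, since no
setup_timer_deferrable() existed in the tree at the time of this thread.

static inline void setup_timer_deferrable(struct timer_list *timer,
					  void (*function)(unsigned long),
					  unsigned long data)
{
	/* Same fields as setup_timer(), but marks the timer deferrable. */
	timer->function = function;
	timer->data = data;
	init_timer_deferrable(timer);
}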
^ permalink raw reply [flat|nested] 19+ messages in thread