* [PATCH] convert hh_lock to seqlock
@ 2006-12-07 19:33 Stephen Hemminger
2006-12-07 20:23 ` Eric Dumazet
2006-12-07 23:08 ` [PATCH] convert hh_lock to seqlock David Miller
0 siblings, 2 replies; 10+ messages in thread
From: Stephen Hemminger @ 2006-12-07 19:33 UTC (permalink / raw)
To: David Miller; +Cc: netdev
The hard header cache is in the main output path, so using
seqlock instead of reader/writer lock should reduce overhead.
Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
---
include/linux/netdevice.h | 2 +-
include/net/neighbour.h | 18 ++++++++++++++++++
net/core/neighbour.c | 11 ++++++-----
net/ipv4/ip_output.c | 14 +++-----------
net/ipv6/ip6_output.c | 15 +++------------
5 files changed, 31 insertions(+), 29 deletions(-)
--- linux-2.6.19.orig/include/linux/netdevice.h 2006-12-05 13:55:54.000000000 -0800
+++ linux-2.6.19/include/linux/netdevice.h 2006-12-06 10:29:15.000000000 -0800
@@ -199,7 +199,7 @@
*/
u16 hh_len; /* length of header */
int (*hh_output)(struct sk_buff *skb);
- rwlock_t hh_lock;
+ seqlock_t hh_lock;
/* cached hardware header; allow for machine alignment needs. */
#define HH_DATA_MOD 16
--- linux-2.6.19.orig/include/net/neighbour.h 2006-12-05 13:36:16.000000000 -0800
+++ linux-2.6.19/include/net/neighbour.h 2006-12-06 14:53:52.000000000 -0800
@@ -309,6 +309,24 @@
return 0;
}
+static inline int neigh_hh_output(struct hh_cache *hh, struct sk_buff *skb)
+{
+ unsigned seq;
+ int hh_len;
+
+ do {
+ int hh_alen;
+
+ seq = read_seqbegin(&hh->hh_lock);
+ hh_len = hh->hh_len;
+ hh_alen = HH_DATA_ALIGN(hh_len);
+ memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
+ } while (read_seqretry(&hh->hh_lock, seq));
+
+ skb_push(skb, hh_len);
+ return hh->hh_output(skb);
+}
+
static inline struct neighbour *
__neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat)
{
--- linux-2.6.19.orig/net/core/neighbour.c 2006-12-05 13:55:54.000000000 -0800
+++ linux-2.6.19/net/core/neighbour.c 2006-12-06 10:29:15.000000000 -0800
@@ -577,9 +577,10 @@
while ((hh = neigh->hh) != NULL) {
neigh->hh = hh->hh_next;
hh->hh_next = NULL;
- write_lock_bh(&hh->hh_lock);
+
+ write_seqlock_bh(&hh->hh_lock);
hh->hh_output = neigh_blackhole;
- write_unlock_bh(&hh->hh_lock);
+ write_sequnlock_bh(&hh->hh_lock);
if (atomic_dec_and_test(&hh->hh_refcnt))
kfree(hh);
}
@@ -897,9 +898,9 @@
if (update) {
for (hh = neigh->hh; hh; hh = hh->hh_next) {
- write_lock_bh(&hh->hh_lock);
+ write_seqlock_bh(&hh->hh_lock);
update(hh, neigh->dev, neigh->ha);
- write_unlock_bh(&hh->hh_lock);
+ write_sequnlock_bh(&hh->hh_lock);
}
}
}
@@ -1089,7 +1090,7 @@
break;
if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) {
- rwlock_init(&hh->hh_lock);
+ seqlock_init(&hh->hh_lock);
hh->hh_type = protocol;
atomic_set(&hh->hh_refcnt, 0);
hh->hh_next = NULL;
--- linux-2.6.19.orig/net/ipv4/ip_output.c 2006-12-05 13:55:54.000000000 -0800
+++ linux-2.6.19/net/ipv4/ip_output.c 2006-12-06 10:29:15.000000000 -0800
@@ -164,7 +164,6 @@
static inline int ip_finish_output2(struct sk_buff *skb)
{
struct dst_entry *dst = skb->dst;
- struct hh_cache *hh = dst->hh;
struct net_device *dev = dst->dev;
int hh_len = LL_RESERVED_SPACE(dev);
@@ -183,16 +182,9 @@
skb = skb2;
}
- if (hh) {
- int hh_alen;
-
- read_lock_bh(&hh->hh_lock);
- hh_alen = HH_DATA_ALIGN(hh->hh_len);
- memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
- read_unlock_bh(&hh->hh_lock);
- skb_push(skb, hh->hh_len);
- return hh->hh_output(skb);
- } else if (dst->neighbour)
+ if (dst->hh)
+ return neigh_hh_output(dst->hh, skb);
+ else if (dst->neighbour)
return dst->neighbour->output(skb);
if (net_ratelimit())
--- linux-2.6.19.orig/net/ipv6/ip6_output.c 2006-12-05 13:55:54.000000000 -0800
+++ linux-2.6.19/net/ipv6/ip6_output.c 2006-12-06 10:33:24.000000000 -0800
@@ -72,20 +72,11 @@
static inline int ip6_output_finish(struct sk_buff *skb)
{
-
struct dst_entry *dst = skb->dst;
- struct hh_cache *hh = dst->hh;
-
- if (hh) {
- int hh_alen;
- read_lock_bh(&hh->hh_lock);
- hh_alen = HH_DATA_ALIGN(hh->hh_len);
- memcpy(skb->data - hh_alen, hh->hh_data, hh_alen);
- read_unlock_bh(&hh->hh_lock);
- skb_push(skb, hh->hh_len);
- return hh->hh_output(skb);
- } else if (dst->neighbour)
+ if (dst->hh)
+ return neigh_hh_output(dst->hh, skb);
+ else if (dst->neighbour)
return dst->neighbour->output(skb);
IP6_INC_STATS_BH(ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] convert hh_lock to seqlock
2006-12-07 19:33 [PATCH] convert hh_lock to seqlock Stephen Hemminger
@ 2006-12-07 20:23 ` Eric Dumazet
2006-12-07 21:15 ` Stephen Hemminger
2006-12-07 23:08 ` [PATCH] convert hh_lock to seqlock David Miller
1 sibling, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2006-12-07 20:23 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Stephen Hemminger a écrit :
> The hard header cache is in the main output path, so using
> seqlock instead of reader/writer lock should reduce overhead.
>
Nice work Stephen, I am very interested.
Did you benchmark it?
I ask because I think hh_refcnt frequent changes may defeat the gain you want
(ie avoiding cache line ping pongs between cpus). seqlocks are definitely better
than rwlocks, but only if we really keep cache lines shared.
So I would suggest reordering fields of hh_cache and adding one
____cacheline_aligned_in_smp to keep hh_refcnt in another cache line.
(hh_len, hh_lock and hh_data should be placed on a 'mostly read' cache line)
Thank you
Eric
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] convert hh_lock to seqlock
2006-12-07 20:23 ` Eric Dumazet
@ 2006-12-07 21:15 ` Stephen Hemminger
2006-12-07 22:27 ` Eric Dumazet
0 siblings, 1 reply; 10+ messages in thread
From: Stephen Hemminger @ 2006-12-07 21:15 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev
On Thu, 07 Dec 2006 21:23:07 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:
> Stephen Hemminger a écrit :
> > The hard header cache is in the main output path, so using
> > seqlock instead of reader/writer lock should reduce overhead.
> >
>
> Nice work Stephen, I am very interested.
>
> Did you benchmarked it ?
>
> I ask because I think hh_refcnt frequent changes may defeat the gain you want
> (ie avoiding cache line ping pongs between cpus). seqlock are definitly better
> than rwlock, but if we really keep cache lines shared.
>
> So I would suggest reordering fields of hh_cache and adding one
> ____cacheline_aligned_in_smp to keep hh_refcnt in another cache line.
>
> (hh_len, hh_lock and hh_data should be placed on a 'mostly read' cache line)
>
> Thank you
> Eric
It doesn't make any visible performance difference for real networks;
copies and device issues are much larger.
The hh_refcnt is used only when creating/destroying neighbor entries,
so except under DoS attack it doesn't make a lot of difference.
The hh_lock is used on each packet sent.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] convert hh_lock to seqlock
2006-12-07 21:15 ` Stephen Hemminger
@ 2006-12-07 22:27 ` Eric Dumazet
2006-12-07 22:53 ` Stephen Hemminger
0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2006-12-07 22:27 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Stephen Hemminger a écrit :
> On Thu, 07 Dec 2006 21:23:07 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
>
>> Stephen Hemminger a écrit :
>>> The hard header cache is in the main output path, so using
>>> seqlock instead of reader/writer lock should reduce overhead.
>>>
>> Nice work Stephen, I am very interested.
>>
>> Did you benchmarked it ?
>>
>> I ask because I think hh_refcnt frequent changes may defeat the gain you want
>> (ie avoiding cache line ping pongs between cpus). seqlock are definitly better
>> than rwlock, but if we really keep cache lines shared.
>>
>> So I would suggest reordering fields of hh_cache and adding one
>> ____cacheline_aligned_in_smp to keep hh_refcnt in another cache line.
>>
>> (hh_len, hh_lock and hh_data should be placed on a 'mostly read' cache line)
>>
>> Thank you
>> Eric
>
> It doesn't make any visible performance difference for real networks;
> copies and device issues are much larger.
Hum, so 'my' machines must be unreal :)
>
> The hh_refcnt is used only when creating destroying neighbor entries,
> so except under DoS attack it doesn't make a lot of difference.
> The hh_lock is used on each packet sent.
Some machines create/delete 10.000 entries per second in rt_cache.
I believe they are real. DoS ? you tell it, some people won't agree.
# grep eth0 /proc/net/rt_cache|head -n 10000|cut -f13|sort -u|wc -l
359
(13th field of /proc/net/rt_cache is HHRef)
# rtstat -c1000 -i1
rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|rt_cache|
entries| in_hit|in_slow_|in_slow_|in_no_ro| in_brd|in_marti|in_marti|
out_hit|out_slow|out_slow|gc_total|gc_ignor|gc_goal_|gc_dst_o|in_hlist|out_hlis|
| | tot| mc| ute| | an_dst| an_src|
| _tot| _mc| | ed| miss| verflow| _search|t_search|
2467048|2479640328|1334812199| 0| 0| 34| 0|
117112|6139056109|7510556324| 0| 0| 0| 0|
0|8864696485|9819074477|
2465642| 16594| 4791| 0| 0| 0| 0| 0|
2387| 2738| 0| 0| 0| 0| 0| 21878| 7478|
2464482| 16505| 4765| 0| 0| 0| 0| 0|
2460| 2669| 0| 0| 0| 0| 0| 22224| 7499|
2463512| 17281| 4640| 0| 0| 0| 0| 0|
2449| 2632| 0| 0| 0| 0| 0| 22069| 7240|
2462651| 16504| 4314| 0| 0| 0| 0| 0|
2446| 2497| 0| 0| 0| 0| 0| 20796| 6979|
2462175| 18152| 5792| 0| 0| 0| 0| 0|
2448| 2791| 0| 0| 0| 0| 0| 26164| 7731|
2461889| 16970| 5059| 0| 0| 0| 0| 0|
2535| 2829| 0| 0| 0| 0| 0| 22614| 7595|
2461719| 16446| 4643| 0| 0| 0| 0| 0|
2496| 2717| 0| 0| 0| 0| 0| 21347| 7354|
2461775| 17098| 4782| 0| 0| 0| 0| 0|
2386| 2570| 0| 0| 0| 0| 0| 22448| 7049|
Thank you
Eric
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] convert hh_lock to seqlock
2006-12-07 22:27 ` Eric Dumazet
@ 2006-12-07 22:53 ` Stephen Hemminger
2006-12-07 23:02 ` Eric Dumazet
0 siblings, 1 reply; 10+ messages in thread
From: Stephen Hemminger @ 2006-12-07 22:53 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev
On Thu, 07 Dec 2006 23:27:00 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:
> Stephen Hemminger a écrit :
> > On Thu, 07 Dec 2006 21:23:07 +0100
> > Eric Dumazet <dada1@cosmosbay.com> wrote:
> >
> >> Stephen Hemminger a écrit :
> >>> The hard header cache is in the main output path, so using
> >>> seqlock instead of reader/writer lock should reduce overhead.
> >>>
> >> Nice work Stephen, I am very interested.
> >>
> >> Did you benchmarked it ?
> >>
> >> I ask because I think hh_refcnt frequent changes may defeat the gain you want
> >> (ie avoiding cache line ping pongs between cpus). seqlock are definitly better
> >> than rwlock, but if we really keep cache lines shared.
> >>
> >> So I would suggest reordering fields of hh_cache and adding one
> >> ____cacheline_aligned_in_smp to keep hh_refcnt in another cache line.
> >>
> >> (hh_len, hh_lock and hh_data should be placed on a 'mostly read' cache line)
> >>
> >> Thank you
> >> Eric
> >
> > It doesn't make any visible performance difference for real networks;
> > copies and device issues are much larger.
>
> Hum, so 'my' machines must be unreal :)
>
> >
> > The hh_refcnt is used only when creating destroying neighbor entries,
> > so except under DoS attack it doesn't make a lot of difference.
> > The hh_lock is used on each packet sent.
>
> Some machines create/delete 10.000 entries per second in rt_cache.
> I believe they are real. DoS ? you tell it, some people wont agree.
That could be fixed by doing RCU, I did some of that previously, but it
seemed better to hit the worst case first. Even Robert doesn't see 10,000
rt cache entries per second.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] convert hh_lock to seqlock
2006-12-07 22:53 ` Stephen Hemminger
@ 2006-12-07 23:02 ` Eric Dumazet
2006-12-07 23:07 ` David Miller
0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2006-12-07 23:02 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Stephen Hemminger a écrit :
> On Thu, 07 Dec 2006 23:27:00 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
>
>> Stephen Hemminger a écrit :
>>> On Thu, 07 Dec 2006 21:23:07 +0100
>>> Eric Dumazet <dada1@cosmosbay.com> wrote:
>>>
>>>> Stephen Hemminger a écrit :
>>>>> The hard header cache is in the main output path, so using
>>>>> seqlock instead of reader/writer lock should reduce overhead.
>>>>>
>>>> Nice work Stephen, I am very interested.
>>>>
>>>> Did you benchmarked it ?
>>>>
>>>> I ask because I think hh_refcnt frequent changes may defeat the gain you want
>>>> (ie avoiding cache line ping pongs between cpus). seqlock are definitly better
>>>> than rwlock, but if we really keep cache lines shared.
>>>>
>>>> So I would suggest reordering fields of hh_cache and adding one
>>>> ____cacheline_aligned_in_smp to keep hh_refcnt in another cache line.
>>>>
>>>> (hh_len, hh_lock and hh_data should be placed on a 'mostly read' cache line)
>>>>
>>>> Thank you
>>>> Eric
>>> It doesn't make any visible performance difference for real networks;
>>> copies and device issues are much larger.
>> Hum, so 'my' machines must be unreal :)
>>
>>> The hh_refcnt is used only when creating destroying neighbor entries,
>>> so except under DoS attack it doesn't make a lot of difference.
>>> The hh_lock is used on each packet sent.
>> Some machines create/delete 10.000 entries per second in rt_cache.
>> I believe they are real. DoS ? you tell it, some people wont agree.
>
>
> That could be fixed by doing RCU, I did some of that previously, but it
> seemed better to hit the worst case first. Even Robert doesn't see 10,000
> rt cache entries per second.
What's the problem with my suggestion of keeping hh_refcnt on another cache line ?
It is basically free (once your change from rwlock to seqlock put in), and no
change of algorithm.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] convert hh_lock to seqlock
2006-12-07 23:02 ` Eric Dumazet
@ 2006-12-07 23:07 ` David Miller
2006-12-08 8:06 ` [PATCH] NET : force a cache line split in hh_cache in SMP Eric Dumazet
0 siblings, 1 reply; 10+ messages in thread
From: David Miller @ 2006-12-07 23:07 UTC (permalink / raw)
To: dada1; +Cc: shemminger, netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 08 Dec 2006 00:02:44 +0100
> What's the problem with my suggestion of keeping hh_refcnt on
> another cache line ? It is basically free (once your change from
> rwlock to seqlock put in), and no change of algorithm.
I think this change is worthwhile for the short term
until we get time to implement something like the RCU
idea.
^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH] convert hh_lock to seqlock
2006-12-07 19:33 [PATCH] convert hh_lock to seqlock Stephen Hemminger
2006-12-07 20:23 ` Eric Dumazet
@ 2006-12-07 23:08 ` David Miller
1 sibling, 0 replies; 10+ messages in thread
From: David Miller @ 2006-12-07 23:08 UTC (permalink / raw)
To: shemminger; +Cc: netdev
From: Stephen Hemminger <shemminger@osdl.org>
Date: Thu, 7 Dec 2006 11:33:09 -0800
> The hard header cache is in the main output path, so using
> seqlock instead of reader/writer lock should reduce overhead.
>
> Signed-off-by: Stephen Hemminger <shemminger@osdl.org>
I've applied this, thanks Stephen.
^ permalink raw reply [flat|nested] 10+ messages in thread
* [PATCH] NET : force a cache line split in hh_cache in SMP
2006-12-07 23:07 ` David Miller
@ 2006-12-08 8:06 ` Eric Dumazet
2006-12-08 8:08 ` David Miller
0 siblings, 1 reply; 10+ messages in thread
From: Eric Dumazet @ 2006-12-08 8:06 UTC (permalink / raw)
To: David Miller; +Cc: shemminger, netdev
[-- Attachment #1: Type: text/plain, Size: 300 bytes --]
hh_lock was converted from rwlock to seqlock by Stephen.
To have a 100% benefit of this change, I suggest to place read mostly fields
of hh_cache in a separate cache line, because hh_refcnt may be changed quite
frequently on some busy machines.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: hh_cache.patch --]
[-- Type: text/plain, Size: 869 bytes --]
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 631cec4..6be767c 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -193,7 +193,14 @@ struct hh_cache
{
struct hh_cache *hh_next; /* Next entry */
atomic_t hh_refcnt; /* number of users */
- __be16 hh_type; /* protocol identifier, f.e ETH_P_IP
+/*
+ * We want hh_output, hh_len, hh_lock and hh_data to be in a separate
+ * cache line on SMP.
+ * They are mostly read, but hh_refcnt may be changed quite frequently,
+ * incurring cache line ping pongs.
+ */
+ __be16 hh_type ____cacheline_aligned_in_smp;
+ /* protocol identifier, f.e ETH_P_IP
* NOTE: For VLANs, this will be the
* encapuslated type. --BLG
*/
^ permalink raw reply related [flat|nested] 10+ messages in thread
* Re: [PATCH] NET : force a cache line split in hh_cache in SMP
2006-12-08 8:06 ` [PATCH] NET : force a cache line split in hh_cache in SMP Eric Dumazet
@ 2006-12-08 8:08 ` David Miller
0 siblings, 0 replies; 10+ messages in thread
From: David Miller @ 2006-12-08 8:08 UTC (permalink / raw)
To: dada1; +Cc: shemminger, netdev
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Fri, 08 Dec 2006 09:06:32 +0100
> hh_lock was converted from rwlock to seqlock by Stephen.
>
> To have a 100% benefit of this change, I suggest to place read mostly fields
> of hh_cache in a separate cache line, because hh_refcnt may be changed quite
> frequently on some busy machines.
>
> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Applied, thanks Eric.
^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2006-12-08 8:08 UTC | newest]
Thread overview: 10+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2006-12-07 19:33 [PATCH] convert hh_lock to seqlock Stephen Hemminger
2006-12-07 20:23 ` Eric Dumazet
2006-12-07 21:15 ` Stephen Hemminger
2006-12-07 22:27 ` Eric Dumazet
2006-12-07 22:53 ` Stephen Hemminger
2006-12-07 23:02 ` Eric Dumazet
2006-12-07 23:07 ` David Miller
2006-12-08 8:06 ` [PATCH] NET : force a cache line split in hh_cache in SMP Eric Dumazet
2006-12-08 8:08 ` David Miller
2006-12-07 23:08 ` [PATCH] convert hh_lock to seqlock David Miller
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).