From mboxrd@z Thu Jan 1 00:00:00 1970 From: Eric Dumazet Subject: Re: [PATCH] netfilter: use per-cpu spinlock rather than RCU Date: Tue, 14 Apr 2009 16:23:33 +0200 Message-ID: <49E49C65.8060808@cosmosbay.com> References: <20090411174801.GG6822@linux.vnet.ibm.com> <18913.53699.544083.320542@cargo.ozlabs.ibm.com> <20090412173108.GO6822@linux.vnet.ibm.com> <20090412.181330.23529546.davem@davemloft.net> <20090413040413.GQ6822@linux.vnet.ibm.com> <20090413095309.631cf395@nehalam> <49E48136.5060700@trash.net> Mime-Version: 1.0 Content-Type: text/plain; charset=ISO-8859-15 Content-Transfer-Encoding: QUOTED-PRINTABLE Cc: Stephen Hemminger , paulmck@linux.vnet.ibm.com, David Miller , paulus@samba.org, mingo@elte.hu, torvalds@linux-foundation.org, laijs@cn.fujitsu.com, jeff.chua.linux@gmail.com, jengelh@medozas.de, r000n@r000n.net, linux-kernel@vger.kernel.org, netfilter-devel@vger.kernel.org, netdev@vger.kernel.org, benh@kernel.crashing.org To: Patrick McHardy Return-path: Received: from gw1.cosmosbay.com ([212.99.114.194]:51845 "EHLO gw1.cosmosbay.com" rhost-flags-OK-OK-OK-OK) by vger.kernel.org with ESMTP id S1752243AbZDNOZ0 convert rfc822-to-8bit (ORCPT ); Tue, 14 Apr 2009 10:25:26 -0400 In-Reply-To: <49E48136.5060700@trash.net> Sender: netdev-owner@vger.kernel.org List-ID: Patrick McHardy a =E9crit : > Stephen Hemminger wrote: >> This is an alternative version of ip/ip6/arp tables locking using >> per-cpu locks. This avoids the overhead of synchronize_net() during >> update but still removes the expensive rwlock in earlier versions. >> >> The idea for this came from an earlier version done by Eric Duzamet. >> Locking is done per-cpu, the fast path locks on the current cpu >> and updates counters. The slow case involves acquiring the locks on >> all cpu's. >> >> The mutex that was added for 2.6.30 in xt_table is unnecessary since >> there already is a mutex for xt[af].mutex that is held. >> >> Tested basic functionality (add/remove/list), but don't have test ca= ses >> for stress, ip6tables or arptables. >=20 > Thanks Stephen, I'll do some testing with ip6tables. Here is the patch I cooked on top of Stephen one to get proper locking. In the "iptables -L" case, we freeze updates on all cpus to get previou= s RCU behavior (not sure it is mandatory, but anyway...) And xt_replace_table() uses same logic to make sure a cpu wont try to p= arse rules and update counters while a writer is replacing tables (and thus callin= g vfree() and unmapping in-use pages) =46eel free to merge this patch to Stephen one before upstream submissi= on Thank you Signed-off-by: Eric Dumazet --- include/linux/netfilter/x_tables.h | 5 ++ net/ipv4/netfilter/arp_tables.c | 20 +++------ net/ipv4/netfilter/ip_tables.c | 24 ++++------- net/ipv6/netfilter/ip6_tables.c | 24 ++++------- net/netfilter/x_tables.c | 55 ++++++++++++++++++++++++++- 5 files changed, 84 insertions(+), 44 deletions(-) diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilt= er/x_tables.h index 1ff1a76..a5840a4 100644 --- a/include/linux/netfilter/x_tables.h +++ b/include/linux/netfilter/x_tables.h @@ -426,6 +426,11 @@ extern struct xt_table *xt_find_table_lock(struct = net *net, u_int8_t af, const char *name); extern void xt_table_unlock(struct xt_table *t); =20 +extern void xt_tlock_lockall(void); +extern void xt_tlock_unlockall(void); +extern void xt_tlock_lock(void); +extern void xt_tlock_unlock(void); + extern int xt_proto_init(struct net *net, u_int8_t af); extern void xt_proto_fini(struct net *net, u_int8_t af); =20 diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_t= ables.c index c60cc11..b561e1e 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -231,8 +231,6 @@ static inline struct arpt_entry *get_entry(void *ba= se, unsigned int offset) return (struct arpt_entry *)(base + offset); } =20 -static DEFINE_PER_CPU(spinlock_t, arp_tables_lock); - unsigned int arpt_do_table(struct sk_buff *skb, unsigned int hook, const struct net_device *in, @@ -256,7 +254,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, outdev =3D out ? out->name : nulldevname; =20 local_bh_disable(); - spin_lock(&__get_cpu_var(arp_tables_lock)); + xt_tlock_lock(); private =3D table->private; table_base =3D private->entries[smp_processor_id()]; =20 @@ -331,7 +329,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, e =3D (void *)e + e->next_offset; } } while (!hotdrop); - spin_unlock(&__get_cpu_var(arp_tables_lock)); + xt_tlock_unlock(); local_bh_enable(); =20 if (hotdrop) @@ -709,33 +707,31 @@ static void get_counters(const struct xt_table_in= fo *t, { unsigned int cpu; unsigned int i =3D 0; - unsigned int curcpu =3D raw_smp_processor_id(); + unsigned int curcpu; =20 + xt_tlock_lockall(); /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. */ - spin_lock_bh(&per_cpu(arp_tables_lock, curcpu)); + curcpu =3D smp_processor_id(); ARPT_ENTRY_ITERATE(t->entries[curcpu], t->size, set_entry_to_counter, counters, &i); - spin_unlock_bh(&per_cpu(arp_tables_lock, curcpu)); =20 for_each_possible_cpu(cpu) { if (cpu =3D=3D curcpu) continue; i =3D 0; - spin_lock_bh(&per_cpu(arp_tables_lock, cpu)); ARPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); - spin_unlock_bh(&per_cpu(arp_tables_lock, cpu)); } + xt_tlock_unlockall(); } =20 static struct xt_counters *alloc_counters(struct xt_table *table) @@ -1181,14 +1177,14 @@ static int do_add_counters(struct net *net, voi= d __user *user, unsigned int len, /* Choose the copy that is on our node */ local_bh_disable(); curcpu =3D smp_processor_id(); - spin_lock(&__get_cpu_var(arp_tables_lock)); + xt_tlock_lock(); loc_cpu_entry =3D private->entries[curcpu]; ARPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - spin_unlock(&__get_cpu_var(arp_tables_lock)); + xt_tlock_unlock(); local_bh_enable(); unlock_up_free: =20 diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tab= les.c index cb3b779..81d173e 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -297,7 +297,6 @@ static void trace_packet(struct sk_buff *skb, } #endif =20 -static DEFINE_PER_CPU(spinlock_t, ip_tables_lock); =20 /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int @@ -342,7 +341,7 @@ ipt_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); =20 local_bh_disable(); - spin_lock(&__get_cpu_var(ip_tables_lock)); + xt_tlock_lock(); private =3D table->private; table_base =3D private->entries[smp_processor_id()]; =20 @@ -439,7 +438,7 @@ ipt_do_table(struct sk_buff *skb, e =3D (void *)e + e->next_offset; } } while (!hotdrop); - spin_unlock(&__get_cpu_var(ip_tables_lock)); + xt_tlock_unlock(); local_bh_enable(); =20 #ifdef DEBUG_ALLOW_ALL @@ -895,34 +894,32 @@ get_counters(const struct xt_table_info *t, { unsigned int cpu; unsigned int i =3D 0; - unsigned int curcpu =3D raw_smp_processor_id(); + unsigned int curcpu; =20 + xt_tlock_lockall(); /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. */ - spin_lock_bh(&per_cpu(ip_tables_lock, curcpu)); + curcpu =3D smp_processor_id(); IPT_ENTRY_ITERATE(t->entries[curcpu], t->size, set_entry_to_counter, counters, &i); - spin_unlock_bh(&per_cpu(ip_tables_lock, curcpu)); =20 for_each_possible_cpu(cpu) { if (cpu =3D=3D curcpu) continue; =20 i =3D 0; - spin_lock_bh(&per_cpu(ip_tables_lock, cpu)); IPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); - spin_unlock_bh(&per_cpu(ip_tables_lock, cpu)); } + xt_tlock_unlockall(); } =20 static struct xt_counters * alloc_counters(struct xt_table *table) @@ -1393,14 +1390,14 @@ do_add_counters(struct net *net, void __user *u= ser, unsigned int len, int compat local_bh_disable(); /* Choose the copy that is on our node */ curcpu =3D smp_processor_id(); - spin_lock(&__get_cpu_var(ip_tables_lock)); + xt_tlock_lock(); loc_cpu_entry =3D private->entries[curcpu]; IPT_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - spin_unlock(&__get_cpu_var(ip_tables_lock)); + xt_tlock_unlock(); local_bh_enable(); =20 unlock_up_free: @@ -2220,10 +2217,7 @@ static struct pernet_operations ip_tables_net_op= s =3D { =20 static int __init ip_tables_init(void) { - int cpu, ret; - - for_each_possible_cpu(cpu) - spin_lock_init(&per_cpu(ip_tables_lock, cpu)); + int ret; =20 ret =3D register_pernet_subsys(&ip_tables_net_ops); if (ret < 0) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_t= ables.c index ac46ca4..d6ba69e 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -329,7 +329,6 @@ static void trace_packet(struct sk_buff *skb, } #endif =20 -static DEFINE_PER_CPU(spinlock_t, ip6_tables_lock); =20 /* Returns one of the generic firewall policies, like NF_ACCEPT. */ unsigned int @@ -368,7 +367,7 @@ ip6t_do_table(struct sk_buff *skb, IP_NF_ASSERT(table->valid_hooks & (1 << hook)); =20 local_bh_disable(); - spin_lock(&__get_cpu_var(ip6_tables_lock)); + xt_tlock_lock(); private =3D table->private; table_base =3D private->entries[smp_processor_id()]; =20 @@ -469,7 +468,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom =3D NETFILTER_LINK_POISON= ; #endif - spin_unlock(&__get_cpu_var(ip6_tables_lock)); + xt_tlock_unlock(); local_bh_enable(); =20 #ifdef DEBUG_ALLOW_ALL @@ -925,33 +924,31 @@ get_counters(const struct xt_table_info *t, { unsigned int cpu; unsigned int i =3D 0; - unsigned int curcpu =3D raw_smp_processor_id(); + unsigned int curcpu; =20 + xt_tlock_lockall(); /* Instead of clearing (by a previous call to memset()) * the counters and using adds, we set the counters * with data used by 'current' CPU - * We dont care about preemption here. */ - spin_lock_bh(&per_cpu(ip6_tables_lock, curcpu)); + curcpu =3D smp_processor_id(); IP6T_ENTRY_ITERATE(t->entries[curcpu], t->size, set_entry_to_counter, counters, &i); - spin_unlock_bh(&per_cpu(ip6_tables_lock, curcpu)); =20 for_each_possible_cpu(cpu) { if (cpu =3D=3D curcpu) continue; i =3D 0; - spin_lock_bh(&per_cpu(ip6_tables_lock, cpu)); IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, &i); - spin_unlock_bh(&per_cpu(ip6_tables_lock, cpu)); } + xt_tlock_unlockall(); } =20 static struct xt_counters *alloc_counters(struct xt_table *table) @@ -1423,14 +1420,14 @@ do_add_counters(struct net *net, void __user *u= ser, unsigned int len, local_bh_disable(); /* Choose the copy that is on our node */ curcpu =3D smp_processor_id(); - spin_lock(&__get_cpu_var(ip6_tables_lock)); + xt_tlock_lock(); loc_cpu_entry =3D private->entries[curcpu]; IP6T_ENTRY_ITERATE(loc_cpu_entry, private->size, add_counter_to_entry, paddc, &i); - spin_unlock(&__get_cpu_var(ip6_tables_lock)); + xt_tlock_unlock(); local_bh_enable(); =20 unlock_up_free: @@ -2248,10 +2245,7 @@ static struct pernet_operations ip6_tables_net_o= ps =3D { =20 static int __init ip6_tables_init(void) { - int cpu, ret; - - for_each_possible_cpu(cpu) - spin_lock_init(&per_cpu(ip6_tables_lock, cpu)); + int ret; =20 ret =3D register_pernet_subsys(&ip6_tables_net_ops); if (ret < 0) diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 0d94020..f2ad79f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -670,19 +670,21 @@ xt_replace_table(struct xt_table *table, { struct xt_table_info *oldinfo, *private; =20 + xt_tlock_lockall(); /* Do the substitution. */ private =3D table->private; /* Check inside lock: is the old number correct? */ if (num_counters !=3D private->number) { duprintf("num_counters !=3D table->private->number (%u/%u)\n", num_counters, private->number); + xt_tlock_unlockall(); *error =3D -EAGAIN; return NULL; } oldinfo =3D private; table->private =3D newinfo; newinfo->initial_entries =3D oldinfo->initial_entries; - + xt_tlock_unlockall(); return oldinfo; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -1126,9 +1128,58 @@ static struct pernet_operations xt_net_ops =3D { .init =3D xt_net_init, }; =20 +static DEFINE_PER_CPU(spinlock_t, xt_tables_lock); + +void xt_tlock_lockall(void) +{ + int cpu; + + local_bh_disable(); + preempt_disable(); + for_each_possible_cpu(cpu) { + spin_lock(&per_cpu(xt_tables_lock, cpu)); + /* + * avoid preempt counter overflow + */ + preempt_enable_no_resched(); + } +} +EXPORT_SYMBOL(xt_tlock_lockall); + +void xt_tlock_unlockall(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + preempt_disable(); + spin_unlock(&per_cpu(xt_tables_lock, cpu)); + } + preempt_enable(); + local_bh_enable(); +} +EXPORT_SYMBOL(xt_tlock_unlockall); + +/* + * preemption should be disabled by caller + */ +void xt_tlock_lock(void) +{ + spin_lock(&__get_cpu_var(xt_tables_lock)); +} +EXPORT_SYMBOL(xt_tlock_lock); + +void xt_tlock_unlock(void) +{ + spin_unlock(&__get_cpu_var(xt_tables_lock)); +} +EXPORT_SYMBOL(xt_tlock_unlock); + static int __init xt_init(void) { - int i, rv; + int i, rv, cpu; + + for_each_possible_cpu(cpu) + spin_lock_init(&per_cpu(xt_tables_lock, cpu)); =20 xt =3D kmalloc(sizeof(struct xt_af) * NFPROTO_NUMPROTO, GFP_KERNEL); if (!xt)