* [RFT 1/4] netfilter: change elements in x_tables
2009-01-27 23:53 [RFT 0/4] Iptables rwlock elimination Stephen Hemminger
@ 2009-01-27 23:53 ` Stephen Hemminger
2009-01-27 23:53 ` [RFT 2/4] netfilter: remove unneeded initializations Stephen Hemminger
` (2 subsequent siblings)
3 siblings, 0 replies; 11+ messages in thread
From: Stephen Hemminger @ 2009-01-27 23:53 UTC (permalink / raw)
To: David Miller, Patrick McHardy; +Cc: netdev, netfilter-devel
[-- Attachment #1: x_tables.patch --]
[-- Type: text/plain, Size: 1062 bytes --]
Change to proper type on private pointer rather than anonymous void.
Keep active elements on same cache line.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
include/linux/netfilter/x_tables.h | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
--- a/include/linux/netfilter/x_tables.h 2009-01-26 17:24:43.251543415 -0800
+++ b/include/linux/netfilter/x_tables.h 2009-01-26 17:29:12.510649107 -0800
@@ -349,9 +349,6 @@ struct xt_table
{
struct list_head list;
- /* A unique name... */
- const char name[XT_TABLE_MAXNAMELEN];
-
/* What hooks you will enter on */
unsigned int valid_hooks;
@@ -359,13 +356,15 @@ struct xt_table
rwlock_t lock;
/* Man behind the curtain... */
- //struct ip6t_table_info *private;
- void *private;
+ struct xt_table_info *private;
/* Set this to THIS_MODULE if you are a module, otherwise NULL */
struct module *me;
u_int8_t af; /* address/protocol family */
+
+ /* A unique name... */
+ const char name[XT_TABLE_MAXNAMELEN];
};
#include <linux/netfilter_ipv4.h>
--
^ permalink raw reply [flat|nested] 11+ messages in thread
* [RFT 2/4] netfilter: remove unneeded initializations
2009-01-27 23:53 [RFT 0/4] Iptables rwlock elimination Stephen Hemminger
2009-01-27 23:53 ` [RFT 1/4] netfilter: change elements in x_tables Stephen Hemminger
@ 2009-01-27 23:53 ` Stephen Hemminger
2009-01-28 0:10 ` Alexey Dobriyan
2009-01-27 23:53 ` [RFT 3/4] netfilter: use sequence number synchronization for counters Stephen Hemminger
2009-01-27 23:53 ` [RFT 4/4] netfilter: convert x_tables to use RCU Stephen Hemminger
3 siblings, 1 reply; 11+ messages in thread
From: Stephen Hemminger @ 2009-01-27 23:53 UTC (permalink / raw)
To: David Miller, Patrick McHardy; +Cc: netdev, netfilter-devel
[-- Attachment #1: iptables-lock-init.patch --]
[-- Type: text/plain, Size: 4593 bytes --]
Later patches change the locking on xt_table and the initialization of
the lock element is not needed since the lock is always initialized in
xt_table_register anyway.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
net/ipv4/netfilter/arptable_filter.c | 2 --
net/ipv4/netfilter/iptable_filter.c | 1 -
net/ipv4/netfilter/iptable_mangle.c | 1 -
net/ipv4/netfilter/iptable_raw.c | 1 -
net/ipv4/netfilter/iptable_security.c | 1 -
net/ipv4/netfilter/nf_nat_rule.c | 1 -
net/ipv6/netfilter/ip6table_filter.c | 1 -
net/ipv6/netfilter/ip6table_mangle.c | 1 -
net/ipv6/netfilter/ip6table_raw.c | 1 -
net/ipv6/netfilter/ip6table_security.c | 1 -
10 files changed, 11 deletions(-)
--- a/net/ipv4/netfilter/arptable_filter.c 2009-01-26 17:24:43.687542005 -0800
+++ b/net/ipv4/netfilter/arptable_filter.c 2009-01-26 19:50:37.891042244 -0800
@@ -48,8 +48,6 @@ static struct
static struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_filter.lock),
- .private = NULL,
.me = THIS_MODULE,
.af = NFPROTO_ARP,
};
--- a/net/ipv4/netfilter/iptable_filter.c 2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_filter.c 2009-01-26 19:50:37.891042244 -0800
@@ -56,7 +56,6 @@ static struct
static struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_filter.lock),
.me = THIS_MODULE,
.af = AF_INET,
};
--- a/net/ipv4/netfilter/iptable_raw.c 2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_raw.c 2009-01-26 19:50:37.891042244 -0800
@@ -39,7 +39,6 @@ static struct
static struct xt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_raw.lock),
.me = THIS_MODULE,
.af = AF_INET,
};
--- a/net/ipv4/netfilter/iptable_security.c 2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_security.c 2009-01-26 19:50:37.891042244 -0800
@@ -60,7 +60,6 @@ static struct
static struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(security_table.lock),
.me = THIS_MODULE,
.af = AF_INET,
};
--- a/net/ipv4/netfilter/nf_nat_rule.c 2009-01-26 17:24:43.695541481 -0800
+++ b/net/ipv4/netfilter/nf_nat_rule.c 2009-01-26 19:51:20.338030618 -0800
@@ -61,7 +61,6 @@ static struct
static struct xt_table nat_table = {
.name = "nat",
.valid_hooks = NAT_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(nat_table.lock),
.me = THIS_MODULE,
.af = AF_INET,
};
--- a/net/ipv6/netfilter/ip6table_filter.c 2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_filter.c 2009-01-26 19:50:37.895044361 -0800
@@ -54,7 +54,6 @@ static struct
static struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_filter.lock),
.me = THIS_MODULE,
.af = AF_INET6,
};
--- a/net/ipv6/netfilter/ip6table_mangle.c 2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_mangle.c 2009-01-26 19:50:37.895044361 -0800
@@ -60,7 +60,6 @@ static struct
static struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_mangler.lock),
.me = THIS_MODULE,
.af = AF_INET6,
};
--- a/net/ipv6/netfilter/ip6table_raw.c 2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_raw.c 2009-01-26 19:50:37.895044361 -0800
@@ -38,7 +38,6 @@ static struct
static struct xt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_raw.lock),
.me = THIS_MODULE,
.af = AF_INET6,
};
--- a/net/ipv6/netfilter/ip6table_security.c 2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_security.c 2009-01-26 19:50:37.895044361 -0800
@@ -59,7 +59,6 @@ static struct
static struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(security_table.lock),
.me = THIS_MODULE,
.af = AF_INET6,
};
--- a/net/ipv4/netfilter/iptable_mangle.c 2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_mangle.c 2009-01-26 19:50:37.895044361 -0800
@@ -67,7 +67,6 @@ static struct
static struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_mangler.lock),
.me = THIS_MODULE,
.af = AF_INET,
};
--
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFT 2/4] netfilter: remove unneeded initializations
2009-01-27 23:53 ` [RFT 2/4] netfilter: remove unneeded initializations Stephen Hemminger
@ 2009-01-28 0:10 ` Alexey Dobriyan
0 siblings, 0 replies; 11+ messages in thread
From: Alexey Dobriyan @ 2009-01-28 0:10 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, Patrick McHardy, netdev, netfilter-devel
On Tue, Jan 27, 2009 at 03:53:12PM -0800, Stephen Hemminger wrote:
> Later patches change the locking on xt_table and the initialization of
> the lock element is not needed since the lock is always initialized in
> xt_table_register anyway.
> net/ipv4/netfilter/arptable_filter.c | 2 --
> net/ipv4/netfilter/iptable_filter.c | 1 -
> net/ipv4/netfilter/iptable_mangle.c | 1 -
> net/ipv4/netfilter/iptable_raw.c | 1 -
> net/ipv4/netfilter/iptable_security.c | 1 -
> net/ipv4/netfilter/nf_nat_rule.c | 1 -
> net/ipv6/netfilter/ip6table_filter.c | 1 -
> net/ipv6/netfilter/ip6table_mangle.c | 1 -
> net/ipv6/netfilter/ip6table_raw.c | 1 -
> net/ipv6/netfilter/ip6table_security.c | 1 -
> 10 files changed, 11 deletions(-)
>
> --- a/net/ipv4/netfilter/arptable_filter.c
> +++ b/net/ipv4/netfilter/arptable_filter.c
> @@ -48,8 +48,6 @@ static struct
> static struct xt_table packet_filter = {
> .name = "filter",
> .valid_hooks = FILTER_VALID_HOOKS,
> - .lock = __RW_LOCK_UNLOCKED(packet_filter.lock),
ACK and you can do the same for ebt_register_table() users.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [RFT 3/4] netfilter: use sequence number synchronization for counters
2009-01-27 23:53 [RFT 0/4] Iptables rwlock elimination Stephen Hemminger
2009-01-27 23:53 ` [RFT 1/4] netfilter: change elements in x_tables Stephen Hemminger
2009-01-27 23:53 ` [RFT 2/4] netfilter: remove unneeded initializations Stephen Hemminger
@ 2009-01-27 23:53 ` Stephen Hemminger
2009-01-28 6:17 ` Eric Dumazet
2009-01-27 23:53 ` [RFT 4/4] netfilter: convert x_tables to use RCU Stephen Hemminger
3 siblings, 1 reply; 11+ messages in thread
From: Stephen Hemminger @ 2009-01-27 23:53 UTC (permalink / raw)
To: David Miller, Patrick McHardy; +Cc: netdev, netfilter-devel
[-- Attachment #1: counters-seqcount.patch --]
[-- Type: text/plain, Size: 7240 bytes --]
Change how synchronization is done on the iptables counters. Use seqcount
wrapper instead of depending on reader/writer lock.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
include/linux/netfilter/x_tables.h | 2 +-
include/linux/netfilter_arp/arp_tables.h | 3 +++
include/linux/netfilter_ipv4/ip_tables.h | 3 +++
include/linux/netfilter_ipv6/ip6_tables.h | 3 +++
net/ipv4/netfilter/arp_tables.c | 20 +++++++++++++-------
net/ipv4/netfilter/ip_tables.c | 20 +++++++++++++-------
net/ipv6/netfilter/ip6_tables.c | 20 +++++++++++++-------
net/netfilter/x_tables.c | 1 +
8 files changed, 50 insertions(+), 22 deletions(-)
--- a/include/linux/netfilter_ipv6/ip6_tables.h 2009-01-27 15:03:02.843376881 -0800
+++ b/include/linux/netfilter_ipv6/ip6_tables.h 2009-01-27 15:37:38.935377810 -0800
@@ -103,6 +103,9 @@ struct ip6t_entry
/* Back pointer */
unsigned int comefrom;
+ /* Update of counter synchronization */
+ seqcount_t seq;
+
/* Packet and byte counters. */
struct xt_counters counters;
--- a/net/ipv4/netfilter/arp_tables.c 2009-01-27 14:48:41.579877551 -0800
+++ b/net/ipv4/netfilter/arp_tables.c 2009-01-27 15:45:34.566650540 -0800
@@ -256,7 +256,9 @@ unsigned int arpt_do_table(struct sk_buf
hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
(2 * skb->dev->addr_len);
+ write_seqcount_begin(&e->seq);
ADD_COUNTER(e->counters, hdr_len, 1);
+ write_seqcount_end(&e->seq);
t = arpt_get_target(e);
@@ -549,6 +551,7 @@ static inline int check_entry_size_and_h
< 0 (not ARPT_RETURN). --RR */
/* Clear counters and comefrom */
+ seqcount_init(&e->seq);
e->counters = ((struct xt_counters) { 0, 0 });
e->comefrom = 0;
@@ -703,14 +706,17 @@ static void get_counters(const struct xt
&i);
for_each_possible_cpu(cpu) {
+ struct arpt_entry *e = t->entries[cpu];
+ unsigned int start;
+
if (cpu == curcpu)
continue;
i = 0;
- ARPT_ENTRY_ITERATE(t->entries[cpu],
- t->size,
- add_entry_to_counter,
- counters,
- &i);
+ do {
+ start = read_seqcount_begin(&e->seq);
+ ARPT_ENTRY_ITERATE(t->entries[cpu], t->size,
+ add_entry_to_counter, counters, &i);
+ } while (read_seqcount_retry(&e->seq, start));
}
}
@@ -731,9 +737,9 @@ static inline struct xt_counters *alloc_
return ERR_PTR(-ENOMEM);
/* First, sum counters... */
- write_lock_bh(&table->lock);
+ local_bh_disable();
get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ local_bh_enable();
return counters;
}
--- a/net/ipv4/netfilter/ip_tables.c 2009-01-27 14:48:41.567879095 -0800
+++ b/net/ipv4/netfilter/ip_tables.c 2009-01-27 15:45:05.766673246 -0800
@@ -366,7 +366,9 @@ ipt_do_table(struct sk_buff *skb,
if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
goto no_match;
+ write_seqcount_begin(&e->seq);
ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+ write_seqcount_end(&e->seq);
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -758,6 +760,7 @@ check_entry_size_and_hooks(struct ipt_en
< 0 (not IPT_RETURN). --RR */
/* Clear counters and comefrom */
+ seqcount_init(&e->seq);
e->counters = ((struct xt_counters) { 0, 0 });
e->comefrom = 0;
@@ -915,14 +918,17 @@ get_counters(const struct xt_table_info
&i);
for_each_possible_cpu(cpu) {
+ struct ipt_entry *e = t->entries[cpu];
+ unsigned int start;
+
if (cpu == curcpu)
continue;
i = 0;
- IPT_ENTRY_ITERATE(t->entries[cpu],
- t->size,
- add_entry_to_counter,
- counters,
- &i);
+ do {
+ start = read_seqcount_begin(&e->seq);
+ IPT_ENTRY_ITERATE(e, t->size,
+ add_entry_to_counter, counters, &i);
+ } while (read_seqcount_retry(&e->seq, start));
}
}
@@ -942,9 +948,9 @@ static struct xt_counters * alloc_counte
return ERR_PTR(-ENOMEM);
/* First, sum counters... */
- write_lock_bh(&table->lock);
+ local_bh_disable();
get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ local_bh_enable();
return counters;
}
--- a/net/ipv6/netfilter/ip6_tables.c 2009-01-27 14:48:41.603877653 -0800
+++ b/net/ipv6/netfilter/ip6_tables.c 2009-01-27 15:45:22.262639173 -0800
@@ -392,9 +392,11 @@ ip6t_do_table(struct sk_buff *skb,
if (IP6T_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
goto no_match;
+ write_seqcount_begin(&e->seq);
ADD_COUNTER(e->counters,
ntohs(ipv6_hdr(skb)->payload_len) +
sizeof(struct ipv6hdr), 1);
+ write_seqcount_end(&e->seq);
t = ip6t_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -787,6 +789,7 @@ check_entry_size_and_hooks(struct ip6t_e
< 0 (not IP6T_RETURN). --RR */
/* Clear counters and comefrom */
+ seqcount_init(&e->seq);
e->counters = ((struct xt_counters) { 0, 0 });
e->comefrom = 0;
@@ -944,14 +947,17 @@ get_counters(const struct xt_table_info
&i);
for_each_possible_cpu(cpu) {
+ struct ip6t_entry *e = t->entries[cpu];
+ unsigned int start;
+
if (cpu == curcpu)
continue;
i = 0;
- IP6T_ENTRY_ITERATE(t->entries[cpu],
- t->size,
- add_entry_to_counter,
- counters,
- &i);
+ do {
+ start = read_seqcount_begin(&e->seq);
+ IP6T_ENTRY_ITERATE(e, t->size,
+ add_entry_to_counter, counters, &i);
+ } while (read_seqcount_retry(&e->seq, start));
}
}
@@ -971,9 +977,9 @@ static struct xt_counters *alloc_counter
return ERR_PTR(-ENOMEM);
/* First, sum counters... */
- write_lock_bh(&table->lock);
+ local_bh_disable();
get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ local_bh_enable();
return counters;
}
--- a/include/linux/netfilter_ipv4/ip_tables.h 2009-01-27 15:02:24.367376923 -0800
+++ b/include/linux/netfilter_ipv4/ip_tables.h 2009-01-27 15:16:43.940902866 -0800
@@ -91,6 +91,9 @@ struct ipt_entry
/* Back pointer */
unsigned int comefrom;
+ /* Update of counter synchronization */
+ seqcount_t seq;
+
/* Packet and byte counters. */
struct xt_counters counters;
--- a/net/netfilter/x_tables.c 2009-01-27 15:06:05.822878866 -0800
+++ b/net/netfilter/x_tables.c 2009-01-27 15:14:06.004743434 -0800
@@ -720,6 +720,7 @@ struct xt_table *xt_register_table(struc
/* Simplifies replace_table code. */
table->private = bootstrap;
rwlock_init(&table->lock);
+
if (!xt_replace_table(table, 0, newinfo, &ret))
goto unlock;
--- a/include/linux/netfilter/x_tables.h 2009-01-27 15:01:04.420377356 -0800
+++ b/include/linux/netfilter/x_tables.h 2009-01-27 15:33:10.791377313 -0800
@@ -385,7 +385,7 @@ struct xt_table_info
/* ipt_entry tables: one per CPU */
/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
- char *entries[1];
+ void *entries[1];
};
#define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
--- a/include/linux/netfilter_arp/arp_tables.h 2009-01-27 15:35:33.827376817 -0800
+++ b/include/linux/netfilter_arp/arp_tables.h 2009-01-27 15:36:48.919127941 -0800
@@ -99,6 +99,9 @@ struct arpt_entry
/* Back pointer */
unsigned int comefrom;
+ /* Update of counter synchronization */
+ seqcount_t seq;
+
/* Packet and byte counters. */
struct xt_counters counters;
--
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFT 3/4] netfilter: use sequence number synchronization for counters
2009-01-27 23:53 ` [RFT 3/4] netfilter: use sequence number synchronization for counters Stephen Hemminger
@ 2009-01-28 6:17 ` Eric Dumazet
2009-01-28 6:28 ` Stephen Hemminger
0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2009-01-28 6:17 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, Patrick McHardy, netdev, netfilter-devel
Stephen Hemminger a écrit :
> Change how synchronization is done on the iptables counters. Use seqcount
> wrapper instead of depending on reader/writer lock.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
>
>
> --- a/net/ipv4/netfilter/ip_tables.c 2009-01-27 14:48:41.567879095 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c 2009-01-27 15:45:05.766673246 -0800
> @@ -366,7 +366,9 @@ ipt_do_table(struct sk_buff *skb,
> if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
> goto no_match;
>
> + write_seqcount_begin(&e->seq);
> ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
> + write_seqcount_end(&e->seq);
>
Its not very good to do it like this, (one seqcount_t per rule per cpu)
>
> t = ipt_get_target(e);
> IP_NF_ASSERT(t->u.kernel.target);
> @@ -758,6 +760,7 @@ check_entry_size_and_hooks(struct ipt_en
> < 0 (not IPT_RETURN). --RR */
>
> /* Clear counters and comefrom */
> + seqcount_init(&e->seq);
> e->counters = ((struct xt_counters) { 0, 0 });
> e->comefrom = 0;
>
> @@ -915,14 +918,17 @@ get_counters(const struct xt_table_info
> &i);
>
> for_each_possible_cpu(cpu) {
> + struct ipt_entry *e = t->entries[cpu];
> + unsigned int start;
> +
> if (cpu == curcpu)
> continue;
> i = 0;
> - IPT_ENTRY_ITERATE(t->entries[cpu],
> - t->size,
> - add_entry_to_counter,
> - counters,
> - &i);
> + do {
> + start = read_seqcount_begin(&e->seq);
> + IPT_ENTRY_ITERATE(e, t->size,
> + add_entry_to_counter, counters, &i);
> + } while (read_seqcount_retry(&e->seq, start));
>
This will never complete on a loaded machine and a big set of rules.
When we reach the end of IPT_ENTRY_ITERATE, we notice many packets came
while doing the iteration and restart,
with wrong accumulated values (no rollback of what was done to accumulator)
You want to do the seqcount_begin/end in the leaf function
(add_entry_to_counter()), and make accumulate a value pair (bytes/counter)
only once you are sure they are correct.
Using one seqcount_t per rule (struct ipt_entry) is very expensive.
(This is 4 bytes per rule X num_possible_cpus())
You need one seqcount_t per cpu
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFT 3/4] netfilter: use sequence number synchronization for counters
2009-01-28 6:17 ` Eric Dumazet
@ 2009-01-28 6:28 ` Stephen Hemminger
2009-01-28 6:35 ` Eric Dumazet
0 siblings, 1 reply; 11+ messages in thread
From: Stephen Hemminger @ 2009-01-28 6:28 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, Patrick McHardy, netdev, netfilter-devel
On Wed, 28 Jan 2009 07:17:04 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:
> Stephen Hemminger a écrit :
> > Change how synchronization is done on the iptables counters. Use seqcount
> > wrapper instead of depending on reader/writer lock.
> >
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> >
> >
> >
> > --- a/net/ipv4/netfilter/ip_tables.c 2009-01-27 14:48:41.567879095 -0800
> > +++ b/net/ipv4/netfilter/ip_tables.c 2009-01-27 15:45:05.766673246 -0800
> > @@ -366,7 +366,9 @@ ipt_do_table(struct sk_buff *skb,
> > if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
> > goto no_match;
> >
> > + write_seqcount_begin(&e->seq);
> > ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
> > + write_seqcount_end(&e->seq);
> >
> Its not very good to do it like this, (one seqcount_t per rule per cpu)
If we use one count per table, that solves it, but it becomes a hot
spot, and on an active machine will never settle.
> >
> > t = ipt_get_target(e);
> > IP_NF_ASSERT(t->u.kernel.target);
> > @@ -758,6 +760,7 @@ check_entry_size_and_hooks(struct ipt_en
> > < 0 (not IPT_RETURN). --RR */
> >
> > /* Clear counters and comefrom */
> > + seqcount_init(&e->seq);
> > e->counters = ((struct xt_counters) { 0, 0 });
> > e->comefrom = 0;
> >
> > @@ -915,14 +918,17 @@ get_counters(const struct xt_table_info
> > &i);
> >
> > for_each_possible_cpu(cpu) {
> > + struct ipt_entry *e = t->entries[cpu];
> > + unsigned int start;
> > +
> > if (cpu == curcpu)
> > continue;
> > i = 0;
> > - IPT_ENTRY_ITERATE(t->entries[cpu],
> > - t->size,
> > - add_entry_to_counter,
> > - counters,
> > - &i);
> > + do {
> > + start = read_seqcount_begin(&e->seq);
> > + IPT_ENTRY_ITERATE(e, t->size,
> > + add_entry_to_counter, counters, &i);
> > + } while (read_seqcount_retry(&e->seq, start));
> >
> This will never complete on a loaded machine and a big set of rules.
> When we reach the end of IPT_ENTRY_ITERATE, we notice many packets came
> while doing the iteration and restart,
> with wrong accumulated values (no rollback of what was done to accumulator)
>
> You want to do the seqcount_begin/end in the leaf function
> (add_entry_to_counter()), and make accumulate a value pair (bytes/counter)
> only once you are sure they are correct.
>
> Using one seqcount_t per rule (struct ipt_entry) is very expensive.
> (This is 4 bytes per rule X num_possible_cpus())
>
> You need one seqcount_t per cpu
The other option would be swapping counters and using rcu, but that adds lots of
RCU synchronization, and RCU sync overhead only seems to be growing.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFT 3/4] netfilter: use sequence number synchronization for counters
2009-01-28 6:28 ` Stephen Hemminger
@ 2009-01-28 6:35 ` Eric Dumazet
2009-01-28 16:15 ` Patrick McHardy
0 siblings, 1 reply; 11+ messages in thread
From: Eric Dumazet @ 2009-01-28 6:35 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, Patrick McHardy, netdev, netfilter-devel
Stephen Hemminger a écrit :
> On Wed, 28 Jan 2009 07:17:04 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
>
>
>> Stephen Hemminger a écrit :
>>
>>> Change how synchronization is done on the iptables counters. Use seqcount
>>> wrapper instead of depending on reader/writer lock.
>>>
>>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>>
>>>
>>>
>>> --- a/net/ipv4/netfilter/ip_tables.c 2009-01-27 14:48:41.567879095 -0800
>>> +++ b/net/ipv4/netfilter/ip_tables.c 2009-01-27 15:45:05.766673246 -0800
>>> @@ -366,7 +366,9 @@ ipt_do_table(struct sk_buff *skb,
>>> if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
>>> goto no_match;
>>>
>>> + write_seqcount_begin(&e->seq);
>>> ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
>>> + write_seqcount_end(&e->seq);
>>>
>>>
>> Its not very good to do it like this, (one seqcount_t per rule per cpu)
>>
>
> If we use one count per table, that solves it, but it becomes a hot
> spot, and on an active machine will never settle.
>
>
One seqcount per table and per cpu.
Only one cpu (the owner) will need to change the seqcount (one increment
when entering ipt_do_table(), one increment when leaving)
This location is only read by the thread doing the "iptables -L". We
dont care it spends a few cycles, it's already a big cruncher.
I dont understand your concern, what do you mean by "never settle" ?
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFT 3/4] netfilter: use sequence number synchronization for counters
2009-01-28 6:35 ` Eric Dumazet
@ 2009-01-28 16:15 ` Patrick McHardy
0 siblings, 0 replies; 11+ messages in thread
From: Patrick McHardy @ 2009-01-28 16:15 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Stephen Hemminger, David Miller, netdev, netfilter-devel
Eric Dumazet wrote:
> Stephen Hemminger a écrit :
>>>> --- a/net/ipv4/netfilter/ip_tables.c 2009-01-27
>>>> 14:48:41.567879095 -0800
>>>> +++ b/net/ipv4/netfilter/ip_tables.c 2009-01-27
>>>> 15:45:05.766673246 -0800
>>>> @@ -366,7 +366,9 @@ ipt_do_table(struct sk_buff *skb,
>>>> if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
>>>> goto no_match;
>>>>
>>>> + write_seqcount_begin(&e->seq);
>>>> ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
>>>> + write_seqcount_end(&e->seq);
>>>>
>>> Its not very good to do it like this, (one seqcount_t per rule per cpu)
>>>
>>
>> If we use one count per table, that solves it, but it becomes a hot
>> spot, and on an active machine will never settle.
>>
>>
> One seqcount per table and per cpu.
> Only one cpu (the owner) will need to change the seqcount (one increment
> when entering ipt_do_table(), one increment when leaving)
That would also make sure the counters add up, right?
> This location is only read by the thread doing the "iptables -L". We
> dont care it spends a few cycles, it's already a big cruncher.
Indeed.
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 11+ messages in thread
* [RFT 4/4] netfilter: convert x_tables to use RCU
2009-01-27 23:53 [RFT 0/4] Iptables rwlock elimination Stephen Hemminger
` (2 preceding siblings ...)
2009-01-27 23:53 ` [RFT 3/4] netfilter: use sequence number synchronization for counters Stephen Hemminger
@ 2009-01-27 23:53 ` Stephen Hemminger
2009-01-28 7:37 ` Paul E. McKenney
3 siblings, 1 reply; 11+ messages in thread
From: Stephen Hemminger @ 2009-01-27 23:53 UTC (permalink / raw)
To: David Miller, Patrick McHardy; +Cc: netdev, netfilter-devel
[-- Attachment #1: iptables-rcu.patch --]
[-- Type: text/plain, Size: 8228 bytes --]
Replace existing reader/writer lock with Read-Copy-Update to
eliminate the overhead of a read lock on each incoming packet.
This should reduce the overhead of iptables especially on SMP
systems.
The previous code used a reader-writer lock for two purposes.
The first was to ensure that the xt_table_info reference was not in
process of being changed. Since xt_table_info is only freed via one
routine, it was a direct conversion to RCU.
The other use of the reader-writer lock was to block changes
to counters while they were being read. This synchronization was
fixed by the previous patch. But still need to make sure table info
isn't going away. There are a couple of harmless races in reading
the counters, but we always give valid values; they just aren't
exactly frozen in time across all cpu's.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
include/linux/netfilter/x_tables.h | 10 ++++++--
net/ipv4/netfilter/arp_tables.c | 12 +++++-----
net/ipv4/netfilter/ip_tables.c | 12 +++++-----
net/ipv6/netfilter/ip6_tables.c | 12 +++++-----
net/netfilter/x_tables.c | 44 ++++++++++++++++++++++++++-----------
5 files changed, 58 insertions(+), 32 deletions(-)
--- a/include/linux/netfilter/x_tables.h 2009-01-27 15:33:10.791377313 -0800
+++ b/include/linux/netfilter/x_tables.h 2009-01-27 15:46:26.024128742 -0800
@@ -352,8 +352,8 @@ struct xt_table
/* What hooks you will enter on */
unsigned int valid_hooks;
- /* Lock for the curtain */
- rwlock_t lock;
+ /* Lock for curtain */
+ spinlock_t lock;
/* Man behind the curtain... */
struct xt_table_info *private;
@@ -383,6 +383,12 @@ struct xt_table_info
unsigned int hook_entry[NF_INET_NUMHOOKS];
unsigned int underflow[NF_INET_NUMHOOKS];
+ /* For the dustman... */
+ union {
+ struct rcu_head rcu;
+ struct work_struct work;
+ };
+
/* ipt_entry tables: one per CPU */
/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
void *entries[1];
--- a/net/ipv4/netfilter/arp_tables.c 2009-01-27 15:45:34.566650540 -0800
+++ b/net/ipv4/netfilter/arp_tables.c 2009-01-27 15:46:26.024128742 -0800
@@ -237,8 +237,8 @@ unsigned int arpt_do_table(struct sk_buf
indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname;
- read_lock_bh(&table->lock);
- private = table->private;
+ rcu_read_lock_bh();
+ private = rcu_dereference(table->private);
table_base = (void *)private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
back = get_entry(table_base, private->underflow[hook]);
@@ -313,7 +313,7 @@ unsigned int arpt_do_table(struct sk_buf
e = (void *)e + e->next_offset;
}
} while (!hotdrop);
- read_unlock_bh(&table->lock);
+ rcu_read_unlock_bh();
if (hotdrop)
return NF_DROP;
@@ -1154,8 +1154,8 @@ static int do_add_counters(struct net *n
goto free;
}
- write_lock_bh(&t->lock);
- private = t->private;
+ rcu_read_lock_bh();
+ private = rcu_dereference(t->private);
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
@@ -1170,7 +1170,7 @@ static int do_add_counters(struct net *n
paddc,
&i);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ rcu_read_unlock_bh();
xt_table_unlock(t);
module_put(t->me);
free:
--- a/net/ipv4/netfilter/ip_tables.c 2009-01-27 15:45:05.766673246 -0800
+++ b/net/ipv4/netfilter/ip_tables.c 2009-01-27 15:46:26.024128742 -0800
@@ -347,9 +347,9 @@ ipt_do_table(struct sk_buff *skb,
mtpar.family = tgpar.family = NFPROTO_IPV4;
tgpar.hooknum = hook;
- read_lock_bh(&table->lock);
+ rcu_read_lock_bh();
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- private = table->private;
+ private = rcu_dereference(table->private);
table_base = (void *)private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -447,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
}
} while (!hotdrop);
- read_unlock_bh(&table->lock);
+ rcu_read_unlock_bh();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -1399,8 +1399,8 @@ do_add_counters(struct net *net, void __
goto free;
}
- write_lock_bh(&t->lock);
- private = t->private;
+ rcu_read_lock_bh();
+ private = rcu_dereference(t->private);
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
@@ -1415,7 +1415,7 @@ do_add_counters(struct net *net, void __
paddc,
&i);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ rcu_read_unlock_bh();
xt_table_unlock(t);
module_put(t->me);
free:
--- a/net/ipv6/netfilter/ip6_tables.c 2009-01-27 15:45:22.262639173 -0800
+++ b/net/ipv6/netfilter/ip6_tables.c 2009-01-27 15:46:26.028128353 -0800
@@ -373,9 +373,9 @@ ip6t_do_table(struct sk_buff *skb,
mtpar.family = tgpar.family = NFPROTO_IPV6;
tgpar.hooknum = hook;
- read_lock_bh(&table->lock);
+ rcu_read_lock_bh();
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- private = table->private;
+ private = rcu_dereference(table->private);
table_base = (void *)private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -476,7 +476,7 @@ ip6t_do_table(struct sk_buff *skb,
#ifdef CONFIG_NETFILTER_DEBUG
((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
#endif
- read_unlock_bh(&table->lock);
+ rcu_read_unlock_bh();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -1430,8 +1430,8 @@ do_add_counters(struct net *net, void __
goto free;
}
- write_lock_bh(&t->lock);
- private = t->private;
+ rcu_read_lock_bh();
+ private = rcu_dereference(t->private);
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
@@ -1446,7 +1446,7 @@ do_add_counters(struct net *net, void __
paddc,
&i);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ rcu_read_unlock_bh();
xt_table_unlock(t);
module_put(t->me);
free:
--- a/net/netfilter/x_tables.c 2009-01-27 15:14:06.004743434 -0800
+++ b/net/netfilter/x_tables.c 2009-01-27 15:48:07.194667206 -0800
@@ -611,17 +611,37 @@ struct xt_table_info *xt_alloc_table_inf
}
EXPORT_SYMBOL(xt_alloc_table_info);
-void xt_free_table_info(struct xt_table_info *info)
+/* callback to do free for vmalloc'd case */
+static void xt_free_table_info_work(struct work_struct *arg)
+{
+ struct xt_table_info *info = container_of(arg, struct xt_table_info, work);
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ vfree(info->entries[cpu]);
+ kfree(info);
+}
+
+/* callback to do free after all cpu's are done */
+static void xt_free_table_info_rcu(struct rcu_head *arg)
{
- int cpu;
+ struct xt_table_info *info = container_of(arg, struct xt_table_info, rcu);
- for_each_possible_cpu(cpu) {
- if (info->size <= PAGE_SIZE)
+ if (info->size <= PAGE_SIZE) {
+ unsigned int cpu;
+ for_each_possible_cpu(cpu)
kfree(info->entries[cpu]);
- else
- vfree(info->entries[cpu]);
+ kfree(info);
+ } else {
+ /* can't safely call vfree in current context */
+ INIT_WORK(&info->work, xt_free_table_info_work);
+ schedule_work(&info->work);
}
- kfree(info);
+}
+
+void xt_free_table_info(struct xt_table_info *info)
+{
+ call_rcu(&info->rcu, xt_free_table_info_rcu);
}
EXPORT_SYMBOL(xt_free_table_info);
@@ -671,20 +691,20 @@ xt_replace_table(struct xt_table *table,
struct xt_table_info *oldinfo, *private;
/* Do the substitution. */
- write_lock_bh(&table->lock);
+ spin_lock_bh(&table->lock);
private = table->private;
/* Check inside lock: is the old number correct? */
if (num_counters != private->number) {
duprintf("num_counters != table->private->number (%u/%u)\n",
num_counters, private->number);
- write_unlock_bh(&table->lock);
+ spin_unlock_bh(&table->lock);
*error = -EAGAIN;
return NULL;
}
oldinfo = private;
- table->private = newinfo;
+ rcu_assign_pointer(table->private, newinfo);
newinfo->initial_entries = oldinfo->initial_entries;
- write_unlock_bh(&table->lock);
+ spin_unlock_bh(&table->lock);
return oldinfo;
}
@@ -719,7 +739,7 @@ struct xt_table *xt_register_table(struc
/* Simplifies replace_table code. */
table->private = bootstrap;
- rwlock_init(&table->lock);
+ spin_lock_init(&table->lock);
if (!xt_replace_table(table, 0, newinfo, &ret))
goto unlock;
--
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [RFT 4/4] netfilter: convert x_tables to use RCU
2009-01-27 23:53 ` [RFT 4/4] netfilter: convert x_tables to use RCU Stephen Hemminger
@ 2009-01-28 7:37 ` Paul E. McKenney
0 siblings, 0 replies; 11+ messages in thread
From: Paul E. McKenney @ 2009-01-28 7:37 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, Patrick McHardy, netdev, netfilter-devel
On Tue, Jan 27, 2009 at 03:53:14PM -0800, Stephen Hemminger wrote:
> Replace existing reader/writer lock with Read-Copy-Update to
> eliminate the overhead of a read lock on each incoming packet.
> This should reduce the overhead of iptables especially on SMP
> systems.
>
> The previous code used a reader-writer lock for two purposes.
> The first was to ensure that the xt_table_info reference was not in
> process of being changed. Since xt_table_info is only freed via one
> routine, it was a direct conversion to RCU.
>
> The other use of the reader-writer lock was to block changes
> to counters while they were being read. This synchronization was
> fixed by the previous patch, but we still need to make sure the
> table info isn't going away. There are a couple of harmless races
> in reading the counters, but we always give valid values; they just
> aren't exactly frozen in time across all CPUs.
Looks good from an RCU perspective!
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
>
> ---
> include/linux/netfilter/x_tables.h | 10 ++++++--
> net/ipv4/netfilter/arp_tables.c | 12 +++++-----
> net/ipv4/netfilter/ip_tables.c | 12 +++++-----
> net/ipv6/netfilter/ip6_tables.c | 12 +++++-----
> net/netfilter/x_tables.c | 44 ++++++++++++++++++++++++++-----------
> 5 files changed, 58 insertions(+), 32 deletions(-)
>
> --- a/include/linux/netfilter/x_tables.h 2009-01-27 15:33:10.791377313 -0800
> +++ b/include/linux/netfilter/x_tables.h 2009-01-27 15:46:26.024128742 -0800
> @@ -352,8 +352,8 @@ struct xt_table
> /* What hooks you will enter on */
> unsigned int valid_hooks;
>
> - /* Lock for the curtain */
> - rwlock_t lock;
> + /* Lock for curtain */
> + spinlock_t lock;
>
> /* Man behind the curtain... */
> struct xt_table_info *private;
> @@ -383,6 +383,12 @@ struct xt_table_info
> unsigned int hook_entry[NF_INET_NUMHOOKS];
> unsigned int underflow[NF_INET_NUMHOOKS];
>
> + /* For the dustman... */
> + union {
> + struct rcu_head rcu;
> + struct work_struct work;
> + };
> +
> /* ipt_entry tables: one per CPU */
> /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> void *entries[1];
> --- a/net/ipv4/netfilter/arp_tables.c 2009-01-27 15:45:34.566650540 -0800
> +++ b/net/ipv4/netfilter/arp_tables.c 2009-01-27 15:46:26.024128742 -0800
> @@ -237,8 +237,8 @@ unsigned int arpt_do_table(struct sk_buf
> indev = in ? in->name : nulldevname;
> outdev = out ? out->name : nulldevname;
>
> - read_lock_bh(&table->lock);
> - private = table->private;
> + rcu_read_lock_bh();
> + private = rcu_dereference(table->private);
> table_base = (void *)private->entries[smp_processor_id()];
> e = get_entry(table_base, private->hook_entry[hook]);
> back = get_entry(table_base, private->underflow[hook]);
> @@ -313,7 +313,7 @@ unsigned int arpt_do_table(struct sk_buf
> e = (void *)e + e->next_offset;
> }
> } while (!hotdrop);
> - read_unlock_bh(&table->lock);
> + rcu_read_unlock_bh();
>
> if (hotdrop)
> return NF_DROP;
> @@ -1154,8 +1154,8 @@ static int do_add_counters(struct net *n
> goto free;
> }
>
> - write_lock_bh(&t->lock);
> - private = t->private;
> + rcu_read_lock_bh();
> + private = rcu_dereference(t->private);
> if (private->number != num_counters) {
> ret = -EINVAL;
> goto unlock_up_free;
> @@ -1170,7 +1170,7 @@ static int do_add_counters(struct net *n
> paddc,
> &i);
> unlock_up_free:
> - write_unlock_bh(&t->lock);
> + rcu_read_unlock_bh();
> xt_table_unlock(t);
> module_put(t->me);
> free:
> --- a/net/ipv4/netfilter/ip_tables.c 2009-01-27 15:45:05.766673246 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c 2009-01-27 15:46:26.024128742 -0800
> @@ -347,9 +347,9 @@ ipt_do_table(struct sk_buff *skb,
> mtpar.family = tgpar.family = NFPROTO_IPV4;
> tgpar.hooknum = hook;
>
> - read_lock_bh(&table->lock);
> + rcu_read_lock_bh();
> IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> - private = table->private;
> + private = rcu_dereference(table->private);
> table_base = (void *)private->entries[smp_processor_id()];
> e = get_entry(table_base, private->hook_entry[hook]);
>
> @@ -447,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> }
> } while (!hotdrop);
>
> - read_unlock_bh(&table->lock);
> + rcu_read_unlock_bh();
>
> #ifdef DEBUG_ALLOW_ALL
> return NF_ACCEPT;
> @@ -1399,8 +1399,8 @@ do_add_counters(struct net *net, void __
> goto free;
> }
>
> - write_lock_bh(&t->lock);
> - private = t->private;
> + rcu_read_lock_bh();
> + private = rcu_dereference(t->private);
> if (private->number != num_counters) {
> ret = -EINVAL;
> goto unlock_up_free;
> @@ -1415,7 +1415,7 @@ do_add_counters(struct net *net, void __
> paddc,
> &i);
> unlock_up_free:
> - write_unlock_bh(&t->lock);
> + rcu_read_unlock_bh();
> xt_table_unlock(t);
> module_put(t->me);
> free:
> --- a/net/ipv6/netfilter/ip6_tables.c 2009-01-27 15:45:22.262639173 -0800
> +++ b/net/ipv6/netfilter/ip6_tables.c 2009-01-27 15:46:26.028128353 -0800
> @@ -373,9 +373,9 @@ ip6t_do_table(struct sk_buff *skb,
> mtpar.family = tgpar.family = NFPROTO_IPV6;
> tgpar.hooknum = hook;
>
> - read_lock_bh(&table->lock);
> + rcu_read_lock_bh();
> IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> - private = table->private;
> + private = rcu_dereference(table->private);
> table_base = (void *)private->entries[smp_processor_id()];
> e = get_entry(table_base, private->hook_entry[hook]);
>
> @@ -476,7 +476,7 @@ ip6t_do_table(struct sk_buff *skb,
> #ifdef CONFIG_NETFILTER_DEBUG
> ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
> #endif
> - read_unlock_bh(&table->lock);
> + rcu_read_unlock_bh();
>
> #ifdef DEBUG_ALLOW_ALL
> return NF_ACCEPT;
> @@ -1430,8 +1430,8 @@ do_add_counters(struct net *net, void __
> goto free;
> }
>
> - write_lock_bh(&t->lock);
> - private = t->private;
> + rcu_read_lock_bh();
> + private = rcu_dereference(t->private);
> if (private->number != num_counters) {
> ret = -EINVAL;
> goto unlock_up_free;
> @@ -1446,7 +1446,7 @@ do_add_counters(struct net *net, void __
> paddc,
> &i);
> unlock_up_free:
> - write_unlock_bh(&t->lock);
> + rcu_read_unlock_bh();
> xt_table_unlock(t);
> module_put(t->me);
> free:
> --- a/net/netfilter/x_tables.c 2009-01-27 15:14:06.004743434 -0800
> +++ b/net/netfilter/x_tables.c 2009-01-27 15:48:07.194667206 -0800
> @@ -611,17 +611,37 @@ struct xt_table_info *xt_alloc_table_inf
> }
> EXPORT_SYMBOL(xt_alloc_table_info);
>
> -void xt_free_table_info(struct xt_table_info *info)
> +/* callback to do free for vmalloc'd case */
> +static void xt_free_table_info_work(struct work_struct *arg)
> +{
> + struct xt_table_info *info = container_of(arg, struct xt_table_info, work);
> + unsigned int cpu;
> +
> + for_each_possible_cpu(cpu)
> + vfree(info->entries[cpu]);
> + kfree(info);
> +}
> +
> +/* callback to do free after all cpu's are done */
> +static void xt_free_table_info_rcu(struct rcu_head *arg)
> {
> - int cpu;
> + struct xt_table_info *info = container_of(arg, struct xt_table_info, rcu);
>
> - for_each_possible_cpu(cpu) {
> - if (info->size <= PAGE_SIZE)
> + if (info->size <= PAGE_SIZE) {
> + unsigned int cpu;
> + for_each_possible_cpu(cpu)
> kfree(info->entries[cpu]);
> - else
> - vfree(info->entries[cpu]);
> + kfree(info);
> + } else {
> + /* can't safely call vfree in current context */
> + INIT_WORK(&info->work, xt_free_table_info_work);
> + schedule_work(&info->work);
> }
> - kfree(info);
> +}
> +
> +void xt_free_table_info(struct xt_table_info *info)
> +{
> + call_rcu(&info->rcu, xt_free_table_info_rcu);
> }
> EXPORT_SYMBOL(xt_free_table_info);
>
> @@ -671,20 +691,20 @@ xt_replace_table(struct xt_table *table,
> struct xt_table_info *oldinfo, *private;
>
> /* Do the substitution. */
> - write_lock_bh(&table->lock);
> + spin_lock_bh(&table->lock);
> private = table->private;
> /* Check inside lock: is the old number correct? */
> if (num_counters != private->number) {
> duprintf("num_counters != table->private->number (%u/%u)\n",
> num_counters, private->number);
> - write_unlock_bh(&table->lock);
> + spin_unlock_bh(&table->lock);
> *error = -EAGAIN;
> return NULL;
> }
> oldinfo = private;
> - table->private = newinfo;
> + rcu_assign_pointer(table->private, newinfo);
> newinfo->initial_entries = oldinfo->initial_entries;
> - write_unlock_bh(&table->lock);
> + spin_unlock_bh(&table->lock);
>
> return oldinfo;
> }
> @@ -719,7 +739,7 @@ struct xt_table *xt_register_table(struc
>
> /* Simplifies replace_table code. */
> table->private = bootstrap;
> - rwlock_init(&table->lock);
> + spin_lock_init(&table->lock);
>
> if (!xt_replace_table(table, 0, newinfo, &ret))
> goto unlock;
>
> --
>
> --
> To unsubscribe from this list: send the line "unsubscribe netdev" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at http://vger.kernel.org/majordomo-info.html
^ permalink raw reply [flat|nested] 11+ messages in thread