netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/5] iptables lockless receive (v0.3)
@ 2009-01-29  6:25 Stephen Hemminger
  2009-01-29  6:25 ` [PATCH 1/5] netfilter: change elements in x_tables Stephen Hemminger
                   ` (5 more replies)
  0 siblings, 6 replies; 13+ messages in thread
From: Stephen Hemminger @ 2009-01-29  6:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

Another experimental version of iptables locking that removes
reader/writer lock overhead. This version incorporates the suggestion
of moving the counter to table_info.  I haven't done ebtables, and don't
know if it is worth bothering.

-- 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 1/5] netfilter: change elements in x_tables
  2009-01-29  6:25 [PATCH 0/5] iptables lockless receive (v0.3) Stephen Hemminger
@ 2009-01-29  6:25 ` Stephen Hemminger
  2009-01-29  6:25 ` [PATCH 2/5] netfilter: remove unneeded initializations Stephen Hemminger
                   ` (4 subsequent siblings)
  5 siblings, 0 replies; 13+ messages in thread
From: Stephen Hemminger @ 2009-01-29  6:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: x_tables.patch --]
[-- Type: text/plain, Size: 1062 bytes --]

Change to proper type on private pointer rather than anonymous void.
Keep active elements on same cache line.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


---
 include/linux/netfilter/x_tables.h |    9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-01-26 17:24:43.251543415 -0800
+++ b/include/linux/netfilter/x_tables.h	2009-01-26 17:29:12.510649107 -0800
@@ -349,9 +349,6 @@ struct xt_table
 {
 	struct list_head list;
 
-	/* A unique name... */
-	const char name[XT_TABLE_MAXNAMELEN];
-
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
@@ -359,13 +356,15 @@ struct xt_table
 	rwlock_t lock;
 
 	/* Man behind the curtain... */
-	//struct ip6t_table_info *private;
-	void *private;
+	struct xt_table_info *private;
 
 	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
 	struct module *me;
 
 	u_int8_t af;		/* address/protocol family */
+
+	/* A unique name... */
+	const char name[XT_TABLE_MAXNAMELEN];
 };
 
 #include <linux/netfilter_ipv4.h>

-- 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 2/5] netfilter: remove unneeded initializations
  2009-01-29  6:25 [PATCH 0/5] iptables lockless receive (v0.3) Stephen Hemminger
  2009-01-29  6:25 ` [PATCH 1/5] netfilter: change elements in x_tables Stephen Hemminger
@ 2009-01-29  6:25 ` Stephen Hemminger
  2009-01-29  6:25 ` [PATCH 3/5] ebtables: " Stephen Hemminger
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 13+ messages in thread
From: Stephen Hemminger @ 2009-01-29  6:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: iptables-lock-init.patch --]
[-- Type: text/plain, Size: 4593 bytes --]

Later patches change the locking on xt_table and the initialization of
the lock element is not needed since the lock is always initialized in
xt_table_register anyway.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


---
 net/ipv4/netfilter/arptable_filter.c   |    2 --
 net/ipv4/netfilter/iptable_filter.c    |    1 -
 net/ipv4/netfilter/iptable_mangle.c    |    1 -
 net/ipv4/netfilter/iptable_raw.c       |    1 -
 net/ipv4/netfilter/iptable_security.c  |    1 -
 net/ipv4/netfilter/nf_nat_rule.c       |    1 -
 net/ipv6/netfilter/ip6table_filter.c   |    1 -
 net/ipv6/netfilter/ip6table_mangle.c   |    1 -
 net/ipv6/netfilter/ip6table_raw.c      |    1 -
 net/ipv6/netfilter/ip6table_security.c |    1 -
 10 files changed, 11 deletions(-)

--- a/net/ipv4/netfilter/arptable_filter.c	2009-01-26 17:24:43.687542005 -0800
+++ b/net/ipv4/netfilter/arptable_filter.c	2009-01-26 19:50:37.891042244 -0800
@@ -48,8 +48,6 @@ static struct
 static struct xt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(packet_filter.lock),
-	.private	= NULL,
 	.me		= THIS_MODULE,
 	.af		= NFPROTO_ARP,
 };
--- a/net/ipv4/netfilter/iptable_filter.c	2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_filter.c	2009-01-26 19:50:37.891042244 -0800
@@ -56,7 +56,6 @@ static struct
 static struct xt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(packet_filter.lock),
 	.me		= THIS_MODULE,
 	.af		= AF_INET,
 };
--- a/net/ipv4/netfilter/iptable_raw.c	2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_raw.c	2009-01-26 19:50:37.891042244 -0800
@@ -39,7 +39,6 @@ static struct
 static struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks =  RAW_VALID_HOOKS,
-	.lock = __RW_LOCK_UNLOCKED(packet_raw.lock),
 	.me = THIS_MODULE,
 	.af = AF_INET,
 };
--- a/net/ipv4/netfilter/iptable_security.c	2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_security.c	2009-01-26 19:50:37.891042244 -0800
@@ -60,7 +60,6 @@ static struct
 static struct xt_table security_table = {
 	.name		= "security",
 	.valid_hooks	= SECURITY_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(security_table.lock),
 	.me		= THIS_MODULE,
 	.af		= AF_INET,
 };
--- a/net/ipv4/netfilter/nf_nat_rule.c	2009-01-26 17:24:43.695541481 -0800
+++ b/net/ipv4/netfilter/nf_nat_rule.c	2009-01-26 19:51:20.338030618 -0800
@@ -61,7 +61,6 @@ static struct
 static struct xt_table nat_table = {
 	.name		= "nat",
 	.valid_hooks	= NAT_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(nat_table.lock),
 	.me		= THIS_MODULE,
 	.af		= AF_INET,
 };
--- a/net/ipv6/netfilter/ip6table_filter.c	2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_filter.c	2009-01-26 19:50:37.895044361 -0800
@@ -54,7 +54,6 @@ static struct
 static struct xt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(packet_filter.lock),
 	.me		= THIS_MODULE,
 	.af		= AF_INET6,
 };
--- a/net/ipv6/netfilter/ip6table_mangle.c	2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_mangle.c	2009-01-26 19:50:37.895044361 -0800
@@ -60,7 +60,6 @@ static struct
 static struct xt_table packet_mangler = {
 	.name		= "mangle",
 	.valid_hooks	= MANGLE_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(packet_mangler.lock),
 	.me		= THIS_MODULE,
 	.af		= AF_INET6,
 };
--- a/net/ipv6/netfilter/ip6table_raw.c	2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_raw.c	2009-01-26 19:50:37.895044361 -0800
@@ -38,7 +38,6 @@ static struct
 static struct xt_table packet_raw = {
 	.name = "raw",
 	.valid_hooks = RAW_VALID_HOOKS,
-	.lock = __RW_LOCK_UNLOCKED(packet_raw.lock),
 	.me = THIS_MODULE,
 	.af = AF_INET6,
 };
--- a/net/ipv6/netfilter/ip6table_security.c	2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_security.c	2009-01-26 19:50:37.895044361 -0800
@@ -59,7 +59,6 @@ static struct
 static struct xt_table security_table = {
 	.name		= "security",
 	.valid_hooks	= SECURITY_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(security_table.lock),
 	.me		= THIS_MODULE,
 	.af		= AF_INET6,
 };
--- a/net/ipv4/netfilter/iptable_mangle.c	2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_mangle.c	2009-01-26 19:50:37.895044361 -0800
@@ -67,7 +67,6 @@ static struct
 static struct xt_table packet_mangler = {
 	.name		= "mangle",
 	.valid_hooks	= MANGLE_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(packet_mangler.lock),
 	.me		= THIS_MODULE,
 	.af		= AF_INET,
 };

-- 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 3/5] ebtables: remove unneeded initializations
  2009-01-29  6:25 [PATCH 0/5] iptables lockless receive (v0.3) Stephen Hemminger
  2009-01-29  6:25 ` [PATCH 1/5] netfilter: change elements in x_tables Stephen Hemminger
  2009-01-29  6:25 ` [PATCH 2/5] netfilter: remove unneeded initializations Stephen Hemminger
@ 2009-01-29  6:25 ` Stephen Hemminger
  2009-01-29  6:25 ` [PATCH 4/5] netfilter: use sequence number synchronization for counters Stephen Hemminger
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 13+ messages in thread
From: Stephen Hemminger @ 2009-01-29  6:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: ebtables-lock-init.patch --]
[-- Type: text/plain, Size: 1537 bytes --]

The initialization of the lock element is not needed
since the lock is always initialized in ebt_register_table.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

---
 net/bridge/netfilter/ebtable_broute.c |    1 -
 net/bridge/netfilter/ebtable_filter.c |    1 -
 net/bridge/netfilter/ebtable_nat.c    |    1 -
 3 files changed, 3 deletions(-)

--- a/net/bridge/netfilter/ebtable_broute.c	2009-01-27 17:09:10.313100854 -0800
+++ b/net/bridge/netfilter/ebtable_broute.c	2009-01-27 17:09:15.862142852 -0800
@@ -46,7 +46,6 @@ static struct ebt_table broute_table =
 	.name		= "broute",
 	.table		= &initial_table,
 	.valid_hooks	= 1 << NF_BR_BROUTING,
-	.lock		= __RW_LOCK_UNLOCKED(broute_table.lock),
 	.check		= check,
 	.me		= THIS_MODULE,
 };
--- a/net/bridge/netfilter/ebtable_filter.c	2009-01-27 17:08:50.725100955 -0800
+++ b/net/bridge/netfilter/ebtable_filter.c	2009-01-27 17:08:53.828611768 -0800
@@ -55,7 +55,6 @@ static struct ebt_table frame_filter =
 	.name		= "filter",
 	.table		= &initial_table,
 	.valid_hooks	= FILTER_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(frame_filter.lock),
 	.check		= check,
 	.me		= THIS_MODULE,
 };
--- a/net/bridge/netfilter/ebtable_nat.c	2009-01-27 17:09:22.896602465 -0800
+++ b/net/bridge/netfilter/ebtable_nat.c	2009-01-27 17:09:31.589085328 -0800
@@ -55,7 +55,6 @@ static struct ebt_table frame_nat =
 	.name		= "nat",
 	.table		= &initial_table,
 	.valid_hooks	= NAT_VALID_HOOKS,
-	.lock		= __RW_LOCK_UNLOCKED(frame_nat.lock),
 	.check		= check,
 	.me		= THIS_MODULE,
 };

-- 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 4/5] netfilter: use sequence number synchronization for counters
  2009-01-29  6:25 [PATCH 0/5] iptables lockless receive (v0.3) Stephen Hemminger
                   ` (2 preceding siblings ...)
  2009-01-29  6:25 ` [PATCH 3/5] ebtables: " Stephen Hemminger
@ 2009-01-29  6:25 ` Stephen Hemminger
  2009-01-29  8:47   ` Eric Dumazet
  2009-01-29  6:25 ` [PATCH 5/5] netfilter: convert x_tables to use RCU Stephen Hemminger
  2009-01-29  8:07 ` [PATCH 0/5] iptables lockless receive (v0.3) Eric Dumazet
  5 siblings, 1 reply; 13+ messages in thread
From: Stephen Hemminger @ 2009-01-29  6:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: counters-seqcount.patch --]
[-- Type: text/plain, Size: 9055 bytes --]

Change how synchronization is done on the iptables counters. Use seqcount
wrapper instead of depending on reader/writer lock.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


---
 include/linux/netfilter/x_tables.h |    3 +++
 net/ipv4/netfilter/arp_tables.c    |   24 +++++++++++++++++++-----
 net/ipv4/netfilter/ip_tables.c     |   24 +++++++++++++++++++-----
 net/ipv6/netfilter/ip6_tables.c    |   32 +++++++++++++++++++++++---------
 net/netfilter/x_tables.c           |   11 +++++++++++
 5 files changed, 75 insertions(+), 19 deletions(-)
4
--- a/net/ipv4/netfilter/arp_tables.c	2009-01-28 21:24:39.223991934 -0800
+++ b/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:13:16.423490077 -0800
@@ -230,6 +230,7 @@ unsigned int arpt_do_table(struct sk_buf
 	void *table_base;
 	const struct xt_table_info *private;
 	struct xt_target_param tgpar;
+	seqcount_t *seq;
 
 	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
 		return NF_DROP;
@@ -240,6 +241,7 @@ unsigned int arpt_do_table(struct sk_buf
 	read_lock_bh(&table->lock);
 	private = table->private;
 	table_base = (void *)private->entries[smp_processor_id()];
+	seq = per_cpu_ptr(private->seq, smp_processor_id());
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
 
@@ -256,7 +258,9 @@ unsigned int arpt_do_table(struct sk_buf
 
 			hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
 				(2 * skb->dev->addr_len);
+			write_seqcount_begin(seq);
 			ADD_COUNTER(e->counters, hdr_len, 1);
+			write_seqcount_end(seq);
 
 			t = arpt_get_target(e);
 
@@ -662,10 +666,20 @@ static int translate_table(const char *n
 
 /* Gets counters. */
 static inline int add_entry_to_counter(const struct arpt_entry *e,
+				       seqcount_t *seq,
 				       struct xt_counters total[],
 				       unsigned int *i)
 {
-	ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+	struct xt_counters count;
+	unsigned int start;
+
+	/* Atomic fetch */
+	do {
+		start = read_seqcount_begin(seq);
+		count = e->counters;
+	} while (read_seqcount_retry(seq, start));
+
+	ADD_COUNTER(total[*i], count.bcnt, count.pcnt);
 
 	(*i)++;
 	return 0;
@@ -709,6 +723,7 @@ static void get_counters(const struct xt
 		ARPT_ENTRY_ITERATE(t->entries[cpu],
 				   t->size,
 				   add_entry_to_counter,
+				   per_cpu_ptr(t->seq, cpu),
 				   counters,
 				   &i);
 	}
@@ -731,9 +746,9 @@ static inline struct xt_counters *alloc_
 		return ERR_PTR(-ENOMEM);
 
 	/* First, sum counters... */
-	write_lock_bh(&table->lock);
+	local_bh_disable();
 	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	local_bh_enable();
 
 	return counters;
 }
@@ -1736,8 +1751,7 @@ struct xt_table *arpt_register_table(str
 {
 	int ret;
 	struct xt_table_info *newinfo;
-	struct xt_table_info bootstrap
-		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	struct xt_table_info bootstrap = { 0 };
 	void *loc_cpu_entry;
 	struct xt_table *new_table;
 
--- a/net/ipv4/netfilter/ip_tables.c	2009-01-28 21:24:39.211990658 -0800
+++ b/net/ipv4/netfilter/ip_tables.c	2009-01-28 22:06:10.596739805 -0800
@@ -327,6 +327,7 @@ ipt_do_table(struct sk_buff *skb,
 	struct xt_table_info *private;
 	struct xt_match_param mtpar;
 	struct xt_target_param tgpar;
+	seqcount_t *seq;
 
 	/* Initialization */
 	ip = ip_hdr(skb);
@@ -351,6 +352,7 @@ ipt_do_table(struct sk_buff *skb,
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 	private = table->private;
 	table_base = (void *)private->entries[smp_processor_id()];
+	seq = per_cpu_ptr(private->seq, smp_processor_id());
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
@@ -366,7 +368,9 @@ ipt_do_table(struct sk_buff *skb,
 			if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
 				goto no_match;
 
+			write_seqcount_begin(seq);
 			ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+			write_seqcount_end(seq);
 
 			t = ipt_get_target(e);
 			IP_NF_ASSERT(t->u.kernel.target);
@@ -872,10 +876,20 @@ translate_table(const char *name,
 /* Gets counters. */
 static inline int
 add_entry_to_counter(const struct ipt_entry *e,
+		     seqcount_t *seq,
 		     struct xt_counters total[],
 		     unsigned int *i)
 {
-	ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+	struct xt_counters count;
+	unsigned int start;
+
+	/* Atomic fetch */
+	do {
+		start = read_seqcount_begin(seq);
+		count = e->counters;
+	} while (read_seqcount_retry(seq, start));
+
+	ADD_COUNTER(total[*i], count.bcnt, count.pcnt);
 
 	(*i)++;
 	return 0;
@@ -921,6 +935,7 @@ get_counters(const struct xt_table_info 
 		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
+				  per_cpu_ptr(t->seq, cpu),
 				  counters,
 				  &i);
 	}
@@ -942,9 +957,9 @@ static struct xt_counters * alloc_counte
 		return ERR_PTR(-ENOMEM);
 
 	/* First, sum counters... */
-	write_lock_bh(&table->lock);
+	local_bh_disable();
 	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	local_bh_enable();
 
 	return counters;
 }
@@ -2064,8 +2079,7 @@ struct xt_table *ipt_register_table(stru
 {
 	int ret;
 	struct xt_table_info *newinfo;
-	struct xt_table_info bootstrap
-		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	struct xt_table_info bootstrap = { 0 };
 	void *loc_cpu_entry;
 	struct xt_table *new_table;
 
--- a/net/ipv6/netfilter/ip6_tables.c	2009-01-28 21:24:39.243992135 -0800
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-01-28 22:13:16.419490741 -0800
@@ -357,6 +357,7 @@ ip6t_do_table(struct sk_buff *skb,
 	struct xt_table_info *private;
 	struct xt_match_param mtpar;
 	struct xt_target_param tgpar;
+	seqcount_t *seq;
 
 	/* Initialization */
 	indev = in ? in->name : nulldevname;
@@ -377,6 +378,7 @@ ip6t_do_table(struct sk_buff *skb,
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
 	private = table->private;
 	table_base = (void *)private->entries[smp_processor_id()];
+	seq = per_cpu_ptr(private->seq, smp_processor_id());
 	e = get_entry(table_base, private->hook_entry[hook]);
 
 	/* For return from builtin chain */
@@ -392,9 +394,11 @@ ip6t_do_table(struct sk_buff *skb,
 			if (IP6T_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
 				goto no_match;
 
+			write_seqcount_begin(seq);
 			ADD_COUNTER(e->counters,
 				    ntohs(ipv6_hdr(skb)->payload_len) +
 				    sizeof(struct ipv6hdr), 1);
+			write_seqcount_end(seq);
 
 			t = ip6t_get_target(e);
 			IP_NF_ASSERT(t->u.kernel.target);
@@ -901,11 +905,21 @@ translate_table(const char *name,
 /* Gets counters. */
 static inline int
 add_entry_to_counter(const struct ip6t_entry *e,
+		     seqcount_t *seq,
 		     struct xt_counters total[],
 		     unsigned int *i)
 {
-	ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
 
+	struct xt_counters count;
+	unsigned int start;
+
+	/* Atomic fetch */
+	do {
+		start = read_seqcount_begin(seq);
+		count = e->counters;
+	} while (read_seqcount_retry(seq, start));
+
+	ADD_COUNTER(total[*i], count.bcnt, count.pcnt);
 	(*i)++;
 	return 0;
 }
@@ -948,10 +962,11 @@ get_counters(const struct xt_table_info 
 			continue;
 		i = 0;
 		IP6T_ENTRY_ITERATE(t->entries[cpu],
-				  t->size,
-				  add_entry_to_counter,
-				  counters,
-				  &i);
+				   t->size,
+				   add_entry_to_counter,
+				   per_cpu_ptr(t->seq, cpu),
+				   counters,
+				   &i);
 	}
 }
 
@@ -971,9 +986,9 @@ static struct xt_counters *alloc_counter
 		return ERR_PTR(-ENOMEM);
 
 	/* First, sum counters... */
-	write_lock_bh(&table->lock);
+	local_bh_disable();
 	get_counters(private, counters);
-	write_unlock_bh(&table->lock);
+	local_bh_enable();
 
 	return counters;
 }
@@ -2094,8 +2109,7 @@ struct xt_table *ip6t_register_table(str
 {
 	int ret;
 	struct xt_table_info *newinfo;
-	struct xt_table_info bootstrap
-		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	struct xt_table_info bootstrap =  { 0 };
 	void *loc_cpu_entry;
 	struct xt_table *new_table;
 
--- a/net/netfilter/x_tables.c	2009-01-28 21:39:17.644495623 -0800
+++ b/net/netfilter/x_tables.c	2009-01-28 22:14:33.143990681 -0800
@@ -591,8 +591,18 @@ struct xt_table_info *xt_alloc_table_inf
 		return NULL;
 
 	newinfo->size = size;
+	newinfo->seq = alloc_percpu(seqcount_t);
+	if (!newinfo->seq) {
+		kfree(newinfo);
+		return NULL;
+	}
+
 
 	for_each_possible_cpu(cpu) {
+		seqcount_t *cnt = per_cpu_ptr(newinfo->seq, cpu);
+
+		seqcount_init(cnt);
+
 		if (size <= PAGE_SIZE)
 			newinfo->entries[cpu] = kmalloc_node(size,
 							GFP_KERNEL,
@@ -621,6 +631,7 @@ void xt_free_table_info(struct xt_table_
 		else
 			vfree(info->entries[cpu]);
 	}
+	free_percpu(info->seq);
 	kfree(info);
 }
 EXPORT_SYMBOL(xt_free_table_info);
--- a/include/linux/netfilter/x_tables.h	2009-01-28 21:35:12.044240843 -0800
+++ b/include/linux/netfilter/x_tables.h	2009-01-28 22:04:39.316517913 -0800
@@ -383,6 +383,9 @@ struct xt_table_info
 	unsigned int hook_entry[NF_INET_NUMHOOKS];
 	unsigned int underflow[NF_INET_NUMHOOKS];
 
+	/* Secret compartment */
+	seqcount_t *seq;
+
 	/* ipt_entry tables: one per CPU */
 	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
 	char *entries[1];

-- 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* [PATCH 5/5] netfilter: convert x_tables to use RCU
  2009-01-29  6:25 [PATCH 0/5] iptables lockless receive (v0.3) Stephen Hemminger
                   ` (3 preceding siblings ...)
  2009-01-29  6:25 ` [PATCH 4/5] netfilter: use sequence number synchronization for counters Stephen Hemminger
@ 2009-01-29  6:25 ` Stephen Hemminger
  2009-01-29 23:04   ` Eric Dumazet
  2009-01-29  8:07 ` [PATCH 0/5] iptables lockless receive (v0.3) Eric Dumazet
  5 siblings, 1 reply; 13+ messages in thread
From: Stephen Hemminger @ 2009-01-29  6:25 UTC (permalink / raw)
  To: David Miller; +Cc: netdev

[-- Attachment #1: iptables-rcu.patch --]
[-- Type: text/plain, Size: 8238 bytes --]

Replace existing reader/writer lock with Read-Copy-Update to
eliminate the overhead of a read lock on each incoming packet.
This should reduce the overhead of iptables especially on SMP
systems.

The previous code used a reader-writer lock for two purposes.
The first was to ensure that the xt_table_info reference was not in
process of being changed. Since xt_table_info is only freed via one
routine, it was a direct conversion to RCU.

The other use of the reader-writer lock was to block changes
to counters while they were being read. This synchronization was
fixed by the previous patch.  But we still need to make sure the table
info isn't going away.

Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>


---
 include/linux/netfilter/x_tables.h |   10 ++++++-
 net/ipv4/netfilter/arp_tables.c    |   12 ++++-----
 net/ipv4/netfilter/ip_tables.c     |   12 ++++-----
 net/ipv6/netfilter/ip6_tables.c    |   12 ++++-----
 net/netfilter/x_tables.c           |   48 ++++++++++++++++++++++++++-----------
 5 files changed, 60 insertions(+), 34 deletions(-)

--- a/include/linux/netfilter/x_tables.h	2009-01-28 22:04:39.316517913 -0800
+++ b/include/linux/netfilter/x_tables.h	2009-01-28 22:14:54.648490491 -0800
@@ -352,8 +352,8 @@ struct xt_table
 	/* What hooks you will enter on */
 	unsigned int valid_hooks;
 
-	/* Lock for the curtain */
-	rwlock_t lock;
+	/* Lock for curtain */
+	spinlock_t lock;
 
 	/* Man behind the curtain... */
 	struct xt_table_info *private;
@@ -386,6 +386,12 @@ struct xt_table_info
 	/* Secret compartment */
 	seqcount_t *seq;
 
+	/* For the dustman... */
+	union {
+		struct rcu_head rcu;
+		struct work_struct work;
+	};
+
 	/* ipt_entry tables: one per CPU */
 	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
 	char *entries[1];
--- a/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:13:16.423490077 -0800
+++ b/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:14:54.648490491 -0800
@@ -238,8 +238,8 @@ unsigned int arpt_do_table(struct sk_buf
 	indev = in ? in->name : nulldevname;
 	outdev = out ? out->name : nulldevname;
 
-	read_lock_bh(&table->lock);
-	private = table->private;
+	rcu_read_lock_bh();
+	private = rcu_dereference(table->private);
 	table_base = (void *)private->entries[smp_processor_id()];
 	seq = per_cpu_ptr(private->seq, smp_processor_id());
 	e = get_entry(table_base, private->hook_entry[hook]);
@@ -315,7 +315,7 @@ unsigned int arpt_do_table(struct sk_buf
 			e = (void *)e + e->next_offset;
 		}
 	} while (!hotdrop);
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock_bh();
 
 	if (hotdrop)
 		return NF_DROP;
@@ -1163,8 +1163,8 @@ static int do_add_counters(struct net *n
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
-	private = t->private;
+	rcu_read_lock_bh();
+	private = rcu_dereference(t->private);
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
@@ -1179,7 +1179,7 @@ static int do_add_counters(struct net *n
 			   paddc,
 			   &i);
  unlock_up_free:
-	write_unlock_bh(&t->lock);
+	rcu_read_unlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv4/netfilter/ip_tables.c	2009-01-28 22:06:10.596739805 -0800
+++ b/net/ipv4/netfilter/ip_tables.c	2009-01-28 22:14:54.648490491 -0800
@@ -348,9 +348,9 @@ ipt_do_table(struct sk_buff *skb,
 	mtpar.family  = tgpar.family = NFPROTO_IPV4;
 	tgpar.hooknum = hook;
 
-	read_lock_bh(&table->lock);
+	rcu_read_lock_bh();
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
+	private = rcu_dereference(table->private);
 	table_base = (void *)private->entries[smp_processor_id()];
 	seq = per_cpu_ptr(private->seq, smp_processor_id());
 	e = get_entry(table_base, private->hook_entry[hook]);
@@ -449,7 +449,7 @@ ipt_do_table(struct sk_buff *skb,
 		}
 	} while (!hotdrop);
 
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -1408,8 +1408,8 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
-	private = t->private;
+	rcu_read_lock_bh();
+	private = rcu_dereference(t->private);
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
@@ -1424,7 +1424,7 @@ do_add_counters(struct net *net, void __
 			  paddc,
 			  &i);
  unlock_up_free:
-	write_unlock_bh(&t->lock);
+	rcu_read_unlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/ipv6/netfilter/ip6_tables.c	2009-01-28 22:13:16.419490741 -0800
+++ b/net/ipv6/netfilter/ip6_tables.c	2009-01-28 22:14:54.652490133 -0800
@@ -374,9 +374,9 @@ ip6t_do_table(struct sk_buff *skb,
 	mtpar.family  = tgpar.family = NFPROTO_IPV6;
 	tgpar.hooknum = hook;
 
-	read_lock_bh(&table->lock);
+	rcu_read_lock_bh();
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	private = table->private;
+	private = rcu_dereference(table->private);
 	table_base = (void *)private->entries[smp_processor_id()];
 	seq = per_cpu_ptr(private->seq, smp_processor_id());
 	e = get_entry(table_base, private->hook_entry[hook]);
@@ -478,7 +478,7 @@ ip6t_do_table(struct sk_buff *skb,
 #ifdef CONFIG_NETFILTER_DEBUG
 	((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
 #endif
-	read_unlock_bh(&table->lock);
+	rcu_read_unlock_bh();
 
 #ifdef DEBUG_ALLOW_ALL
 	return NF_ACCEPT;
@@ -1439,8 +1439,8 @@ do_add_counters(struct net *net, void __
 		goto free;
 	}
 
-	write_lock_bh(&t->lock);
-	private = t->private;
+	rcu_read_lock_bh();
+	private = rcu_dereference(t->private);
 	if (private->number != num_counters) {
 		ret = -EINVAL;
 		goto unlock_up_free;
@@ -1455,7 +1455,7 @@ do_add_counters(struct net *net, void __
 			  paddc,
 			  &i);
  unlock_up_free:
-	write_unlock_bh(&t->lock);
+	rcu_read_unlock_bh();
 	xt_table_unlock(t);
 	module_put(t->me);
  free:
--- a/net/netfilter/x_tables.c	2009-01-28 22:14:33.143990681 -0800
+++ b/net/netfilter/x_tables.c	2009-01-28 22:17:40.183990832 -0800
@@ -621,19 +621,39 @@ struct xt_table_info *xt_alloc_table_inf
 }
 EXPORT_SYMBOL(xt_alloc_table_info);
 
-void xt_free_table_info(struct xt_table_info *info)
+/* callback to do free for vmalloc'd case */
+static void xt_free_table_info_work(struct work_struct *arg)
 {
-	int cpu;
+	struct xt_table_info *info = container_of(arg, struct xt_table_info, work);
+	unsigned int cpu;
 
-	for_each_possible_cpu(cpu) {
-		if (info->size <= PAGE_SIZE)
-			kfree(info->entries[cpu]);
-		else
-			vfree(info->entries[cpu]);
-	}
-	free_percpu(info->seq);
+	for_each_possible_cpu(cpu)
+		vfree(info->entries[cpu]);
 	kfree(info);
 }
+
+static void xt_free_table_info_rcu(struct rcu_head *arg)
+{
+ 	struct xt_table_info *info = container_of(arg, struct xt_table_info, rcu);
+
+	free_percpu(info->seq);
+
+ 	if (info->size <= PAGE_SIZE) {
+		unsigned int cpu;
+ 		for_each_possible_cpu(cpu)
+ 			kfree(info->entries[cpu]);
+ 		kfree(info);
+ 	} else {
+ 		/* can't safely call vfree in current context */
+ 		INIT_WORK(&info->work, xt_free_table_info_work);
+ 		schedule_work(&info->work);
+  	}
+}
+
+void xt_free_table_info(struct xt_table_info *info)
+{
+ 	call_rcu(&info->rcu, xt_free_table_info_rcu);
+}
 EXPORT_SYMBOL(xt_free_table_info);
 
 /* Find table by name, grabs mutex & ref.  Returns ERR_PTR() on error. */
@@ -682,20 +702,20 @@ xt_replace_table(struct xt_table *table,
 	struct xt_table_info *oldinfo, *private;
 
 	/* Do the substitution. */
-	write_lock_bh(&table->lock);
+	spin_lock_bh(&table->lock);
 	private = table->private;
 	/* Check inside lock: is the old number correct? */
 	if (num_counters != private->number) {
 		duprintf("num_counters != table->private->number (%u/%u)\n",
 			 num_counters, private->number);
-		write_unlock_bh(&table->lock);
+		spin_unlock_bh(&table->lock);
 		*error = -EAGAIN;
 		return NULL;
 	}
 	oldinfo = private;
-	table->private = newinfo;
+	rcu_assign_pointer(table->private, newinfo);
 	newinfo->initial_entries = oldinfo->initial_entries;
-	write_unlock_bh(&table->lock);
+	spin_unlock_bh(&table->lock);
 
 	return oldinfo;
 }
@@ -730,7 +750,7 @@ struct xt_table *xt_register_table(struc
 
 	/* Simplifies replace_table code. */
 	table->private = bootstrap;
-	rwlock_init(&table->lock);
+	spin_lock_init(&table->lock);
 	if (!xt_replace_table(table, 0, newinfo, &ret))
 		goto unlock;
 

-- 


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 0/5] iptables lockless receive (v0.3)
  2009-01-29  6:25 [PATCH 0/5] iptables lockless receive (v0.3) Stephen Hemminger
                   ` (4 preceding siblings ...)
  2009-01-29  6:25 ` [PATCH 5/5] netfilter: convert x_tables to use RCU Stephen Hemminger
@ 2009-01-29  8:07 ` Eric Dumazet
  5 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2009-01-29  8:07 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Stephen Hemminger a écrit :
> Another experimental version of iptables locking that removes
> reader/writer lock overhead. This version incorprates suggestion
> of moving counter to table_info.  I haven't done ebtables, and don't
> know if it is worth bothering.
> 

A fast review reveals no problems, this looks good.

I will do some testings today and give a feedback.

By the way, you forgot to CC Patrick McHardy and netfilter-devel

Thanks a lot Stephen


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 4/5] netfilter: use sequence number synchronization for counters
  2009-01-29  6:25 ` [PATCH 4/5] netfilter: use sequence number synchronization for counters Stephen Hemminger
@ 2009-01-29  8:47   ` Eric Dumazet
  0 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2009-01-29  8:47 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Stephen Hemminger a écrit :
> Change how synchronization is done on the iptables counters. Use seqcount
> wrapper instead of depending on reader/writer lock.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> 
> ---
>  include/linux/netfilter/x_tables.h |    3 +++
>  net/ipv4/netfilter/arp_tables.c    |   24 +++++++++++++++++++-----
>  net/ipv4/netfilter/ip_tables.c     |   24 +++++++++++++++++++-----
>  net/ipv6/netfilter/ip6_tables.c    |   32 +++++++++++++++++++++++---------
>  net/netfilter/x_tables.c           |   11 +++++++++++
>  5 files changed, 75 insertions(+), 19 deletions(-)
> 4
> --- a/net/ipv4/netfilter/arp_tables.c	2009-01-28 21:24:39.223991934 -0800
> +++ b/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:13:16.423490077 -0800
> @@ -230,6 +230,7 @@ unsigned int arpt_do_table(struct sk_buf
>  	void *table_base;
>  	const struct xt_table_info *private;
>  	struct xt_target_param tgpar;
> +	seqcount_t *seq;
>  
>  	if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
>  		return NF_DROP;
> @@ -240,6 +241,7 @@ unsigned int arpt_do_table(struct sk_buf
>  	read_lock_bh(&table->lock);
>  	private = table->private;
>  	table_base = (void *)private->entries[smp_processor_id()];
> +	seq = per_cpu_ptr(private->seq, smp_processor_id());

But, why not using a global seqcount_t, shared by all tables, no matter they
are arp_tables, ip_tables, ip6_tables ?

A global PER_CPU variable, not dynamically allocated, so that its
access can be faster (no indirection), and uses exactly 4 bytes per
possible cpu.

DEFINE_PER_CPU(seqcount_t,  nf_seqcount);



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/5] netfilter: convert x_tables to use RCU
  2009-01-29  6:25 ` [PATCH 5/5] netfilter: convert x_tables to use RCU Stephen Hemminger
@ 2009-01-29 23:04   ` Eric Dumazet
  2009-01-29 23:16     ` Stephen Hemminger
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2009-01-29 23:04 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Stephen Hemminger a écrit :
> Replace existing reader/writer lock with Read-Copy-Update to
> eliminate the overhead of a read lock on each incoming packet.
> This should reduce the overhead of iptables especially on SMP
> systems.
> 
> The previous code used a reader-writer lock for two purposes.
> The first was to ensure that the xt_table_info reference was not in
> process of being changed. Since xt_table_info is only freed via one
> routine, it was a direct conversion to RCU.
> 
> The other use of the reader-writer lock was to block changes
> to counters while they were being read. This synchronization was
> fixed by the previous patch.  But still need to make sure table info
> isn't going away.
> 
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> 
> 
> ---
>  include/linux/netfilter/x_tables.h |   10 ++++++-
>  net/ipv4/netfilter/arp_tables.c    |   12 ++++-----
>  net/ipv4/netfilter/ip_tables.c     |   12 ++++-----
>  net/ipv6/netfilter/ip6_tables.c    |   12 ++++-----
>  net/netfilter/x_tables.c           |   48 ++++++++++++++++++++++++++-----------
>  5 files changed, 60 insertions(+), 34 deletions(-)
> 
> --- a/include/linux/netfilter/x_tables.h	2009-01-28 22:04:39.316517913 -0800
> +++ b/include/linux/netfilter/x_tables.h	2009-01-28 22:14:54.648490491 -0800
> @@ -352,8 +352,8 @@ struct xt_table
>  	/* What hooks you will enter on */
>  	unsigned int valid_hooks;
>  
> -	/* Lock for the curtain */
> -	rwlock_t lock;
> +	/* Lock for curtain */
> +	spinlock_t lock;
>  
>  	/* Man behind the curtain... */
>  	struct xt_table_info *private;
> @@ -386,6 +386,12 @@ struct xt_table_info
>  	/* Secret compartment */
>  	seqcount_t *seq;
>  
> +	/* For the dustman... */
> +	union {
> +		struct rcu_head rcu;
> +		struct work_struct work;
> +	};
> +
>  	/* ipt_entry tables: one per CPU */
>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
>  	char *entries[1];
> --- a/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:13:16.423490077 -0800
> +++ b/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:14:54.648490491 -0800
> @@ -238,8 +238,8 @@ unsigned int arpt_do_table(struct sk_buf
>  	indev = in ? in->name : nulldevname;
>  	outdev = out ? out->name : nulldevname;
>  
> -	read_lock_bh(&table->lock);
> -	private = table->private;
> +	rcu_read_lock_bh();
> +	private = rcu_dereference(table->private);
>  	table_base = (void *)private->entries[smp_processor_id()];
>  	seq = per_cpu_ptr(private->seq, smp_processor_id());
>  	e = get_entry(table_base, private->hook_entry[hook]);
> @@ -315,7 +315,7 @@ unsigned int arpt_do_table(struct sk_buf
>  			e = (void *)e + e->next_offset;
>  		}
>  	} while (!hotdrop);
> -	read_unlock_bh(&table->lock);
> +	rcu_read_unlock_bh();
>  
>  	if (hotdrop)
>  		return NF_DROP;
> @@ -1163,8 +1163,8 @@ static int do_add_counters(struct net *n
>  		goto free;
>  	}
>  
> -	write_lock_bh(&t->lock);
> -	private = t->private;
> +	rcu_read_lock_bh();
> +	private = rcu_dereference(t->private);
>  	if (private->number != num_counters) {
>  		ret = -EINVAL;
>  		goto unlock_up_free;
> @@ -1179,7 +1179,7 @@ static int do_add_counters(struct net *n
>  			   paddc,
>  			   &i);
>   unlock_up_free:
> -	write_unlock_bh(&t->lock);
> +	rcu_read_unlock_bh();
>  	xt_table_unlock(t);
>  	module_put(t->me);
>   free:
> --- a/net/ipv4/netfilter/ip_tables.c	2009-01-28 22:06:10.596739805 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c	2009-01-28 22:14:54.648490491 -0800
> @@ -348,9 +348,9 @@ ipt_do_table(struct sk_buff *skb,
>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
>  	tgpar.hooknum = hook;
>  
> -	read_lock_bh(&table->lock);
> +	rcu_read_lock_bh();
>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> -	private = table->private;
> +	private = rcu_dereference(table->private);
>  	table_base = (void *)private->entries[smp_processor_id()];
>  	seq = per_cpu_ptr(private->seq, smp_processor_id());
>  	e = get_entry(table_base, private->hook_entry[hook]);
> @@ -449,7 +449,7 @@ ipt_do_table(struct sk_buff *skb,
>  		}
>  	} while (!hotdrop);
>  
> -	read_unlock_bh(&table->lock);
> +	rcu_read_unlock_bh();
>  
>  #ifdef DEBUG_ALLOW_ALL
>  	return NF_ACCEPT;
> @@ -1408,8 +1408,8 @@ do_add_counters(struct net *net, void __
>  		goto free;
>  	}
>  
> -	write_lock_bh(&t->lock);
> -	private = t->private;
> +	rcu_read_lock_bh();
> +	private = rcu_dereference(t->private);

I feel a little bit nervous seeing a write_lock_bh() changed to a rcu_read_lock()

Also, add_counter_to_entry() is not using seqcount protection, so another thread
doing an iptables -L in parallel with this thread will possibly get corrupted counters.


(With write_lock_bh(), this corruption could not occur)



^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/5] netfilter: convert x_tables to use RCU
  2009-01-29 23:04   ` Eric Dumazet
@ 2009-01-29 23:16     ` Stephen Hemminger
  2009-01-30  6:53       ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: Stephen Hemminger @ 2009-01-29 23:16 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: David Miller, netdev

On Fri, 30 Jan 2009 00:04:16 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Stephen Hemminger a écrit :
> > Replace existing reader/writer lock with Read-Copy-Update to
> > eliminate the overhead of a read lock on each incoming packet.
> > This should reduce the overhead of iptables especially on SMP
> > systems.
> > 
> > The previous code used a reader-writer lock for two purposes.
> > The first was to ensure that the xt_table_info reference was not in
> > process of being changed. Since xt_table_info is only freed via one
> > routine, it was a direct conversion to RCU.
> > 
> > The other use of the reader-writer lock was to block changes
> > to counters while they were being read. This synchronization was
> > fixed by the previous patch.  But still need to make sure table info
> > isn't going away.
> > 
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > 
> > 
> > ---
> >  include/linux/netfilter/x_tables.h |   10 ++++++-
> >  net/ipv4/netfilter/arp_tables.c    |   12 ++++-----
> >  net/ipv4/netfilter/ip_tables.c     |   12 ++++-----
> >  net/ipv6/netfilter/ip6_tables.c    |   12 ++++-----
> >  net/netfilter/x_tables.c           |   48 ++++++++++++++++++++++++++-----------
> >  5 files changed, 60 insertions(+), 34 deletions(-)
> > 
> > --- a/include/linux/netfilter/x_tables.h	2009-01-28 22:04:39.316517913 -0800
> > +++ b/include/linux/netfilter/x_tables.h	2009-01-28 22:14:54.648490491 -0800
> > @@ -352,8 +352,8 @@ struct xt_table
> >  	/* What hooks you will enter on */
> >  	unsigned int valid_hooks;
> >  
> > -	/* Lock for the curtain */
> > -	rwlock_t lock;
> > +	/* Lock for curtain */
> > +	spinlock_t lock;
> >  
> >  	/* Man behind the curtain... */
> >  	struct xt_table_info *private;
> > @@ -386,6 +386,12 @@ struct xt_table_info
> >  	/* Secret compartment */
> >  	seqcount_t *seq;
> >  
> > +	/* For the dustman... */
> > +	union {
> > +		struct rcu_head rcu;
> > +		struct work_struct work;
> > +	};
> > +
> >  	/* ipt_entry tables: one per CPU */
> >  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> >  	char *entries[1];
> > --- a/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:13:16.423490077 -0800
> > +++ b/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:14:54.648490491 -0800
> > @@ -238,8 +238,8 @@ unsigned int arpt_do_table(struct sk_buf
> >  	indev = in ? in->name : nulldevname;
> >  	outdev = out ? out->name : nulldevname;
> >  
> > -	read_lock_bh(&table->lock);
> > -	private = table->private;
> > +	rcu_read_lock_bh();
> > +	private = rcu_dereference(table->private);
> >  	table_base = (void *)private->entries[smp_processor_id()];
> >  	seq = per_cpu_ptr(private->seq, smp_processor_id());
> >  	e = get_entry(table_base, private->hook_entry[hook]);
> > @@ -315,7 +315,7 @@ unsigned int arpt_do_table(struct sk_buf
> >  			e = (void *)e + e->next_offset;
> >  		}
> >  	} while (!hotdrop);
> > -	read_unlock_bh(&table->lock);
> > +	rcu_read_unlock_bh();
> >  
> >  	if (hotdrop)
> >  		return NF_DROP;
> > @@ -1163,8 +1163,8 @@ static int do_add_counters(struct net *n
> >  		goto free;
> >  	}
> >  
> > -	write_lock_bh(&t->lock);
> > -	private = t->private;
> > +	rcu_read_lock_bh();
> > +	private = rcu_dereference(t->private);
> >  	if (private->number != num_counters) {
> >  		ret = -EINVAL;
> >  		goto unlock_up_free;
> > @@ -1179,7 +1179,7 @@ static int do_add_counters(struct net *n
> >  			   paddc,
> >  			   &i);
> >   unlock_up_free:
> > -	write_unlock_bh(&t->lock);
> > +	rcu_read_unlock_bh();
> >  	xt_table_unlock(t);
> >  	module_put(t->me);
> >   free:
> > --- a/net/ipv4/netfilter/ip_tables.c	2009-01-28 22:06:10.596739805 -0800
> > +++ b/net/ipv4/netfilter/ip_tables.c	2009-01-28 22:14:54.648490491 -0800
> > @@ -348,9 +348,9 @@ ipt_do_table(struct sk_buff *skb,
> >  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
> >  	tgpar.hooknum = hook;
> >  
> > -	read_lock_bh(&table->lock);
> > +	rcu_read_lock_bh();
> >  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> > -	private = table->private;
> > +	private = rcu_dereference(table->private);
> >  	table_base = (void *)private->entries[smp_processor_id()];
> >  	seq = per_cpu_ptr(private->seq, smp_processor_id());
> >  	e = get_entry(table_base, private->hook_entry[hook]);
> > @@ -449,7 +449,7 @@ ipt_do_table(struct sk_buff *skb,
> >  		}
> >  	} while (!hotdrop);
> >  
> > -	read_unlock_bh(&table->lock);
> > +	rcu_read_unlock_bh();
> >  
> >  #ifdef DEBUG_ALLOW_ALL
> >  	return NF_ACCEPT;
> > @@ -1408,8 +1408,8 @@ do_add_counters(struct net *net, void __
> >  		goto free;
> >  	}
> >  
> > -	write_lock_bh(&t->lock);
> > -	private = t->private;
> > +	rcu_read_lock_bh();
> > +	private = rcu_dereference(t->private);
> 
> I feel a little bit nervous seeing a write_lock_bh() changed to a rcu_read_lock()

Facts, it is only updating entries on current cpu

> Also, add_counter_to_entry() is not using seqcount protection, so another thread
> doing an iptables -L in parallel with this thread will possibly get corrupted counters.
add_counter_to_entry is local to current CPU.


> (With write_lock_bh(), this corruption could not occur)
> 
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/5] netfilter: convert x_tables to use RCU
  2009-01-29 23:16     ` Stephen Hemminger
@ 2009-01-30  6:53       ` Eric Dumazet
  2009-01-30  7:02         ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2009-01-30  6:53 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Stephen Hemminger a écrit :
> On Fri, 30 Jan 2009 00:04:16 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
> 
>> Stephen Hemminger a écrit :
>>> Replace existing reader/writer lock with Read-Copy-Update to
>>> eliminate the overhead of a read lock on each incoming packet.
>>> This should reduce the overhead of iptables especially on SMP
>>> systems.
>>>
>>> The previous code used a reader-writer lock for two purposes.
>>> The first was to ensure that the xt_table_info reference was not in
>>> process of being changed. Since xt_table_info is only freed via one
>>> routine, it was a direct conversion to RCU.
>>>
>>> The other use of the reader-writer lock was to block changes
>>> to counters while they were being read. This synchronization was
>>> fixed by the previous patch.  But still need to make sure table info
>>> isn't going away.
>>>
>>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>>
>>>
>>> ---
>>>  include/linux/netfilter/x_tables.h |   10 ++++++-
>>>  net/ipv4/netfilter/arp_tables.c    |   12 ++++-----
>>>  net/ipv4/netfilter/ip_tables.c     |   12 ++++-----
>>>  net/ipv6/netfilter/ip6_tables.c    |   12 ++++-----
>>>  net/netfilter/x_tables.c           |   48 ++++++++++++++++++++++++++-----------
>>>  5 files changed, 60 insertions(+), 34 deletions(-)
>>>
>>> --- a/include/linux/netfilter/x_tables.h	2009-01-28 22:04:39.316517913 -0800
>>> +++ b/include/linux/netfilter/x_tables.h	2009-01-28 22:14:54.648490491 -0800
>>> @@ -352,8 +352,8 @@ struct xt_table
>>>  	/* What hooks you will enter on */
>>>  	unsigned int valid_hooks;
>>>  
>>> -	/* Lock for the curtain */
>>> -	rwlock_t lock;
>>> +	/* Lock for curtain */
>>> +	spinlock_t lock;
>>>  
>>>  	/* Man behind the curtain... */
>>>  	struct xt_table_info *private;
>>> @@ -386,6 +386,12 @@ struct xt_table_info
>>>  	/* Secret compartment */
>>>  	seqcount_t *seq;
>>>  
>>> +	/* For the dustman... */
>>> +	union {
>>> +		struct rcu_head rcu;
>>> +		struct work_struct work;
>>> +	};
>>> +
>>>  	/* ipt_entry tables: one per CPU */
>>>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
>>>  	char *entries[1];
>>> --- a/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:13:16.423490077 -0800
>>> +++ b/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:14:54.648490491 -0800
>>> @@ -238,8 +238,8 @@ unsigned int arpt_do_table(struct sk_buf
>>>  	indev = in ? in->name : nulldevname;
>>>  	outdev = out ? out->name : nulldevname;
>>>  
>>> -	read_lock_bh(&table->lock);
>>> -	private = table->private;
>>> +	rcu_read_lock_bh();
>>> +	private = rcu_dereference(table->private);
>>>  	table_base = (void *)private->entries[smp_processor_id()];
>>>  	seq = per_cpu_ptr(private->seq, smp_processor_id());
>>>  	e = get_entry(table_base, private->hook_entry[hook]);
>>> @@ -315,7 +315,7 @@ unsigned int arpt_do_table(struct sk_buf
>>>  			e = (void *)e + e->next_offset;
>>>  		}
>>>  	} while (!hotdrop);
>>> -	read_unlock_bh(&table->lock);
>>> +	rcu_read_unlock_bh();
>>>  
>>>  	if (hotdrop)
>>>  		return NF_DROP;
>>> @@ -1163,8 +1163,8 @@ static int do_add_counters(struct net *n
>>>  		goto free;
>>>  	}
>>>  
>>> -	write_lock_bh(&t->lock);
>>> -	private = t->private;
>>> +	rcu_read_lock_bh();
>>> +	private = rcu_dereference(t->private);
>>>  	if (private->number != num_counters) {
>>>  		ret = -EINVAL;
>>>  		goto unlock_up_free;
>>> @@ -1179,7 +1179,7 @@ static int do_add_counters(struct net *n
>>>  			   paddc,
>>>  			   &i);
>>>   unlock_up_free:
>>> -	write_unlock_bh(&t->lock);
>>> +	rcu_read_unlock_bh();
>>>  	xt_table_unlock(t);
>>>  	module_put(t->me);
>>>   free:
>>> --- a/net/ipv4/netfilter/ip_tables.c	2009-01-28 22:06:10.596739805 -0800
>>> +++ b/net/ipv4/netfilter/ip_tables.c	2009-01-28 22:14:54.648490491 -0800
>>> @@ -348,9 +348,9 @@ ipt_do_table(struct sk_buff *skb,
>>>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
>>>  	tgpar.hooknum = hook;
>>>  
>>> -	read_lock_bh(&table->lock);
>>> +	rcu_read_lock_bh();
>>>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
>>> -	private = table->private;
>>> +	private = rcu_dereference(table->private);
>>>  	table_base = (void *)private->entries[smp_processor_id()];
>>>  	seq = per_cpu_ptr(private->seq, smp_processor_id());
>>>  	e = get_entry(table_base, private->hook_entry[hook]);
>>> @@ -449,7 +449,7 @@ ipt_do_table(struct sk_buff *skb,
>>>  		}
>>>  	} while (!hotdrop);
>>>  
>>> -	read_unlock_bh(&table->lock);
>>> +	rcu_read_unlock_bh();
>>>  
>>>  #ifdef DEBUG_ALLOW_ALL
>>>  	return NF_ACCEPT;
>>> @@ -1408,8 +1408,8 @@ do_add_counters(struct net *net, void __
>>>  		goto free;
>>>  	}
>>>  
>>> -	write_lock_bh(&t->lock);
>>> -	private = t->private;
>>> +	rcu_read_lock_bh();
>>> +	private = rcu_dereference(t->private);
>> I feel a little bit nervous seeing a write_lock_bh() changed to a rcu_read_lock()
> 
> Facts, it is only updating entries on current cpu

Yes, like done in ipt_do_table() ;)

Fact is we need to tell other threads, running on other cpus, that an update
 of our entries is running.

Let me check if your v4 and xt_counters abstraction already solved this problem.

> 
>> Also, add_counter_to_entry() is not using seqcount protection, so another thread
>> doing an iptables -L in parallel with this thread will possibly get corrupted counters.
> add_counter_to_entry is local to current CPU.
> 
> 
>> (With write_lock_bh(), this corruption could not occur)
>>
>>
> --


^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/5] netfilter: convert x_tables to use RCU
  2009-01-30  6:53       ` Eric Dumazet
@ 2009-01-30  7:02         ` Eric Dumazet
  2009-01-30  7:05           ` Eric Dumazet
  0 siblings, 1 reply; 13+ messages in thread
From: Eric Dumazet @ 2009-01-30  7:02 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Eric Dumazet a écrit :
> Stephen Hemminger a écrit :
>> On Fri, 30 Jan 2009 00:04:16 +0100
>> Eric Dumazet <dada1@cosmosbay.com> wrote:
>>
>>> Stephen Hemminger a écrit :
>>>> Replace existing reader/writer lock with Read-Copy-Update to
>>>> eliminate the overhead of a read lock on each incoming packet.
>>>> This should reduce the overhead of iptables especially on SMP
>>>> systems.
>>>>
>>>> The previous code used a reader-writer lock for two purposes.
>>>> The first was to ensure that the xt_table_info reference was not in
>>>> process of being changed. Since xt_table_info is only freed via one
>>>> routine, it was a direct conversion to RCU.
>>>>
>>>> The other use of the reader-writer lock was to block changes
>>>> to counters while they were being read. This synchronization was
>>>> fixed by the previous patch.  But still need to make sure table info
>>>> isn't going away.
>>>>
>>>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>>>
>>>>
>>>> ---
>>>>  include/linux/netfilter/x_tables.h |   10 ++++++-
>>>>  net/ipv4/netfilter/arp_tables.c    |   12 ++++-----
>>>>  net/ipv4/netfilter/ip_tables.c     |   12 ++++-----
>>>>  net/ipv6/netfilter/ip6_tables.c    |   12 ++++-----
>>>>  net/netfilter/x_tables.c           |   48 ++++++++++++++++++++++++++-----------
>>>>  5 files changed, 60 insertions(+), 34 deletions(-)
>>>>
>>>> --- a/include/linux/netfilter/x_tables.h	2009-01-28 22:04:39.316517913 -0800
>>>> +++ b/include/linux/netfilter/x_tables.h	2009-01-28 22:14:54.648490491 -0800
>>>> @@ -352,8 +352,8 @@ struct xt_table
>>>>  	/* What hooks you will enter on */
>>>>  	unsigned int valid_hooks;
>>>>  
>>>> -	/* Lock for the curtain */
>>>> -	rwlock_t lock;
>>>> +	/* Lock for curtain */
>>>> +	spinlock_t lock;
>>>>  
>>>>  	/* Man behind the curtain... */
>>>>  	struct xt_table_info *private;
>>>> @@ -386,6 +386,12 @@ struct xt_table_info
>>>>  	/* Secret compartment */
>>>>  	seqcount_t *seq;
>>>>  
>>>> +	/* For the dustman... */
>>>> +	union {
>>>> +		struct rcu_head rcu;
>>>> +		struct work_struct work;
>>>> +	};
>>>> +
>>>>  	/* ipt_entry tables: one per CPU */
>>>>  	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
>>>>  	char *entries[1];
>>>> --- a/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:13:16.423490077 -0800
>>>> +++ b/net/ipv4/netfilter/arp_tables.c	2009-01-28 22:14:54.648490491 -0800
>>>> @@ -238,8 +238,8 @@ unsigned int arpt_do_table(struct sk_buf
>>>>  	indev = in ? in->name : nulldevname;
>>>>  	outdev = out ? out->name : nulldevname;
>>>>  
>>>> -	read_lock_bh(&table->lock);
>>>> -	private = table->private;
>>>> +	rcu_read_lock_bh();
>>>> +	private = rcu_dereference(table->private);
>>>>  	table_base = (void *)private->entries[smp_processor_id()];
>>>>  	seq = per_cpu_ptr(private->seq, smp_processor_id());
>>>>  	e = get_entry(table_base, private->hook_entry[hook]);
>>>> @@ -315,7 +315,7 @@ unsigned int arpt_do_table(struct sk_buf
>>>>  			e = (void *)e + e->next_offset;
>>>>  		}
>>>>  	} while (!hotdrop);
>>>> -	read_unlock_bh(&table->lock);
>>>> +	rcu_read_unlock_bh();
>>>>  
>>>>  	if (hotdrop)
>>>>  		return NF_DROP;
>>>> @@ -1163,8 +1163,8 @@ static int do_add_counters(struct net *n
>>>>  		goto free;
>>>>  	}
>>>>  
>>>> -	write_lock_bh(&t->lock);
>>>> -	private = t->private;
>>>> +	rcu_read_lock_bh();
>>>> +	private = rcu_dereference(t->private);
>>>>  	if (private->number != num_counters) {
>>>>  		ret = -EINVAL;
>>>>  		goto unlock_up_free;
>>>> @@ -1179,7 +1179,7 @@ static int do_add_counters(struct net *n
>>>>  			   paddc,
>>>>  			   &i);
>>>>   unlock_up_free:
>>>> -	write_unlock_bh(&t->lock);
>>>> +	rcu_read_unlock_bh();
>>>>  	xt_table_unlock(t);
>>>>  	module_put(t->me);
>>>>   free:
>>>> --- a/net/ipv4/netfilter/ip_tables.c	2009-01-28 22:06:10.596739805 -0800
>>>> +++ b/net/ipv4/netfilter/ip_tables.c	2009-01-28 22:14:54.648490491 -0800
>>>> @@ -348,9 +348,9 @@ ipt_do_table(struct sk_buff *skb,
>>>>  	mtpar.family  = tgpar.family = NFPROTO_IPV4;
>>>>  	tgpar.hooknum = hook;
>>>>  
>>>> -	read_lock_bh(&table->lock);
>>>> +	rcu_read_lock_bh();
>>>>  	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
>>>> -	private = table->private;
>>>> +	private = rcu_dereference(table->private);
>>>>  	table_base = (void *)private->entries[smp_processor_id()];
>>>>  	seq = per_cpu_ptr(private->seq, smp_processor_id());
>>>>  	e = get_entry(table_base, private->hook_entry[hook]);
>>>> @@ -449,7 +449,7 @@ ipt_do_table(struct sk_buff *skb,
>>>>  		}
>>>>  	} while (!hotdrop);
>>>>  
>>>> -	read_unlock_bh(&table->lock);
>>>> +	rcu_read_unlock_bh();
>>>>  
>>>>  #ifdef DEBUG_ALLOW_ALL
>>>>  	return NF_ACCEPT;
>>>> @@ -1408,8 +1408,8 @@ do_add_counters(struct net *net, void __
>>>>  		goto free;
>>>>  	}
>>>>  
>>>> -	write_lock_bh(&t->lock);
>>>> -	private = t->private;
>>>> +	rcu_read_lock_bh();
>>>> +	private = rcu_dereference(t->private);
>>> I feel a little bit nervous seeing a write_lock_bh() changed to a rcu_read_lock()
>> Facts, it is only updating entries on current cpu
> 
> Yes, like done in ipt_do_table() ;)
> 
> Fact is we need to tell other threads, running on other cpus, that an update
>  of our entries is running.
> 
> Let me check if your v4 and xt_counters abstraction already solved this problem.

Hum, I just checked and indeed there is a problem...

#define SUM_COUNTER(s,c)  do { (s).bcnt += (c).bcnt; (s).pcnt += (c).pcnt; } while(0)

need to be changed to use 

#define SUM_COUNTER(s, c)  do { xt_incr_counter(s, (c).cnt, (c).pcnt);} while (0)




^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH 5/5] netfilter: convert x_tables to use RCU
  2009-01-30  7:02         ` Eric Dumazet
@ 2009-01-30  7:05           ` Eric Dumazet
  0 siblings, 0 replies; 13+ messages in thread
From: Eric Dumazet @ 2009-01-30  7:05 UTC (permalink / raw)
  To: Stephen Hemminger; +Cc: David Miller, netdev

Eric Dumazet a écrit :
> 
> Hum, I just checked and indeed there is a problem...
> 
> #define SUM_COUNTER(s,c)  do { (s).bcnt += (c).bcnt; (s).pcnt += (c).pcnt; } while(0)
> 
> need to be changed to use 
> 
> #define SUM_COUNTER(s, c)  do { xt_incr_counter(s, (c).cnt, (c).pcnt);} while (0)
> 

Oops

#define SUM_COUNTER(s, c)  xt_incr_counter(s, (c).bcnt, (c).pcnt)


^ permalink raw reply	[flat|nested] 13+ messages in thread

end of thread, other threads:[~2009-01-30  7:05 UTC | newest]

Thread overview: 13+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2009-01-29  6:25 [PATCH 0/5] iptables lockless receive (v0.3) Stephen Hemminger
2009-01-29  6:25 ` [PATCH 1/5] netfilter: change elements in x_tables Stephen Hemminger
2009-01-29  6:25 ` [PATCH 2/5] netfilter: remove unneeded initializations Stephen Hemminger
2009-01-29  6:25 ` [PATCH 3/5] ebtables: " Stephen Hemminger
2009-01-29  6:25 ` [PATCH 4/5] netfilter: use sequence number synchronization for counters Stephen Hemminger
2009-01-29  8:47   ` Eric Dumazet
2009-01-29  6:25 ` [PATCH 5/5] netfilter: convert x_tables to use RCU Stephen Hemminger
2009-01-29 23:04   ` Eric Dumazet
2009-01-29 23:16     ` Stephen Hemminger
2009-01-30  6:53       ` Eric Dumazet
2009-01-30  7:02         ` Eric Dumazet
2009-01-30  7:05           ` Eric Dumazet
2009-01-29  8:07 ` [PATCH 0/5] iptables lockless receive (v0.3) Eric Dumazet

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).