* [PATCH 1/6] netfilter: change elements in x_tables
2009-01-30 21:57 [PATCH 0/6] iptables: eliminate read/write lock (v0.4) Stephen Hemminger
@ 2009-01-30 21:57 ` Stephen Hemminger
2009-01-30 21:57 ` [PATCH 2/6] netfilter: remove unneeded initializations Stephen Hemminger
` (4 subsequent siblings)
5 siblings, 0 replies; 21+ messages in thread
From: Stephen Hemminger @ 2009-01-30 21:57 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: x_tables.patch --]
[-- Type: text/plain, Size: 1062 bytes --]
Change the private pointer to its proper type rather than an anonymous void pointer,
and keep the actively used elements on the same cache line.
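For reference, the resulting layout, pieced together from the hunks below, is roughly as
follows; the fields touched per packet (valid_hooks, lock, private) end up adjacent, while
the rarely read name moves to the end:

struct xt_table
{
	struct list_head list;

	/* What hooks you will enter on */
	unsigned int valid_hooks;

	/* Lock for the curtain */
	rwlock_t lock;

	/* Man behind the curtain... */
	struct xt_table_info *private;

	/* Set this to THIS_MODULE if you are a module, otherwise NULL */
	struct module *me;

	u_int8_t af;		/* address/protocol family */

	/* A unique name... */
	const char name[XT_TABLE_MAXNAMELEN];
};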
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
include/linux/netfilter/x_tables.h | 9 ++++-----
1 file changed, 4 insertions(+), 5 deletions(-)
--- a/include/linux/netfilter/x_tables.h 2009-01-26 17:24:43.251543415 -0800
+++ b/include/linux/netfilter/x_tables.h 2009-01-26 17:29:12.510649107 -0800
@@ -349,9 +349,6 @@ struct xt_table
{
struct list_head list;
- /* A unique name... */
- const char name[XT_TABLE_MAXNAMELEN];
-
/* What hooks you will enter on */
unsigned int valid_hooks;
@@ -359,13 +356,15 @@ struct xt_table
rwlock_t lock;
/* Man behind the curtain... */
- //struct ip6t_table_info *private;
- void *private;
+ struct xt_table_info *private;
/* Set this to THIS_MODULE if you are a module, otherwise NULL */
struct module *me;
u_int8_t af; /* address/protocol family */
+
+ /* A unique name... */
+ const char name[XT_TABLE_MAXNAMELEN];
};
#include <linux/netfilter_ipv4.h>
--
* [PATCH 2/6] netfilter: remove unneeded initializations
2009-01-30 21:57 [PATCH 0/6] iptables: eliminate read/write lock (v0.4) Stephen Hemminger
2009-01-30 21:57 ` [PATCH 1/6] netfilter: change elements in x_tables Stephen Hemminger
@ 2009-01-30 21:57 ` Stephen Hemminger
2009-01-30 21:57 ` [PATCH 3/6] ebtables: " Stephen Hemminger
` (3 subsequent siblings)
5 siblings, 0 replies; 21+ messages in thread
From: Stephen Hemminger @ 2009-01-30 21:57 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: iptables-lock-init.patch --]
[-- Type: text/plain, Size: 4593 bytes --]
Later patches change the locking on xt_table; the initialization of the lock
element is not needed here since the lock is always initialized in
xt_register_table() anyway.
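For reference, that registration-time initialization in net/netfilter/x_tables.c looks
roughly like this (a sketch; the signature and surrounding code are abridged):

struct xt_table *xt_register_table(...)
{
	...
	/* Simplifies replace_table code. */
	table->private = bootstrap;
	rwlock_init(&table->lock);
	...
}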
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
net/ipv4/netfilter/arptable_filter.c | 2 --
net/ipv4/netfilter/iptable_filter.c | 1 -
net/ipv4/netfilter/iptable_mangle.c | 1 -
net/ipv4/netfilter/iptable_raw.c | 1 -
net/ipv4/netfilter/iptable_security.c | 1 -
net/ipv4/netfilter/nf_nat_rule.c | 1 -
net/ipv6/netfilter/ip6table_filter.c | 1 -
net/ipv6/netfilter/ip6table_mangle.c | 1 -
net/ipv6/netfilter/ip6table_raw.c | 1 -
net/ipv6/netfilter/ip6table_security.c | 1 -
10 files changed, 11 deletions(-)
--- a/net/ipv4/netfilter/arptable_filter.c 2009-01-26 17:24:43.687542005 -0800
+++ b/net/ipv4/netfilter/arptable_filter.c 2009-01-26 19:50:37.891042244 -0800
@@ -48,8 +48,6 @@ static struct
static struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_filter.lock),
- .private = NULL,
.me = THIS_MODULE,
.af = NFPROTO_ARP,
};
--- a/net/ipv4/netfilter/iptable_filter.c 2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_filter.c 2009-01-26 19:50:37.891042244 -0800
@@ -56,7 +56,6 @@ static struct
static struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_filter.lock),
.me = THIS_MODULE,
.af = AF_INET,
};
--- a/net/ipv4/netfilter/iptable_raw.c 2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_raw.c 2009-01-26 19:50:37.891042244 -0800
@@ -39,7 +39,6 @@ static struct
static struct xt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_raw.lock),
.me = THIS_MODULE,
.af = AF_INET,
};
--- a/net/ipv4/netfilter/iptable_security.c 2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_security.c 2009-01-26 19:50:37.891042244 -0800
@@ -60,7 +60,6 @@ static struct
static struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(security_table.lock),
.me = THIS_MODULE,
.af = AF_INET,
};
--- a/net/ipv4/netfilter/nf_nat_rule.c 2009-01-26 17:24:43.695541481 -0800
+++ b/net/ipv4/netfilter/nf_nat_rule.c 2009-01-26 19:51:20.338030618 -0800
@@ -61,7 +61,6 @@ static struct
static struct xt_table nat_table = {
.name = "nat",
.valid_hooks = NAT_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(nat_table.lock),
.me = THIS_MODULE,
.af = AF_INET,
};
--- a/net/ipv6/netfilter/ip6table_filter.c 2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_filter.c 2009-01-26 19:50:37.895044361 -0800
@@ -54,7 +54,6 @@ static struct
static struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_filter.lock),
.me = THIS_MODULE,
.af = AF_INET6,
};
--- a/net/ipv6/netfilter/ip6table_mangle.c 2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_mangle.c 2009-01-26 19:50:37.895044361 -0800
@@ -60,7 +60,6 @@ static struct
static struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_mangler.lock),
.me = THIS_MODULE,
.af = AF_INET6,
};
--- a/net/ipv6/netfilter/ip6table_raw.c 2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_raw.c 2009-01-26 19:50:37.895044361 -0800
@@ -38,7 +38,6 @@ static struct
static struct xt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_raw.lock),
.me = THIS_MODULE,
.af = AF_INET6,
};
--- a/net/ipv6/netfilter/ip6table_security.c 2009-01-26 17:24:43.735541493 -0800
+++ b/net/ipv6/netfilter/ip6table_security.c 2009-01-26 19:50:37.895044361 -0800
@@ -59,7 +59,6 @@ static struct
static struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(security_table.lock),
.me = THIS_MODULE,
.af = AF_INET6,
};
--- a/net/ipv4/netfilter/iptable_mangle.c 2009-01-26 17:24:43.691541994 -0800
+++ b/net/ipv4/netfilter/iptable_mangle.c 2009-01-26 19:50:37.895044361 -0800
@@ -67,7 +67,6 @@ static struct
static struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(packet_mangler.lock),
.me = THIS_MODULE,
.af = AF_INET,
};
--
* [PATCH 3/6] ebtables: remove unneeded initializations
2009-01-30 21:57 [PATCH 0/6] iptables: eliminate read/write lock (v0.4) Stephen Hemminger
2009-01-30 21:57 ` [PATCH 1/6] netfilter: change elements in x_tables Stephen Hemminger
2009-01-30 21:57 ` [PATCH 2/6] netfilter: remove unneeded initializations Stephen Hemminger
@ 2009-01-30 21:57 ` Stephen Hemminger
2009-01-30 21:57 ` [PATCH 4/6] netfilter: abstract xt_counters Stephen Hemminger
` (2 subsequent siblings)
5 siblings, 0 replies; 21+ messages in thread
From: Stephen Hemminger @ 2009-01-30 21:57 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: ebtables-lock-init.patch --]
[-- Type: text/plain, Size: 1537 bytes --]
The initialization of the lock element is not needed
since the lock is always initialized in ebt_register_table.
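For reference, the registration-time initialization in net/bridge/netfilter/ebtables.c is
along these lines (a one-line sketch; the exact surrounding code is assumed):

	rwlock_init(&table->lock);	/* in ebt_register_table() */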
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
net/bridge/netfilter/ebtable_broute.c | 1 -
net/bridge/netfilter/ebtable_filter.c | 1 -
net/bridge/netfilter/ebtable_nat.c | 1 -
3 files changed, 3 deletions(-)
--- a/net/bridge/netfilter/ebtable_broute.c 2009-01-27 17:09:10.313100854 -0800
+++ b/net/bridge/netfilter/ebtable_broute.c 2009-01-27 17:09:15.862142852 -0800
@@ -46,7 +46,6 @@ static struct ebt_table broute_table =
.name = "broute",
.table = &initial_table,
.valid_hooks = 1 << NF_BR_BROUTING,
- .lock = __RW_LOCK_UNLOCKED(broute_table.lock),
.check = check,
.me = THIS_MODULE,
};
--- a/net/bridge/netfilter/ebtable_filter.c 2009-01-27 17:08:50.725100955 -0800
+++ b/net/bridge/netfilter/ebtable_filter.c 2009-01-27 17:08:53.828611768 -0800
@@ -55,7 +55,6 @@ static struct ebt_table frame_filter =
.name = "filter",
.table = &initial_table,
.valid_hooks = FILTER_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(frame_filter.lock),
.check = check,
.me = THIS_MODULE,
};
--- a/net/bridge/netfilter/ebtable_nat.c 2009-01-27 17:09:22.896602465 -0800
+++ b/net/bridge/netfilter/ebtable_nat.c 2009-01-27 17:09:31.589085328 -0800
@@ -55,7 +55,6 @@ static struct ebt_table frame_nat =
.name = "nat",
.table = &initial_table,
.valid_hooks = NAT_VALID_HOOKS,
- .lock = __RW_LOCK_UNLOCKED(frame_nat.lock),
.check = check,
.me = THIS_MODULE,
};
--
* [PATCH 4/6] netfilter: abstract xt_counters
2009-01-30 21:57 [PATCH 0/6] iptables: eliminate read/write lock (v0.4) Stephen Hemminger
` (2 preceding siblings ...)
2009-01-30 21:57 ` [PATCH 3/6] ebtables: " Stephen Hemminger
@ 2009-01-30 21:57 ` Stephen Hemminger
2009-02-01 12:25 ` Eric Dumazet
2009-01-30 21:57 ` [PATCH 5/6] netfilter: use sequence number synchronization for counters Stephen Hemminger
2009-01-30 21:57 ` [PATCH 6/6] netfilter: convert x_tables to use RCU Stephen Hemminger
5 siblings, 1 reply; 21+ messages in thread
From: Stephen Hemminger @ 2009-01-30 21:57 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: xtables-counter.patch --]
[-- Type: text/plain, Size: 6013 bytes --]
Break out the parts of the x_tables code that manipulate counters so that
later changes to the locking are easier.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
include/linux/netfilter/x_tables.h | 6 +++++-
net/ipv4/netfilter/arp_tables.c | 9 +++++----
net/ipv4/netfilter/ip_tables.c | 9 +++++----
net/ipv6/netfilter/ip6_tables.c | 21 +++++++++++----------
net/netfilter/x_tables.c | 15 +++++++++++++++
5 files changed, 41 insertions(+), 19 deletions(-)
--- a/include/linux/netfilter/x_tables.h 2009-01-30 08:31:48.630454493 -0800
+++ b/include/linux/netfilter/x_tables.h 2009-01-30 09:14:01.294680339 -0800
@@ -105,13 +105,17 @@ struct _xt_align
#define XT_ERROR_TARGET "ERROR"
#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
-#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
struct xt_counters
{
u_int64_t pcnt, bcnt; /* Packet and byte counters */
};
+extern void xt_add_counter(struct xt_counters *c, unsigned b, unsigned p);
+extern void xt_sum_counter(struct xt_counters *t,
+ int cpu, const struct xt_counters *c);
+
+
/* The argument to IPT_SO_ADD_COUNTERS. */
struct xt_counters_info
{
--- a/net/ipv4/netfilter/arp_tables.c 2009-01-30 08:31:48.569479503 -0800
+++ b/net/ipv4/netfilter/arp_tables.c 2009-01-30 09:12:40.181542286 -0800
@@ -256,7 +256,7 @@ unsigned int arpt_do_table(struct sk_buf
hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
(2 * skb->dev->addr_len);
- ADD_COUNTER(e->counters, hdr_len, 1);
+ xt_add_counter(&e->counters, hdr_len, 1);
t = arpt_get_target(e);
@@ -662,10 +662,11 @@ static int translate_table(const char *n
/* Gets counters. */
static inline int add_entry_to_counter(const struct arpt_entry *e,
+ int cpu,
struct xt_counters total[],
unsigned int *i)
{
- ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+ xt_sum_counter(total, cpu, &e->counters);
(*i)++;
return 0;
@@ -709,6 +710,7 @@ static void get_counters(const struct xt
ARPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
+ cpu,
counters,
&i);
}
@@ -1082,8 +1084,7 @@ static inline int add_counter_to_entry(s
const struct xt_counters addme[],
unsigned int *i)
{
-
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+ xt_add_counter(&e->counters, addme[*i].bcnt, addme[*i].pcnt);
(*i)++;
return 0;
--- a/net/ipv4/netfilter/ip_tables.c 2009-01-30 08:31:48.538730580 -0800
+++ b/net/ipv4/netfilter/ip_tables.c 2009-01-30 09:12:40.169542168 -0800
@@ -366,7 +366,7 @@ ipt_do_table(struct sk_buff *skb,
if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
goto no_match;
- ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
+ xt_add_counter(&e->counters, ntohs(ip->tot_len), 1);
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -872,10 +872,11 @@ translate_table(const char *name,
/* Gets counters. */
static inline int
add_entry_to_counter(const struct ipt_entry *e,
+ int cpu,
struct xt_counters total[],
unsigned int *i)
{
- ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+ xt_sum_counter(total, cpu, &e->counters);
(*i)++;
return 0;
@@ -921,6 +922,7 @@ get_counters(const struct xt_table_info
IPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
+ cpu,
counters,
&i);
}
@@ -1327,8 +1329,7 @@ add_counter_to_entry(struct ipt_entry *e
(long unsigned int)addme[*i].pcnt,
(long unsigned int)addme[*i].bcnt);
#endif
-
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+ xt_add_counter(&e->counters, addme[*i].bcnt, addme[*i].pcnt);
(*i)++;
return 0;
--- a/net/ipv6/netfilter/ip6_tables.c 2009-01-30 08:31:48.605479850 -0800
+++ b/net/ipv6/netfilter/ip6_tables.c 2009-01-30 09:12:40.205542065 -0800
@@ -392,9 +392,9 @@ ip6t_do_table(struct sk_buff *skb,
if (IP6T_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
goto no_match;
- ADD_COUNTER(e->counters,
- ntohs(ipv6_hdr(skb)->payload_len) +
- sizeof(struct ipv6hdr), 1);
+ xt_add_counter(&e->counters,
+ ntohs(ipv6_hdr(skb)->payload_len) +
+ sizeof(struct ipv6hdr), 1);
t = ip6t_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -901,10 +901,11 @@ translate_table(const char *name,
/* Gets counters. */
static inline int
add_entry_to_counter(const struct ip6t_entry *e,
+ int cpu,
struct xt_counters total[],
unsigned int *i)
{
- ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+ xt_sum_counter(total, cpu, &e->counters);
(*i)++;
return 0;
@@ -948,10 +949,11 @@ get_counters(const struct xt_table_info
continue;
i = 0;
IP6T_ENTRY_ITERATE(t->entries[cpu],
- t->size,
- add_entry_to_counter,
- counters,
- &i);
+ t->size,
+ add_entry_to_counter,
+ cpu,
+ counters,
+ &i);
}
}
@@ -1357,8 +1359,7 @@ add_counter_to_entry(struct ip6t_entry *
(long unsigned int)addme[*i].pcnt,
(long unsigned int)addme[*i].bcnt);
#endif
-
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+ xt_add_counter(&e->counters, addme[*i].bcnt, addme[*i].pcnt);
(*i)++;
return 0;
--- a/net/netfilter/x_tables.c 2009-01-30 08:38:32.949293300 -0800
+++ b/net/netfilter/x_tables.c 2009-01-30 09:13:27.636792850 -0800
@@ -577,6 +577,21 @@ int xt_compat_target_to_user(struct xt_e
EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
#endif
+void xt_add_counter(struct xt_counters *c, unsigned b, unsigned p)
+{
+ c->bcnt += b;
+ c->pcnt += p;
+}
+EXPORT_SYMBOL_GPL(xt_add_counter);
+
+void xt_sum_counter(struct xt_counters *t, int cpu,
+ const struct xt_counters *c)
+{
+ t->pcnt += c->pcnt;
+ t->bcnt += c->bcnt;
+}
+EXPORT_SYMBOL_GPL(xt_sum_counter);
+
struct xt_table_info *xt_alloc_table_info(unsigned int size)
{
struct xt_table_info *newinfo;
--
* Re: [PATCH 4/6] netfilter: abstract xt_counters
2009-01-30 21:57 ` [PATCH 4/6] netfilter: abstract xt_counters Stephen Hemminger
@ 2009-02-01 12:25 ` Eric Dumazet
2009-02-02 23:33 ` [PATCH 3/3] iptables: lock free counters (alternate version) Stephen Hemminger
0 siblings, 1 reply; 21+ messages in thread
From: Eric Dumazet @ 2009-02-01 12:25 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Stephen Hemminger wrote:
> Break out the parts of the x_tables code that manipulates counters so
> changes to locking are easier.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
>
> ---
> include/linux/netfilter/x_tables.h | 6 +++++-
> net/ipv4/netfilter/arp_tables.c | 9 +++++----
> net/ipv4/netfilter/ip_tables.c | 9 +++++----
> net/ipv6/netfilter/ip6_tables.c | 21 +++++++++++----------
> net/netfilter/x_tables.c | 15 +++++++++++++++
> 5 files changed, 41 insertions(+), 19 deletions(-)
>
> --- a/include/linux/netfilter/x_tables.h 2009-01-30 08:31:48.630454493 -0800
> +++ b/include/linux/netfilter/x_tables.h 2009-01-30 09:14:01.294680339 -0800
> @@ -105,13 +105,17 @@ struct _xt_align
> #define XT_ERROR_TARGET "ERROR"
>
> #define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
> -#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
>
> struct xt_counters
> {
> u_int64_t pcnt, bcnt; /* Packet and byte counters */
> };
>
> +extern void xt_add_counter(struct xt_counters *c, unsigned b, unsigned p);
> +extern void xt_sum_counter(struct xt_counters *t,
> + int cpu, const struct xt_counters *c);
> +
> +
> /* The argument to IPT_SO_ADD_COUNTERS. */
> struct xt_counters_info
> {
> --- a/net/ipv4/netfilter/arp_tables.c 2009-01-30 08:31:48.569479503 -0800
> +++ b/net/ipv4/netfilter/arp_tables.c 2009-01-30 09:12:40.181542286 -0800
> @@ -256,7 +256,7 @@ unsigned int arpt_do_table(struct sk_buf
>
> hdr_len = sizeof(*arp) + (2 * sizeof(struct in_addr)) +
> (2 * skb->dev->addr_len);
> - ADD_COUNTER(e->counters, hdr_len, 1);
> + xt_add_counter(&e->counters, hdr_len, 1);
>
> t = arpt_get_target(e);
>
> @@ -662,10 +662,11 @@ static int translate_table(const char *n
>
> /* Gets counters. */
> static inline int add_entry_to_counter(const struct arpt_entry *e,
> + int cpu,
> struct xt_counters total[],
> unsigned int *i)
> {
> - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
> + xt_sum_counter(total, cpu, &e->counters);
>
> (*i)++;
> return 0;
> @@ -709,6 +710,7 @@ static void get_counters(const struct xt
> ARPT_ENTRY_ITERATE(t->entries[cpu],
> t->size,
> add_entry_to_counter,
> + cpu,
> counters,
> &i);
> }
> @@ -1082,8 +1084,7 @@ static inline int add_counter_to_entry(s
> const struct xt_counters addme[],
> unsigned int *i)
> {
> -
> - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> + xt_add_counter(&e->counters, addme[*i].bcnt, addme[*i].pcnt);
>
> (*i)++;
> return 0;
> --- a/net/ipv4/netfilter/ip_tables.c 2009-01-30 08:31:48.538730580 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c 2009-01-30 09:12:40.169542168 -0800
> @@ -366,7 +366,7 @@ ipt_do_table(struct sk_buff *skb,
> if (IPT_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
> goto no_match;
>
> - ADD_COUNTER(e->counters, ntohs(ip->tot_len), 1);
> + xt_add_counter(&e->counters, ntohs(ip->tot_len), 1);
>
> t = ipt_get_target(e);
> IP_NF_ASSERT(t->u.kernel.target);
> @@ -872,10 +872,11 @@ translate_table(const char *name,
> /* Gets counters. */
> static inline int
> add_entry_to_counter(const struct ipt_entry *e,
> + int cpu,
> struct xt_counters total[],
> unsigned int *i)
> {
> - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
> + xt_sum_counter(total, cpu, &e->counters);
>
> (*i)++;
> return 0;
> @@ -921,6 +922,7 @@ get_counters(const struct xt_table_info
> IPT_ENTRY_ITERATE(t->entries[cpu],
> t->size,
> add_entry_to_counter,
> + cpu,
> counters,
> &i);
> }
> @@ -1327,8 +1329,7 @@ add_counter_to_entry(struct ipt_entry *e
> (long unsigned int)addme[*i].pcnt,
> (long unsigned int)addme[*i].bcnt);
> #endif
> -
> - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> + xt_add_counter(&e->counters, addme[*i].bcnt, addme[*i].pcnt);
>
> (*i)++;
> return 0;
> --- a/net/ipv6/netfilter/ip6_tables.c 2009-01-30 08:31:48.605479850 -0800
> +++ b/net/ipv6/netfilter/ip6_tables.c 2009-01-30 09:12:40.205542065 -0800
> @@ -392,9 +392,9 @@ ip6t_do_table(struct sk_buff *skb,
> if (IP6T_MATCH_ITERATE(e, do_match, skb, &mtpar) != 0)
> goto no_match;
>
> - ADD_COUNTER(e->counters,
> - ntohs(ipv6_hdr(skb)->payload_len) +
> - sizeof(struct ipv6hdr), 1);
> + xt_add_counter(&e->counters,
> + ntohs(ipv6_hdr(skb)->payload_len) +
> + sizeof(struct ipv6hdr), 1);
>
> t = ip6t_get_target(e);
> IP_NF_ASSERT(t->u.kernel.target);
> @@ -901,10 +901,11 @@ translate_table(const char *name,
> /* Gets counters. */
> static inline int
> add_entry_to_counter(const struct ip6t_entry *e,
> + int cpu,
> struct xt_counters total[],
> unsigned int *i)
> {
> - ADD_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
> + xt_sum_counter(total, cpu, &e->counters);
>
> (*i)++;
> return 0;
> @@ -948,10 +949,11 @@ get_counters(const struct xt_table_info
> continue;
> i = 0;
> IP6T_ENTRY_ITERATE(t->entries[cpu],
> - t->size,
> - add_entry_to_counter,
> - counters,
> - &i);
> + t->size,
> + add_entry_to_counter,
> + cpu,
> + counters,
> + &i);
> }
> }
>
> @@ -1357,8 +1359,7 @@ add_counter_to_entry(struct ip6t_entry *
> (long unsigned int)addme[*i].pcnt,
> (long unsigned int)addme[*i].bcnt);
> #endif
> -
> - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
> + xt_add_counter(&e->counters, addme[*i].bcnt, addme[*i].pcnt);
>
> (*i)++;
> return 0;
> --- a/net/netfilter/x_tables.c 2009-01-30 08:38:32.949293300 -0800
> +++ b/net/netfilter/x_tables.c 2009-01-30 09:13:27.636792850 -0800
> @@ -577,6 +577,21 @@ int xt_compat_target_to_user(struct xt_e
> EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
> #endif
>
> +void xt_add_counter(struct xt_counters *c, unsigned b, unsigned p)
> +{
> + c->bcnt += b;
> + c->pcnt += p;
> +}
> +EXPORT_SYMBOL_GPL(xt_add_counter);
> +
> +void xt_sum_counter(struct xt_counters *t, int cpu,
> + const struct xt_counters *c)
> +{
> + t->pcnt += c->pcnt;
> + t->bcnt += c->bcnt;
> +}
> +EXPORT_SYMBOL_GPL(xt_sum_counter);
> +
> struct xt_table_info *xt_alloc_table_info(unsigned int size)
> {
> struct xt_table_info *newinfo;
>
First I wondered if adding an out-of-line xt_add_counter() could slow firewalls,
so I did some testing with tbench and a small iptables setup (about 16 matched rules per packet)
CPU: Core 2, speed 3000.11 MHz (estimated)
Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 100000
samples % symbol name
3166081 12.8996 ipt_do_table
2471426 10.0694 copy_from_user
1016717 4.1424 copy_to_user
902072 3.6753 schedule
687266 2.8001 xt_add_counter
589899 2.4034 tcp_sendmsg
579619 2.3615 tcp_ack
455883 1.8574 tcp_transmit_skb
c044d110 <xt_add_counter>: /* xt_add_counter total: 687266 2.8001 */
80675 0.3287 :c044d110: push %ebp
15574 0.0635 :c044d111: mov %esp,%ebp
2 8.1e-06 :c044d113: sub $0xc,%esp
39583 0.1613 :c044d116: mov %ebx,(%esp)
4387 0.0179 :c044d119: mov %edi,0x8(%esp)
1187 0.0048 :c044d11d: mov %esi,0x4(%esp)
1601 0.0065 :c044d121: mov $0xc068ae4c,%edi
38881 0.1584 :c044d126: mov %fs:0xc0688540,%ebx
5585 0.0228 :c044d12d: add %ebx,%edi
3910 0.0159 :c044d12f: incl (%edi)
133482 0.5438 :c044d131: xor %esi,%esi
32 1.3e-04 :c044d133: add %edx,0x8(%eax)
71181 0.2900 :c044d136: mov %ecx,%edx
7986 0.0325 :c044d138: adc %esi,0xc(%eax)
88695 0.3614 :c044d13b: xor %ecx,%ecx
15 6.1e-05 :c044d13d: add %edx,(%eax)
41496 0.1691 :c044d13f: adc %ecx,0x4(%eax)
52944 0.2157 :c044d142: incl (%edi)
30759 0.1253 :c044d144: mov (%esp),%ebx
20241 0.0825 :c044d147: mov 0x4(%esp),%esi
5662 0.0231 :c044d14b: mov 0x8(%esp),%edi
2288 0.0093 :c044d14f: leave
41100 0.1675 :c044d150: ret
tbench 8 results here, after all your patches applied :
Throughput 2331 MB/sec 8 procs
And if inlined :
Throughput 2359.06 MB/sec 8 procs
and if we check inlined code/costs we see :
2597 0.1719 :c048dfee: mov -0x64(%ebp),%edx
182 0.0120 :c048dff1: movzwl 0x2(%edx),%eax
524 0.0347 :c048dff5: mov %fs:0xc0688540,%ecx
7 4.6e-04 :c048dffc: add -0x70(%ebp),%ecx
2465 0.1632 :c048dfff: incl (%ecx)
9476 0.6273 :c048e001: addl $0x1,0x60(%edi)
10068 0.6665 :c048e005: adcl $0x0,0x64(%edi)
6543 0.4332 :c048e009: xor %edx,%edx
1 6.6e-05 :c048e00b: rol $0x8,%ax
234 0.0155 :c048e00f: movzwl %ax,%eax
2198 0.1455 :c048e012: add %eax,0x68(%edi)
80 0.0053 :c048e015: adc %edx,0x6c(%edi)
2858 0.1892 :c048e018: incl (%ecx)
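The inlined numbers correspond to simply keeping the helper in the header; a minimal
sketch of that variant, with the same body as the patch (not something posted in this
thread), would be:

/* in include/linux/netfilter/x_tables.h */
static inline void xt_add_counter(struct xt_counters *c, unsigned b, unsigned p)
{
	c->bcnt += b;
	c->pcnt += p;
}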
With the upcoming work on fast percpu accesses, we might in the future see the following code
(no need for registers to compute the address of the variable):
mov -0x64(%ebp),%edx
movzwl 0x2(%edx),%eax
incl %fs:xt_counter_sequence
addl $0x1,0x60(%edi)
adcl $0x0,0x64(%edi)
xor %edx,%edx
rol $0x8,%ax
movzwl %ax,%eax
add %eax,0x68(%edi)
adc %edx,0x6c(%edi)
incl %fs:xt_counter_sequence
* [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-01 12:25 ` Eric Dumazet
@ 2009-02-02 23:33 ` Stephen Hemminger
2009-02-03 19:00 ` Eric Dumazet
0 siblings, 1 reply; 21+ messages in thread
From: Stephen Hemminger @ 2009-02-02 23:33 UTC (permalink / raw)
To: Eric Dumazet; +Cc: David Miller, netdev
This is an alternative to the earlier RCU/seqcount_t version of counters.
The counters operate as usual without locking, but when they are read they are
rotated around the CPUs' entries. RCU is used in two ways: first to handle the
counter rotation, second for replace.
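Condensed from the get_counters() hunks below, the rotation works roughly like this
(index resets and the final restore of the local copy are elided):

	preempt_disable();
	curcpu = raw_smp_processor_id();
	e = t->entries[curcpu];
	/* fold this CPU's live counters into the output array */
	IPT_ENTRY_ITERATE(e, t->size, set_entry_to_counter, counters, &i);

	for_each_possible_cpu(cpu) {
		if (cpu == curcpu)
			continue;
		/* Swizzle the values and wait: give 'cpu' the copy we just summed,
		 * wait a grace period so it has stopped using its old copy, then
		 * sum that old copy; it becomes the copy handed to the next CPU */
		e->counters = ((struct xt_counters) { 0, 0 });
		p = t->entries[cpu];
		rcu_assign_pointer(t->entries[cpu], e);
		synchronize_net();
		e = p;
		IPT_ENTRY_ITERATE(e, t->size, add_entry_to_counter, counters, &i);
	}
	...
	preempt_enable();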
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
include/linux/netfilter/x_tables.h | 10 +++-
net/ipv4/netfilter/arp_tables.c | 73 +++++++++++++++++++++++++++---------
net/ipv4/netfilter/ip_tables.c | 68 ++++++++++++++++++++++++---------
net/ipv6/netfilter/ip6_tables.c | 75 ++++++++++++++++++++++++++-----------
net/netfilter/x_tables.c | 43 +++++++++++++++------
5 files changed, 197 insertions(+), 72 deletions(-)
--- a/include/linux/netfilter/x_tables.h 2009-02-02 15:06:39.893751845 -0800
+++ b/include/linux/netfilter/x_tables.h 2009-02-02 15:28:10.022574005 -0800
@@ -353,7 +353,7 @@ struct xt_table
unsigned int valid_hooks;
/* Lock for the curtain */
- rwlock_t lock;
+ struct mutex lock;
/* Man behind the curtain... */
struct xt_table_info *private;
@@ -383,9 +383,15 @@ struct xt_table_info
unsigned int hook_entry[NF_INET_NUMHOOKS];
unsigned int underflow[NF_INET_NUMHOOKS];
+ /* For the dustman... */
+ union {
+ struct rcu_head rcu;
+ struct work_struct work;
+ };
+
/* ipt_entry tables: one per CPU */
/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
- char *entries[1];
+ void *entries[1];
};
#define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
--- a/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:06:29.684249364 -0800
+++ b/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:14:13.256499021 -0800
@@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
mtpar.family = tgpar.family = NFPROTO_IPV4;
tgpar.hooknum = hook;
- read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- private = table->private;
- table_base = (void *)private->entries[smp_processor_id()];
+
+ rcu_read_lock_bh();
+ private = rcu_dereference(table->private);
+ table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
e = get_entry(table_base, private->hook_entry[hook]);
/* For return from builtin chain */
@@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
}
} while (!hotdrop);
- read_unlock_bh(&table->lock);
+ rcu_read_unlock_bh();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
return 0;
}
+static inline int
+set_counter_to_entry(struct ipt_entry *e,
+ const struct ipt_counters total[],
+ unsigned int *i)
+{
+ SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+
static void
-get_counters(const struct xt_table_info *t,
+get_counters(struct xt_table_info *t,
struct xt_counters counters[])
{
unsigned int cpu;
unsigned int i;
unsigned int curcpu;
+ struct ipt_entry *e;
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU
- * We dont care about preemption here.
- */
+ preempt_disable();
curcpu = raw_smp_processor_id();
-
+ e = t->entries[curcpu];
i = 0;
- IPT_ENTRY_ITERATE(t->entries[curcpu],
+ IPT_ENTRY_ITERATE(e,
t->size,
set_entry_to_counter,
counters,
&i);
for_each_possible_cpu(cpu) {
+ void *p;
+
if (cpu == curcpu)
continue;
+
+ /* Swizzle the values and wait */
+ e->counters = ((struct xt_counters) { 0, 0 });
+ p = t->entries[cpu];
+ rcu_assign_pointer(t->entries[cpu], e);
+ synchronize_net();
+ e = p;
+
i = 0;
- IPT_ENTRY_ITERATE(t->entries[cpu],
+ IPT_ENTRY_ITERATE(e,
t->size,
add_entry_to_counter,
counters,
&i);
}
+ i = 0;
+ t->entries[curcpu] = e;
+ IPT_ENTRY_ITERATE(e,
+ t->size,
+ set_counter_to_entry,
+ counters,
+ &i);
+
+ preempt_enable();
}
static struct xt_counters * alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ struct xt_table_info *private = table->private;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
@@ -942,9 +972,9 @@ static struct xt_counters * alloc_counte
return ERR_PTR(-ENOMEM);
/* First, sum counters... */
- write_lock_bh(&table->lock);
+ mutex_lock(&table->lock);
get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ mutex_unlock(&table->lock);
return counters;
}
@@ -1393,13 +1423,14 @@ do_add_counters(struct net *net, void __
goto free;
}
- write_lock_bh(&t->lock);
+ mutex_lock(&t->lock);
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}
+ preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1408,8 +1439,9 @@ do_add_counters(struct net *net, void __
add_counter_to_entry,
paddc,
&i);
+ preempt_enable();
unlock_up_free:
- write_unlock_bh(&t->lock);
+ mutex_unlock(&t->lock);
xt_table_unlock(t);
module_put(t->me);
free:
--- a/net/netfilter/x_tables.c 2009-02-02 15:06:29.708249745 -0800
+++ b/net/netfilter/x_tables.c 2009-02-02 15:30:33.718573969 -0800
@@ -611,18 +611,36 @@ struct xt_table_info *xt_alloc_table_inf
}
EXPORT_SYMBOL(xt_alloc_table_info);
-void xt_free_table_info(struct xt_table_info *info)
+/* callback to do free for vmalloc'd case */
+static void xt_free_table_info_work(struct work_struct *arg)
{
int cpu;
+ struct xt_table_info *info = container_of(arg, struct xt_table_info, work);
- for_each_possible_cpu(cpu) {
- if (info->size <= PAGE_SIZE)
- kfree(info->entries[cpu]);
- else
- vfree(info->entries[cpu]);
- }
+ for_each_possible_cpu(cpu)
+ vfree(info->entries[cpu]);
kfree(info);
}
+
+static void xt_free_table_info_rcu(struct rcu_head *arg)
+{
+ struct xt_table_info *info = container_of(arg, struct xt_table_info, rcu);
+ if (info->size <= PAGE_SIZE) {
+ unsigned int cpu;
+ for_each_possible_cpu(cpu)
+ kfree(info->entries[cpu]);
+ kfree(info);
+ } else {
+ /* can't safely call vfree in current context */
+ INIT_WORK(&info->work, xt_free_table_info_work);
+ schedule_work(&info->work);
+ }
+}
+
+void xt_free_table_info(struct xt_table_info *info)
+{
+ call_rcu(&info->rcu, xt_free_table_info_rcu);
+}
EXPORT_SYMBOL(xt_free_table_info);
/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
@@ -671,20 +689,20 @@ xt_replace_table(struct xt_table *table,
struct xt_table_info *oldinfo, *private;
/* Do the substitution. */
- write_lock_bh(&table->lock);
+ mutex_lock(&table->lock);
private = table->private;
/* Check inside lock: is the old number correct? */
if (num_counters != private->number) {
duprintf("num_counters != table->private->number (%u/%u)\n",
num_counters, private->number);
- write_unlock_bh(&table->lock);
+ mutex_unlock(&table->lock);
*error = -EAGAIN;
return NULL;
}
oldinfo = private;
- table->private = newinfo;
+ rcu_assign_pointer(table->private, newinfo);
newinfo->initial_entries = oldinfo->initial_entries;
- write_unlock_bh(&table->lock);
+ mutex_unlock(&table->lock);
return oldinfo;
}
@@ -719,7 +737,8 @@ struct xt_table *xt_register_table(struc
/* Simplifies replace_table code. */
table->private = bootstrap;
- rwlock_init(&table->lock);
+ mutex_init(&table->lock);
+
if (!xt_replace_table(table, 0, newinfo, &ret))
goto unlock;
--- a/net/ipv4/netfilter/arp_tables.c 2009-02-02 15:06:29.696250564 -0800
+++ b/net/ipv4/netfilter/arp_tables.c 2009-02-02 15:14:45.969206521 -0800
@@ -237,9 +237,10 @@ unsigned int arpt_do_table(struct sk_buf
indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname;
- read_lock_bh(&table->lock);
- private = table->private;
- table_base = (void *)private->entries[smp_processor_id()];
+ rcu_read_lock_bh();
+ private = rcu_dereference(table->private);
+ table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
e = get_entry(table_base, private->hook_entry[hook]);
back = get_entry(table_base, private->underflow[hook]);
@@ -311,7 +312,8 @@ unsigned int arpt_do_table(struct sk_buf
e = (void *)e + e->next_offset;
}
} while (!hotdrop);
- read_unlock_bh(&table->lock);
+
+ rcu_read_unlock_bh();
if (hotdrop)
return NF_DROP;
@@ -681,44 +683,77 @@ static inline int set_entry_to_counter(c
return 0;
}
-static void get_counters(const struct xt_table_info *t,
- struct xt_counters counters[])
+
+static inline int
+set_counter_to_entry(struct arpt_entry *e,
+ const struct xt_counters total[],
+ unsigned int *i)
+{
+ SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+
+static void
+get_counters(struct xt_table_info *t,
+ struct xt_counters counters[])
{
unsigned int cpu;
unsigned int i;
unsigned int curcpu;
+ struct arpt_entry *e;
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU
- * We dont care about preemption here.
- */
+ preempt_disable();
curcpu = raw_smp_processor_id();
i = 0;
- ARPT_ENTRY_ITERATE(t->entries[curcpu],
+ e = t->entries[curcpu];
+ ARPT_ENTRY_ITERATE(e,
t->size,
set_entry_to_counter,
counters,
&i);
for_each_possible_cpu(cpu) {
+ void *p;
+
if (cpu == curcpu)
continue;
+
+ /* Swizzle the values and wait */
+ e->counters.bcnt = 0;
+ e->counters.pcnt = 0;
+ p = t->entries[cpu];
+ rcu_assign_pointer(t->entries[cpu], e);
+ synchronize_net();
+ e = p;
+
i = 0;
- ARPT_ENTRY_ITERATE(t->entries[cpu],
+ ARPT_ENTRY_ITERATE(e,
t->size,
add_entry_to_counter,
counters,
&i);
}
+
+ i = 0;
+ t->entries[curcpu] = e;
+ ARPT_ENTRY_ITERATE(e,
+ t->size,
+ set_counter_to_entry,
+ counters,
+ &i);
+
+ preempt_enable();
}
static inline struct xt_counters *alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ struct xt_table_info *private = table->private;
/* We need atomic snapshot of counters: rest doesn't change
* (other than comefrom, which userspace doesn't care
@@ -731,9 +766,9 @@ static inline struct xt_counters *alloc_
return ERR_PTR(-ENOMEM);
/* First, sum counters... */
- write_lock_bh(&table->lock);
+ mutex_lock(&table->lock);
get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ mutex_unlock(&table->lock);
return counters;
}
@@ -1148,13 +1183,14 @@ static int do_add_counters(struct net *n
goto free;
}
- write_lock_bh(&t->lock);
+ mutex_lock(&t->lock);
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}
+ preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[smp_processor_id()];
@@ -1164,7 +1200,8 @@ static int do_add_counters(struct net *n
paddc,
&i);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ mutex_unlock(&t->lock);
+
xt_table_unlock(t);
module_put(t->me);
free:
--- a/net/ipv6/netfilter/ip6_tables.c 2009-02-02 15:06:29.724250485 -0800
+++ b/net/ipv6/netfilter/ip6_tables.c 2009-02-02 15:15:05.352498160 -0800
@@ -373,10 +373,12 @@ ip6t_do_table(struct sk_buff *skb,
mtpar.family = tgpar.family = NFPROTO_IPV6;
tgpar.hooknum = hook;
- read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- private = table->private;
- table_base = (void *)private->entries[smp_processor_id()];
+
+ rcu_read_lock_bh();
+ private = rcu_dereference(table->private);
+ table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
e = get_entry(table_base, private->hook_entry[hook]);
/* For return from builtin chain */
@@ -474,7 +476,7 @@ ip6t_do_table(struct sk_buff *skb,
#ifdef CONFIG_NETFILTER_DEBUG
((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
#endif
- read_unlock_bh(&table->lock);
+ rcu_read_unlock_bh();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -921,45 +923,72 @@ set_entry_to_counter(const struct ip6t_e
return 0;
}
+static inline int
+set_counter_to_entry(struct ip6t_entry *e,
+ const struct ip6t_counters total[],
+ unsigned int *i)
+{
+ SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
+
+ (*i)++;
+ return 0;
+}
+
static void
-get_counters(const struct xt_table_info *t,
+get_counters(struct xt_table_info *t,
struct xt_counters counters[])
{
unsigned int cpu;
unsigned int i;
unsigned int curcpu;
+ struct ip6t_entry *e;
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU
- * We dont care about preemption here.
- */
+ preempt_disable();
curcpu = raw_smp_processor_id();
-
+ e = t->entries[curcpu];
i = 0;
- IP6T_ENTRY_ITERATE(t->entries[curcpu],
- t->size,
- set_entry_to_counter,
- counters,
- &i);
+ IP6T_ENTRY_ITERATE(e,
+ t->size,
+ set_entry_to_counter,
+ counters,
+ &i);
for_each_possible_cpu(cpu) {
+ void *p;
+
if (cpu == curcpu)
continue;
+
+ /* Swizzle the values and wait */
+ e->counters = ((struct xt_counters) { 0, 0 });
+ p = t->entries[cpu];
+ rcu_assign_pointer(t->entries[cpu], e);
+ synchronize_net();
+ e = p;
+
i = 0;
- IP6T_ENTRY_ITERATE(t->entries[cpu],
+ IP6T_ENTRY_ITERATE(e,
t->size,
add_entry_to_counter,
counters,
&i);
}
+ i = 0;
+ t->entries[curcpu] = e;
+ IP6T_ENTRY_ITERATE(e,
+ t->size,
+ set_counter_to_entry,
+ counters,
+ &i);
+
+ preempt_enable();
}
static struct xt_counters *alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ struct xt_table_info *private = table->private;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
@@ -971,9 +1000,9 @@ static struct xt_counters *alloc_counter
return ERR_PTR(-ENOMEM);
/* First, sum counters... */
- write_lock_bh(&table->lock);
+ mutex_lock(&table->lock);
get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ mutex_unlock(&table->lock);
return counters;
}
@@ -1424,13 +1453,14 @@ do_add_counters(struct net *net, void __
goto free;
}
- write_lock_bh(&t->lock);
+ mutex_lock(&t->lock);
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}
+ preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1439,8 +1469,9 @@ do_add_counters(struct net *net, void __
add_counter_to_entry,
paddc,
&i);
+ preempt_enable();
unlock_up_free:
- write_unlock_bh(&t->lock);
+ mutex_unlock(&t->lock);
xt_table_unlock(t);
module_put(t->me);
free:
* Re: [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-02 23:33 ` [PATCH 3/3] iptables: lock free counters (alternate version) Stephen Hemminger
@ 2009-02-03 19:00 ` Eric Dumazet
2009-02-03 19:19 ` Eric Dumazet
2009-02-03 19:32 ` Paul E. McKenney
0 siblings, 2 replies; 21+ messages in thread
From: Eric Dumazet @ 2009-02-03 19:00 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Stephen Hemminger wrote:
> This is an alternative to earlier RCU/seqcount_t version of counters.
> The counters operate as usual without locking, but when counters are rotated
> around the CPU's entries. RCU is used in two ways, first to handle the
> counter rotation, second for replace.
>
Is it a working patch or just a prototype ?
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> ---
> include/linux/netfilter/x_tables.h | 10 +++-
> net/ipv4/netfilter/arp_tables.c | 73 +++++++++++++++++++++++++++---------
> net/ipv4/netfilter/ip_tables.c | 68 ++++++++++++++++++++++++---------
> net/ipv6/netfilter/ip6_tables.c | 75 ++++++++++++++++++++++++++-----------
> net/netfilter/x_tables.c | 43 +++++++++++++++------
> 5 files changed, 197 insertions(+), 72 deletions(-)
>
> --- a/include/linux/netfilter/x_tables.h 2009-02-02 15:06:39.893751845 -0800
> +++ b/include/linux/netfilter/x_tables.h 2009-02-02 15:28:10.022574005 -0800
> @@ -353,7 +353,7 @@ struct xt_table
> unsigned int valid_hooks;
>
> /* Lock for the curtain */
> - rwlock_t lock;
> + struct mutex lock;
>
> /* Man behind the curtain... */
> struct xt_table_info *private;
> @@ -383,9 +383,15 @@ struct xt_table_info
> unsigned int hook_entry[NF_INET_NUMHOOKS];
> unsigned int underflow[NF_INET_NUMHOOKS];
>
> + /* For the dustman... */
> + union {
> + struct rcu_head rcu;
> + struct work_struct work;
> + };
> +
> /* ipt_entry tables: one per CPU */
> /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> - char *entries[1];
> + void *entries[1];
> };
>
> #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> --- a/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:06:29.684249364 -0800
> +++ b/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:14:13.256499021 -0800
> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
> mtpar.family = tgpar.family = NFPROTO_IPV4;
> tgpar.hooknum = hook;
>
> - read_lock_bh(&table->lock);
> IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> - private = table->private;
> - table_base = (void *)private->entries[smp_processor_id()];
> +
> + rcu_read_lock_bh();
> + private = rcu_dereference(table->private);
> + table_base = rcu_dereference(private->entries[smp_processor_id()]);
> +
> e = get_entry(table_base, private->hook_entry[hook]);
>
> /* For return from builtin chain */
> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> }
> } while (!hotdrop);
>
> - read_unlock_bh(&table->lock);
> + rcu_read_unlock_bh();
>
> #ifdef DEBUG_ALLOW_ALL
> return NF_ACCEPT;
> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
> return 0;
> }
>
> +static inline int
> +set_counter_to_entry(struct ipt_entry *e,
> + const struct ipt_counters total[],
> + unsigned int *i)
> +{
> + SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
> +
> + (*i)++;
> + return 0;
> +}
> +
> +
> static void
> -get_counters(const struct xt_table_info *t,
> +get_counters(struct xt_table_info *t,
> struct xt_counters counters[])
> {
> unsigned int cpu;
> unsigned int i;
> unsigned int curcpu;
> + struct ipt_entry *e;
>
> - /* Instead of clearing (by a previous call to memset())
> - * the counters and using adds, we set the counters
> - * with data used by 'current' CPU
> - * We dont care about preemption here.
> - */
> + preempt_disable();
> curcpu = raw_smp_processor_id();
> -
> + e = t->entries[curcpu];
> i = 0;
> - IPT_ENTRY_ITERATE(t->entries[curcpu],
> + IPT_ENTRY_ITERATE(e,
> t->size,
> set_entry_to_counter,
Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
set_entry_to_counter() might get garbage.
I suppose I already mentioned it :)
> counters,
> &i);
>
> for_each_possible_cpu(cpu) {
> + void *p;
> +
> if (cpu == curcpu)
> continue;
> +
> + /* Swizzle the values and wait */
> + e->counters = ((struct xt_counters) { 0, 0 });
I don't see what you want to do here...
e->counters is the counter associated with rule #0
> + p = t->entries[cpu];
> + rcu_assign_pointer(t->entries[cpu], e);
> + synchronize_net();
Oh well, not this synchronize_net() :)
This won't provide atomic sampling of counters across all CPUs, and it introduces large delays
on big machines (NR_CPUS >= 64)
What problem do we want to solve here ?
Current iptables is able to do an atomic snapshot because of the rwlock.
If we really want to keep this feature (but get rid of rwlock), we might do the reverse
with two seqlocks + RCU
One seqlock (seqlock_counters) to protect counter updates
One seqlock (seqlock_rules) to protect rules changes
Ie :
ipt_do_table() doing :
{
rcu_read_lock
read_seqbegin(&table->seqlock_rules);
rcu fetch priv table pointer and work on it
do {
for all counters updates, use
do {
seq = read_seqbegin(table->seqlock_counters);
update counters
}
} while (!hotdrop);
rcu_read_unlock()
}
for get_counter() (iptables -L)
writer doing a write_seqlock(&table->seqlock_counters), waiting one RCU grace period,
{
get / sum all counters (no updates of counters are allowed)
}
write_sequnlock();
for iptables_update/replace (iptables -A|I|D|R|Z|F...)
writer doing a write_seqlock(&table->seqlock_rules), waiting one RCU grace period,
{
change rules/counters
}
write_sequnlock();
* Re: [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-03 19:00 ` Eric Dumazet
@ 2009-02-03 19:19 ` Eric Dumazet
2009-02-03 19:32 ` Paul E. McKenney
1 sibling, 0 replies; 21+ messages in thread
From: Eric Dumazet @ 2009-02-03 19:19 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Eric Dumazet wrote:
> Stephen Hemminger wrote:
>> This is an alternative to earlier RCU/seqcount_t version of counters.
>> The counters operate as usual without locking, but when counters are rotated
>> around the CPU's entries. RCU is used in two ways, first to handle the
>> counter rotation, second for replace.
>>
>
> Is it a working patch or just a prototype ?
>
>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>
>> ---
>> include/linux/netfilter/x_tables.h | 10 +++-
>> net/ipv4/netfilter/arp_tables.c | 73 +++++++++++++++++++++++++++---------
>> net/ipv4/netfilter/ip_tables.c | 68 ++++++++++++++++++++++++---------
>> net/ipv6/netfilter/ip6_tables.c | 75 ++++++++++++++++++++++++++-----------
>> net/netfilter/x_tables.c | 43 +++++++++++++++------
>> 5 files changed, 197 insertions(+), 72 deletions(-)
>>
>> --- a/include/linux/netfilter/x_tables.h 2009-02-02 15:06:39.893751845 -0800
>> +++ b/include/linux/netfilter/x_tables.h 2009-02-02 15:28:10.022574005 -0800
>> @@ -353,7 +353,7 @@ struct xt_table
>> unsigned int valid_hooks;
>>
>> /* Lock for the curtain */
>> - rwlock_t lock;
>> + struct mutex lock;
>>
>> /* Man behind the curtain... */
>> struct xt_table_info *private;
>> @@ -383,9 +383,15 @@ struct xt_table_info
>> unsigned int hook_entry[NF_INET_NUMHOOKS];
>> unsigned int underflow[NF_INET_NUMHOOKS];
>>
>> + /* For the dustman... */
>> + union {
>> + struct rcu_head rcu;
>> + struct work_struct work;
>> + };
>> +
>> /* ipt_entry tables: one per CPU */
>> /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
>> - char *entries[1];
>> + void *entries[1];
>> };
>>
>> #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
>> --- a/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:06:29.684249364 -0800
>> +++ b/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:14:13.256499021 -0800
>> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
>> mtpar.family = tgpar.family = NFPROTO_IPV4;
>> tgpar.hooknum = hook;
>>
>> - read_lock_bh(&table->lock);
>> IP_NF_ASSERT(table->valid_hooks & (1 << hook));
>> - private = table->private;
>> - table_base = (void *)private->entries[smp_processor_id()];
>> +
>> + rcu_read_lock_bh();
>> + private = rcu_dereference(table->private);
>> + table_base = rcu_dereference(private->entries[smp_processor_id()]);
>> +
>> e = get_entry(table_base, private->hook_entry[hook]);
>>
>> /* For return from builtin chain */
>> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
>> }
>> } while (!hotdrop);
>>
>> - read_unlock_bh(&table->lock);
>> + rcu_read_unlock_bh();
>>
>> #ifdef DEBUG_ALLOW_ALL
>> return NF_ACCEPT;
>> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
>> return 0;
>> }
>>
>> +static inline int
>> +set_counter_to_entry(struct ipt_entry *e,
>> + const struct ipt_counters total[],
>> + unsigned int *i)
>> +{
>> + SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
>> +
>> + (*i)++;
>> + return 0;
>> +}
>> +
>> +
>> static void
>> -get_counters(const struct xt_table_info *t,
>> +get_counters(struct xt_table_info *t,
>> struct xt_counters counters[])
>> {
>> unsigned int cpu;
>> unsigned int i;
>> unsigned int curcpu;
>> + struct ipt_entry *e;
>>
>> - /* Instead of clearing (by a previous call to memset())
>> - * the counters and using adds, we set the counters
>> - * with data used by 'current' CPU
>> - * We dont care about preemption here.
>> - */
>> + preempt_disable();
>> curcpu = raw_smp_processor_id();
>> -
>> + e = t->entries[curcpu];
>> i = 0;
>> - IPT_ENTRY_ITERATE(t->entries[curcpu],
>> + IPT_ENTRY_ITERATE(e,
>> t->size,
>> set_entry_to_counter,
>
> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> set_entry_to_counter() might get garbage.
> I suppose I already mentioned it :)
>
>> counters,
>> &i);
>>
>> for_each_possible_cpu(cpu) {
>> + void *p;
>> +
>> if (cpu == curcpu)
>> continue;
>> +
>> + /* Swizzle the values and wait */
>> + e->counters = ((struct xt_counters) { 0, 0 });
>
> I dont see what you want to do here...
>
> e->counters is the counter associated with rule #0
>
>> + p = t->entries[cpu];
>> + rcu_assign_pointer(t->entries[cpu], e);
>> + synchronize_net();
>
>
> Oh well, not this synchronize_net() :)
>
> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> on big machines (NR_CPUS >= 64)
>
> What problem do we want to solve here ?
>
>
> Current iptables is able to do an atomic snapshot because of the rwlock.
>
> If we really want to keep this feature (but get rid of rwlock), we might do the reverse
> with two seqlocks + RCU
>
> One seqlock (seqlock_counters) to protect counter updates
> One seqlock (seqlock_rules) to protect rules changes
>
> Ie :
>
> ipt_do_table() doing :
> {
> rcu_read_lock
> read_seqbegin(&table->seqlock_rules);
> rcu fetch priv table pointer and work on it
> do {
> for all counters updates, use
> do {
> seq = read_seqbegin(table->seqlock_counters);
Hum... reading my mail, this one can be done once only, at the beginning
Why then I suggested two different seqlocks, I am wondering :)
I need to think a little bit more, we might do something to allow several "iptables -L" in parallel
maybe an atomic_t to protect counters would be ok :
Ie :
One atomic_t (counters_readers) to protect counter updates
One seqlock (seqlock_rules) to protect rules changes
ipt_do_table() doing :
{
begin:
read_seqbegin(&table->seqlock_rules); // it could loop but with BH enabled
rcu_read_lock_bh();
while (atomic_read(&table->counters_readers) > 0) {
cpu_relax();
rcu_read_unlock_bh();
goto begin;
}
private = rcu_dereference(table->private);
...
do {
for all counters updates, use
do {
update counters
}
} while (!hotdrop);
rcu_read_unlock_bh()
}
for get_counter() (iptables -L)
atomic_inc(&table->counters_readers);
waiting one RCU grace period,
{
get / sum all counters (no updates of counters are allowed)
}
atomic_dec(&table->counters_readers)
for iptables_update/replace (iptables -A|I|D|R|Z|F...)
writer doing a write_seqlock(&table->seqlock_rules), waiting one RCU grace period,
{
change rules/counters
}
write_sequnlock();
* Re: [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-03 19:00 ` Eric Dumazet
2009-02-03 19:19 ` Eric Dumazet
@ 2009-02-03 19:32 ` Paul E. McKenney
2009-02-03 20:20 ` Eric Dumazet
1 sibling, 1 reply; 21+ messages in thread
From: Paul E. McKenney @ 2009-02-03 19:32 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Stephen Hemminger, David Miller, netdev
On Tue, Feb 03, 2009 at 08:00:16PM +0100, Eric Dumazet wrote:
> Stephen Hemminger wrote:
> > This is an alternative to earlier RCU/seqcount_t version of counters.
> > The counters operate as usual without locking, but when counters are rotated
> > around the CPU's entries. RCU is used in two ways, first to handle the
> > counter rotation, second for replace.
>
> Is it a working patch or just a prototype ?
>
> > Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> >
> > ---
> > include/linux/netfilter/x_tables.h | 10 +++-
> > net/ipv4/netfilter/arp_tables.c | 73 +++++++++++++++++++++++++++---------
> > net/ipv4/netfilter/ip_tables.c | 68 ++++++++++++++++++++++++---------
> > net/ipv6/netfilter/ip6_tables.c | 75 ++++++++++++++++++++++++++-----------
> > net/netfilter/x_tables.c | 43 +++++++++++++++------
> > 5 files changed, 197 insertions(+), 72 deletions(-)
> >
> > --- a/include/linux/netfilter/x_tables.h 2009-02-02 15:06:39.893751845 -0800
> > +++ b/include/linux/netfilter/x_tables.h 2009-02-02 15:28:10.022574005 -0800
> > @@ -353,7 +353,7 @@ struct xt_table
> > unsigned int valid_hooks;
> >
> > /* Lock for the curtain */
> > - rwlock_t lock;
> > + struct mutex lock;
> >
> > /* Man behind the curtain... */
> > struct xt_table_info *private;
> > @@ -383,9 +383,15 @@ struct xt_table_info
> > unsigned int hook_entry[NF_INET_NUMHOOKS];
> > unsigned int underflow[NF_INET_NUMHOOKS];
> >
> > + /* For the dustman... */
> > + union {
> > + struct rcu_head rcu;
> > + struct work_struct work;
> > + };
> > +
> > /* ipt_entry tables: one per CPU */
> > /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> > - char *entries[1];
> > + void *entries[1];
> > };
> >
> > #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> > --- a/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:06:29.684249364 -0800
> > +++ b/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:14:13.256499021 -0800
> > @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
> > mtpar.family = tgpar.family = NFPROTO_IPV4;
> > tgpar.hooknum = hook;
> >
> > - read_lock_bh(&table->lock);
> > IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> > - private = table->private;
> > - table_base = (void *)private->entries[smp_processor_id()];
> > +
> > + rcu_read_lock_bh();
> > + private = rcu_dereference(table->private);
> > + table_base = rcu_dereference(private->entries[smp_processor_id()]);
> > +
> > e = get_entry(table_base, private->hook_entry[hook]);
> >
> > /* For return from builtin chain */
> > @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> > }
> > } while (!hotdrop);
> >
> > - read_unlock_bh(&table->lock);
> > + rcu_read_unlock_bh();
> >
> > #ifdef DEBUG_ALLOW_ALL
> > return NF_ACCEPT;
> > @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
> > return 0;
> > }
> >
> > +static inline int
> > +set_counter_to_entry(struct ipt_entry *e,
> > + const struct ipt_counters total[],
> > + unsigned int *i)
> > +{
> > + SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
> > +
> > + (*i)++;
> > + return 0;
> > +}
> > +
> > +
> > static void
> > -get_counters(const struct xt_table_info *t,
> > +get_counters(struct xt_table_info *t,
> > struct xt_counters counters[])
> > {
> > unsigned int cpu;
> > unsigned int i;
> > unsigned int curcpu;
> > + struct ipt_entry *e;
> >
> > - /* Instead of clearing (by a previous call to memset())
> > - * the counters and using adds, we set the counters
> > - * with data used by 'current' CPU
> > - * We dont care about preemption here.
> > - */
> > + preempt_disable();
> > curcpu = raw_smp_processor_id();
> > -
> > + e = t->entries[curcpu];
> > i = 0;
> > - IPT_ENTRY_ITERATE(t->entries[curcpu],
> > + IPT_ENTRY_ITERATE(e,
> > t->size,
> > set_entry_to_counter,
>
> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> set_entry_to_counter() might get garbage.
> I suppose I already mentioned it :)
>
> > counters,
> > &i);
> >
> > for_each_possible_cpu(cpu) {
> > + void *p;
> > +
> > if (cpu == curcpu)
> > continue;
> > +
> > + /* Swizzle the values and wait */
> > + e->counters = ((struct xt_counters) { 0, 0 });
>
> I dont see what you want to do here...
>
> e->counters is the counter associated with rule #0
>
> > + p = t->entries[cpu];
> > + rcu_assign_pointer(t->entries[cpu], e);
> > + synchronize_net();
>
>
> Oh well, not this synchronize_net() :)
>
> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> on big machines (NR_CPUS >= 64)
Why would this not provide the moral equivalent of atomic sampling?
The code above switches to another counter set, and waits for a grace
period. Shouldn't this mean that all CPUs that were incrementing the
old set of counters have finished doing so, so that the aggregate count
covers all CPUs that started their increments before the pointer switch?
Same as acquiring a write lock, which would wait for all CPUs that
started their increments before starting the write-lock acquisition.
CPUs that started their increments after starting the write acquisition
would not be accounted for in the total, same as the RCU approach.
Steve's approach does delay reading out the counters, but it avoids
delaying any CPU trying to increment the counters.
So what am I missing here?
Thanx, Paul
> What problem do we want to solve here ?
>
>
> Current iptables is able to do an atomic snapshot because of the rwlock.
>
> If we really want to keep this feature (but get rid of rwlock), we might do the reverse
> with two seqlocks + RCU
>
> One seqlock (seqlock_counters) to protect counter updates
> One seqlock (seqlock_rules) to protect rules changes
>
> Ie :
>
> ipt_do_table() doing :
> {
> rcu_read_lock
> read_seqbegin(&table->seqlock_rules);
> rcu fetch priv table pointer and work on it
> do {
> for all counters updates, use
> do {
> seq = read_seqbegin(table->seqlock_counters);
> update counters
> }
> } while (!hotdrop);
> rcu_read_unlock()
> }
>
> for get_counter() (iptables -L)
> writer doing a write_seqlock(&table->seqlock_counters), waiting one RCU grace period,
> {
> get / sum all counters (no updates of counters are allowed)
> }
> write_sequnlock();
>
>
> for iptables_update/replace (iptables -A|I|D|R|Z|F...)
> writer doing a write_seqlock(&table->seqlock_rules), waiting one RCU grace period,
> {
> change rules/counters
> }
> write_sequnlock();
>
>
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-03 19:32 ` Paul E. McKenney
@ 2009-02-03 20:20 ` Eric Dumazet
2009-02-03 20:44 ` Stephen Hemminger
2009-02-03 21:10 ` Paul E. McKenney
0 siblings, 2 replies; 21+ messages in thread
From: Eric Dumazet @ 2009-02-03 20:20 UTC (permalink / raw)
To: paulmck; +Cc: Stephen Hemminger, David Miller, netdev
Paul E. McKenney wrote:
> On Tue, Feb 03, 2009 at 08:00:16PM +0100, Eric Dumazet wrote:
>> Stephen Hemminger wrote:
>>> This is an alternative to earlier RCU/seqcount_t version of counters.
>>> The counters operate as usual without locking, but when counters are rotated
>>> around the CPU's entries. RCU is used in two ways, first to handle the
>>> counter rotation, second for replace.
>> Is it a working patch or just a prototype ?
>>
>>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>>>
>>> ---
>>> include/linux/netfilter/x_tables.h | 10 +++-
>>> net/ipv4/netfilter/arp_tables.c | 73 +++++++++++++++++++++++++++---------
>>> net/ipv4/netfilter/ip_tables.c | 68 ++++++++++++++++++++++++---------
>>> net/ipv6/netfilter/ip6_tables.c | 75 ++++++++++++++++++++++++++-----------
>>> net/netfilter/x_tables.c | 43 +++++++++++++++------
>>> 5 files changed, 197 insertions(+), 72 deletions(-)
>>>
>>> --- a/include/linux/netfilter/x_tables.h 2009-02-02 15:06:39.893751845 -0800
>>> +++ b/include/linux/netfilter/x_tables.h 2009-02-02 15:28:10.022574005 -0800
>>> @@ -353,7 +353,7 @@ struct xt_table
>>> unsigned int valid_hooks;
>>>
>>> /* Lock for the curtain */
>>> - rwlock_t lock;
>>> + struct mutex lock;
>>>
>>> /* Man behind the curtain... */
>>> struct xt_table_info *private;
>>> @@ -383,9 +383,15 @@ struct xt_table_info
>>> unsigned int hook_entry[NF_INET_NUMHOOKS];
>>> unsigned int underflow[NF_INET_NUMHOOKS];
>>>
>>> + /* For the dustman... */
>>> + union {
>>> + struct rcu_head rcu;
>>> + struct work_struct work;
>>> + };
>>> +
>>> /* ipt_entry tables: one per CPU */
>>> /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
>>> - char *entries[1];
>>> + void *entries[1];
>>> };
>>>
>>> #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
>>> --- a/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:06:29.684249364 -0800
>>> +++ b/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:14:13.256499021 -0800
>>> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
>>> mtpar.family = tgpar.family = NFPROTO_IPV4;
>>> tgpar.hooknum = hook;
>>>
>>> - read_lock_bh(&table->lock);
>>> IP_NF_ASSERT(table->valid_hooks & (1 << hook));
>>> - private = table->private;
>>> - table_base = (void *)private->entries[smp_processor_id()];
>>> +
>>> + rcu_read_lock_bh();
>>> + private = rcu_dereference(table->private);
>>> + table_base = rcu_dereference(private->entries[smp_processor_id()]);
>>> +
>>> e = get_entry(table_base, private->hook_entry[hook]);
>>>
>>> /* For return from builtin chain */
>>> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
>>> }
>>> } while (!hotdrop);
>>>
>>> - read_unlock_bh(&table->lock);
>>> + rcu_read_unlock_bh();
>>>
>>> #ifdef DEBUG_ALLOW_ALL
>>> return NF_ACCEPT;
>>> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
>>> return 0;
>>> }
>>>
>>> +static inline int
>>> +set_counter_to_entry(struct ipt_entry *e,
>>> + const struct ipt_counters total[],
>>> + unsigned int *i)
>>> +{
>>> + SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
>>> +
>>> + (*i)++;
>>> + return 0;
>>> +}
>>> +
>>> +
>>> static void
>>> -get_counters(const struct xt_table_info *t,
>>> +get_counters(struct xt_table_info *t,
>>> struct xt_counters counters[])
>>> {
>>> unsigned int cpu;
>>> unsigned int i;
>>> unsigned int curcpu;
>>> + struct ipt_entry *e;
>>>
>>> - /* Instead of clearing (by a previous call to memset())
>>> - * the counters and using adds, we set the counters
>>> - * with data used by 'current' CPU
>>> - * We dont care about preemption here.
>>> - */
>>> + preempt_disable();
>>> curcpu = raw_smp_processor_id();
>>> -
>>> + e = t->entries[curcpu];
>>> i = 0;
>>> - IPT_ENTRY_ITERATE(t->entries[curcpu],
>>> + IPT_ENTRY_ITERATE(e,
>>> t->size,
>>> set_entry_to_counter,
>> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
>> set_entry_to_counter() might get garbage.
>> I suppose I already mentioned it :)
>>
>>> counters,
>>> &i);
>>>
>>> for_each_possible_cpu(cpu) {
>>> + void *p;
>>> +
>>> if (cpu == curcpu)
>>> continue;
>>> +
>>> + /* Swizzle the values and wait */
>>> + e->counters = ((struct xt_counters) { 0, 0 });
>> I dont see what you want to do here...
>>
>> e->counters is the counter associated with rule #0
>>
>>> + p = t->entries[cpu];
>>> + rcu_assign_pointer(t->entries[cpu], e);
>>> + synchronize_net();
>>
>> Oh well, not this synchronize_net() :)
>>
>> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
>> on big machines (NR_CPUS >= 64)
>
> Why would this not provide the moral equivalent of atomic sampling?
> The code above switches to another counter set, and waits for a grace
> period. Shouldn't this mean that all CPUs that were incrementing the
> old set of counters have finished doing so, so that the aggregate count
> covers all CPUs that started their increments before the pointer switch?
> Same as acquiring a write lock, which would wait for all CPUs that
> started their increments before starting the write-lock acquisition.
> CPUs that started their increments after starting the write acquisition
> would not be accounted for in the total, same as the RCU approach.
>
> Steve's approach does delay reading out the counters, but it avoids
> delaying any CPU trying to increment the counters.
I see your point, but this is not what Stephen implemented.
So which counters will a CPU increment, if it is not delayed?
How will the counters be synced again after our 'iptables -L' has finished?
"iptables -L" is not supposed to miss any counter updates (only some packets
might be dropped at NIC level because we spend time in the collection).
If packets match some rules, we really want up-to-date counters.
Maybe we need an extra "cpu" for this collection, to collect
all the increments that were done while the CPUs were directed to a
"secondary table/counters".
>
> So what am I missing here?
>
Well, I saw one synchronize_net() inside the for_each_possible_cpu(cpu) loop.
Say we have NR_CPUS=4096; how long will it take to perform "iptables -L"?
The general/intuitive idea would be:
switch pointers to a newly allocated table (and zeroed counters)
wait one RCU grace period
collect/sum all counters of "old" table + (all cpus) into user provided table
restore previous table
wait one RCU grace period
disable_bh()
collect/sum all counters of "new and temporary" table (all cpus) and
reinject them into local cpu table (this cpu should not be interrupted)
enable_bh()
This way, "iptables -L" is not too expensive and doesn't block packet processing at all.
Thanks Paul
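A minimal C sketch of that sequence (illustrative only; alloc_zeroed_clone(), sum_entries() and fold_entries() are hypothetical helpers standing in for the real allocation and IPT_ENTRY_ITERATE passes):

static void snapshot_counters(struct xt_table_info *t, struct xt_counters *user)
{
        struct xt_table_info *tmp = alloc_zeroed_clone(t);      /* hypothetical */
        unsigned int cpu;

        /* 1. point every CPU at the zeroed sets; tmp ends up holding the old ones */
        for_each_possible_cpu(cpu) {
                void *old = t->entries[cpu];

                rcu_assign_pointer(t->entries[cpu], tmp->entries[cpu]);
                tmp->entries[cpu] = old;
        }
        synchronize_net();              /* 2. in-flight updates to the old sets finish */

        /* 3. the old sets are now stable: sum them into the user buffer */
        for_each_possible_cpu(cpu)
                sum_entries(tmp->entries[cpu], t->size, user);  /* hypothetical */

        /* 4. swap the original sets back in and wait again */
        for_each_possible_cpu(cpu) {
                void *zeroed = t->entries[cpu];

                rcu_assign_pointer(t->entries[cpu], tmp->entries[cpu]);
                tmp->entries[cpu] = zeroed;
        }
        synchronize_net();

        /* 5. fold whatever accumulated during the window back into this CPU's set */
        local_bh_disable();
        for_each_possible_cpu(cpu)
                fold_entries(tmp->entries[cpu], t->size,
                             t->entries[smp_processor_id()]);   /* hypothetical */
        local_bh_enable();

        xt_free_table_info(tmp);
}

Stephen's test patch further down the thread takes roughly this shape, with swap_counters() and put_counters() covering steps 1 and 5.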
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-03 20:20 ` Eric Dumazet
@ 2009-02-03 20:44 ` Stephen Hemminger
2009-02-03 21:05 ` Eric Dumazet
2009-02-03 21:10 ` Paul E. McKenney
1 sibling, 1 reply; 21+ messages in thread
From: Stephen Hemminger @ 2009-02-03 20:44 UTC (permalink / raw)
To: Eric Dumazet; +Cc: paulmck, David Miller, netdev
> >>> + e = t->entries[curcpu];
> >>> i = 0;
> >>> - IPT_ENTRY_ITERATE(t->entries[curcpu],
> >>> + IPT_ENTRY_ITERATE(e,
> >>> t->size,
> >>> set_entry_to_counter,
> >> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> >> set_entry_to_counter() might get garbage.
> >> I suppose I already mentioned it :)
Need to change preempt_disable to local_bh_disable.
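i.e. something along these lines for the local-CPU pass (a sketch of the suggested change, not the final patch):

        local_bh_disable();     /* keep this CPU's softirq from updating its set mid-copy */
        curcpu = smp_processor_id();
        e = t->entries[curcpu];
        i = 0;
        IPT_ENTRY_ITERATE(e, t->size, set_entry_to_counter, counters, &i);
        /* ... swizzle the other CPUs' sets ... */
        local_bh_enable();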
> >>> counters,
> >>> &i);
> >>>
> >>> for_each_possible_cpu(cpu) {
> >>> + void *p;
> >>> +
> >>> if (cpu == curcpu)
> >>> continue;
> >>> +
> >>> + /* Swizzle the values and wait */
> >>> + e->counters = ((struct xt_counters) { 0, 0 });
> >> I dont see what you want to do here...
> >>
> >> e->counters is the counter associated with rule #0
> >>
> >>> + p = t->entries[cpu];
> >>> + rcu_assign_pointer(t->entries[cpu], e);
> >>> + synchronize_net();
> >>
> >> Oh well, not this synchronize_net() :)
> >>
> >> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> >> on big machines (NR_CPUS >= 64)
For a large number of CPUs it would be possible to allocate
a full set of ncpu-1 new counters, swap them all in, and then do one synchronize_net().
Or instead of synchronize_net(), maybe a "wait for CPU #n to go through a
grace period".
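Roughly (a sketch; new_sets[]/old_sets[] stand for the pre-allocated zeroed counter sets and the saved old ones):

        /* swap every other CPU's set in one pass... */
        for_each_possible_cpu(cpu) {
                if (cpu == curcpu)
                        continue;
                old_sets[cpu] = t->entries[cpu];
                rcu_assign_pointer(t->entries[cpu], new_sets[cpu]);
        }
        /* ...then pay for a single grace period instead of one per CPU */
        synchronize_net();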
> > Why would this not provide the moral equivalent of atomic sampling?
> > The code above switches to another counter set, and waits for a grace
> > period. Shouldn't this mean that all CPUs that were incrementing the
> > old set of counters have finished doing so, so that the aggregate count
> > covers all CPUs that started their increments before the pointer switch?
> > Same as acquiring a write lock, which would wait for all CPUs that
> > started their increments before starting the write-lock acquisition.
> > CPUs that started their increments after starting the write acquisition
> > would not be accounted for in the total, same as the RCU approach.
> >
> > Steve's approach does delay reading out the counters, but it avoids
> > delaying any CPU trying to increment the counters.
>
> I see your point, but this is not what Stephen implemented.
>
> So.. CPU will increments which counters, if not delayed ?
The concept is that only the sum of all the counters matters, not
which CPU has the count.
> How counters will be synced again after our 'iptables -L' finished ?
>
> "iptable -L" is not supposed to miss some counters updates (only some packets
> might be droped at NIC level because we spend time in the collection).
> If packets matches some rules, we really want up2date counters.
>
> Maybe we need for this collection an extra "cpu", to collect
> all increments that were done when CPUs where directed to a
> "secondary table/counters"
>
> >
> > So what am I missing here?
> >
>
> Well, I saw one synchronize_net() inside the for_each_possible_cpu(cpu) loop.
> Say we have NR_CPUS=4096, how long will it takes to perform "iptables -L" ?
>
>
>
> General/intuitive idea would be :
>
> switch pointers to a newly allocated table (and zeroed counters)
> wait one RCU grace period
> collect/sum all counters of "old" table + (all cpus) into user provided table
> restore previous table
> wait one RCU grace period
> disable_bh()
> collect/sum all counters of "new and temporary" table (all cpus) and
> reinject them into local cpu table (this cpu should not be interrupted)
> enable_bh()
>
> This way, "iptables -L" is not too expensive and doesnt block packet processing at all.
Pretty much what I said.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-03 20:44 ` Stephen Hemminger
@ 2009-02-03 21:05 ` Eric Dumazet
0 siblings, 0 replies; 21+ messages in thread
From: Eric Dumazet @ 2009-02-03 21:05 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: paulmck, David Miller, netdev
Stephen Hemminger wrote:
>>
>> General/intuitive idea would be :
>>
>> switch pointers to a newly allocated table (and zeroed counters)
>> wait one RCU grace period
>> collect/sum all counters of "old" table + (all cpus) into user provided table
>> restore previous table
>> wait one RCU grace period
>> disable_bh()
>> collect/sum all counters of "new and temporary" table (all cpus) and
>> reinject them into local cpu table (this cpu should not be interrupted)
>> enable_bh()
>>
>> This way, "iptables -L" is not too expensive and doesnt block packet processing at all.
>
> Pretty much what I said.
Cool then, sorry for misunderstanding your patch.
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-03 20:20 ` Eric Dumazet
2009-02-03 20:44 ` Stephen Hemminger
@ 2009-02-03 21:10 ` Paul E. McKenney
2009-02-03 21:22 ` Stephen Hemminger
1 sibling, 1 reply; 21+ messages in thread
From: Paul E. McKenney @ 2009-02-03 21:10 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Stephen Hemminger, David Miller, netdev
On Tue, Feb 03, 2009 at 09:20:04PM +0100, Eric Dumazet wrote:
> Paul E. McKenney wrote:
> > On Tue, Feb 03, 2009 at 08:00:16PM +0100, Eric Dumazet wrote:
> >> Stephen Hemminger wrote:
> >>> This is an alternative to earlier RCU/seqcount_t version of counters.
> >>> The counters operate as usual without locking, but when counters are rotated
> >>> around the CPU's entries. RCU is used in two ways, first to handle the
> >>> counter rotation, second for replace.
> >> Is it a working patch or just a prototype ?
> >>
> >>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> >>>
> >>> ---
> >>> include/linux/netfilter/x_tables.h | 10 +++-
> >>> net/ipv4/netfilter/arp_tables.c | 73 +++++++++++++++++++++++++++---------
> >>> net/ipv4/netfilter/ip_tables.c | 68 ++++++++++++++++++++++++---------
> >>> net/ipv6/netfilter/ip6_tables.c | 75 ++++++++++++++++++++++++++-----------
> >>> net/netfilter/x_tables.c | 43 +++++++++++++++------
> >>> 5 files changed, 197 insertions(+), 72 deletions(-)
> >>>
> >>> --- a/include/linux/netfilter/x_tables.h 2009-02-02 15:06:39.893751845 -0800
> >>> +++ b/include/linux/netfilter/x_tables.h 2009-02-02 15:28:10.022574005 -0800
> >>> @@ -353,7 +353,7 @@ struct xt_table
> >>> unsigned int valid_hooks;
> >>>
> >>> /* Lock for the curtain */
> >>> - rwlock_t lock;
> >>> + struct mutex lock;
> >>>
> >>> /* Man behind the curtain... */
> >>> struct xt_table_info *private;
> >>> @@ -383,9 +383,15 @@ struct xt_table_info
> >>> unsigned int hook_entry[NF_INET_NUMHOOKS];
> >>> unsigned int underflow[NF_INET_NUMHOOKS];
> >>>
> >>> + /* For the dustman... */
> >>> + union {
> >>> + struct rcu_head rcu;
> >>> + struct work_struct work;
> >>> + };
> >>> +
> >>> /* ipt_entry tables: one per CPU */
> >>> /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> >>> - char *entries[1];
> >>> + void *entries[1];
> >>> };
> >>>
> >>> #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> >>> --- a/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:06:29.684249364 -0800
> >>> +++ b/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:14:13.256499021 -0800
> >>> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
> >>> mtpar.family = tgpar.family = NFPROTO_IPV4;
> >>> tgpar.hooknum = hook;
> >>>
> >>> - read_lock_bh(&table->lock);
> >>> IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> >>> - private = table->private;
> >>> - table_base = (void *)private->entries[smp_processor_id()];
> >>> +
> >>> + rcu_read_lock_bh();
> >>> + private = rcu_dereference(table->private);
> >>> + table_base = rcu_dereference(private->entries[smp_processor_id()]);
> >>> +
> >>> e = get_entry(table_base, private->hook_entry[hook]);
> >>>
> >>> /* For return from builtin chain */
> >>> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> >>> }
> >>> } while (!hotdrop);
> >>>
> >>> - read_unlock_bh(&table->lock);
> >>> + rcu_read_unlock_bh();
> >>>
> >>> #ifdef DEBUG_ALLOW_ALL
> >>> return NF_ACCEPT;
> >>> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
> >>> return 0;
> >>> }
> >>>
> >>> +static inline int
> >>> +set_counter_to_entry(struct ipt_entry *e,
> >>> + const struct ipt_counters total[],
> >>> + unsigned int *i)
> >>> +{
> >>> + SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
> >>> +
> >>> + (*i)++;
> >>> + return 0;
> >>> +}
> >>> +
> >>> +
> >>> static void
> >>> -get_counters(const struct xt_table_info *t,
> >>> +get_counters(struct xt_table_info *t,
> >>> struct xt_counters counters[])
> >>> {
> >>> unsigned int cpu;
> >>> unsigned int i;
> >>> unsigned int curcpu;
> >>> + struct ipt_entry *e;
> >>>
> >>> - /* Instead of clearing (by a previous call to memset())
> >>> - * the counters and using adds, we set the counters
> >>> - * with data used by 'current' CPU
> >>> - * We dont care about preemption here.
> >>> - */
> >>> + preempt_disable();
> >>> curcpu = raw_smp_processor_id();
> >>> -
> >>> + e = t->entries[curcpu];
> >>> i = 0;
> >>> - IPT_ENTRY_ITERATE(t->entries[curcpu],
> >>> + IPT_ENTRY_ITERATE(e,
> >>> t->size,
> >>> set_entry_to_counter,
> >> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> >> set_entry_to_counter() might get garbage.
> >> I suppose I already mentioned it :)
> >>
> >>> counters,
> >>> &i);
> >>>
> >>> for_each_possible_cpu(cpu) {
> >>> + void *p;
> >>> +
> >>> if (cpu == curcpu)
> >>> continue;
> >>> +
> >>> + /* Swizzle the values and wait */
> >>> + e->counters = ((struct xt_counters) { 0, 0 });
> >> I dont see what you want to do here...
> >>
> >> e->counters is the counter associated with rule #0
> >>
> >>> + p = t->entries[cpu];
> >>> + rcu_assign_pointer(t->entries[cpu], e);
> >>> + synchronize_net();
> >>
> >> Oh well, not this synchronize_net() :)
> >>
> >> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> >> on big machines (NR_CPUS >= 64)
> >
> > Why would this not provide the moral equivalent of atomic sampling?
> > The code above switches to another counter set, and waits for a grace
> > period. Shouldn't this mean that all CPUs that were incrementing the
> > old set of counters have finished doing so, so that the aggregate count
> > covers all CPUs that started their increments before the pointer switch?
> > Same as acquiring a write lock, which would wait for all CPUs that
> > started their increments before starting the write-lock acquisition.
> > CPUs that started their increments after starting the write acquisition
> > would not be accounted for in the total, same as the RCU approach.
> >
> > Steve's approach does delay reading out the counters, but it avoids
> > delaying any CPU trying to increment the counters.
>
> I see your point, but this is not what Stephen implemented.
>
> So.. CPU will increments which counters, if not delayed ?
The new set installed by the rcu_assign_pointer().
> How counters will be synced again after our 'iptables -L' finished ?
The usual approach would be to have three sets of counters, one currently
being incremented, one just removed from service, and the last one holding
the cumulative value. After a synchronize_net() following removing
a set from service, you add in the values in the previous set removed
from service. Then you keep the new set for the next 'iptables -L'.
> "iptable -L" is not supposed to miss some counters updates (only some packets
> might be droped at NIC level because we spend time in the collection).
> If packets matches some rules, we really want up2date counters.
No counter updates would be lost using the above method.
> Maybe we need for this collection an extra "cpu", to collect
> all increments that were done when CPUs where directed to a
> "secondary table/counters"
It should be easier to maintain a third set of counters that hold the
accumulated counts from the earlier instances of 'iptables -L'.
> > So what am I missing here?
>
> Well, I saw one synchronize_net() inside the for_each_possible_cpu(cpu) loop.
> Say we have NR_CPUS=4096, how long will it takes to perform "iptables -L" ?
Good point, the for_each_possible_cpu() was cut out -- I should have
gone back and looked at the original patch.
Seems like it should be possible to do a single synchronize_net()
after swizzling all the counters...
> General/intuitive idea would be :
>
> switch pointers to a newly allocated table (and zeroed counters)
> wait one RCU grace period
> collect/sum all counters of "old" table + (all cpus) into user provided table
> restore previous table
> wait one RCU grace period
> disable_bh()
> collect/sum all counters of "new and temporary" table (all cpus) and
> reinject them into local cpu table (this cpu should not be interrupted)
> enable_bh()
>
> This way, "iptables -L" is not too expensive and doesnt block packet processing at all.
My thought would be:
o acquire some sort of mutex.
o switch counters to newly allocated (and zeroed) table (T1).
The old table being switched out is T2.
o wait one RCU grace period.
o Sum T2 into a single global counter (G).
o Free T2.
o Copy G to a local variable.
o release the mutex.
o Return the value of the local variable.
Then you can repeat, allocating a new table again and using the new
value of G.
Which may well be what you are saying above,
Thanx, Paul
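A rough C sketch of that scheme (names are hypothetical: alloc_zeroed_clone(), swap_in_table(), sum_into() and copy_counters() do not exist in the tree; G is the cumulative set):

static DEFINE_MUTEX(counter_mutex);
static struct xt_counters *G;           /* cumulative totals, protected by the mutex */

static struct xt_counters *read_counters(struct xt_table *table)
{
        struct xt_table_info *t1, *t2;
        struct xt_counters *snap;

        mutex_lock(&counter_mutex);
        t1 = alloc_zeroed_clone(table->private);  /* hypothetical */
        t2 = swap_in_table(table, t1);  /* hypothetical: returns the set taken out of service */
        synchronize_net();              /* wait one RCU grace period */
        sum_into(G, t2);                /* hypothetical: accumulate T2 into G */
        xt_free_table_info(t2);
        snap = copy_counters(G);        /* hypothetical: snapshot of G to hand back */
        mutex_unlock(&counter_mutex);

        return snap;
}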
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-03 21:10 ` Paul E. McKenney
@ 2009-02-03 21:22 ` Stephen Hemminger
2009-02-03 21:27 ` Rick Jones
2009-02-03 23:11 ` Paul E. McKenney
0 siblings, 2 replies; 21+ messages in thread
From: Stephen Hemminger @ 2009-02-03 21:22 UTC (permalink / raw)
To: paulmck; +Cc: Eric Dumazet, David Miller, netdev
On Tue, 3 Feb 2009 13:10:00 -0800
"Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
> On Tue, Feb 03, 2009 at 09:20:04PM +0100, Eric Dumazet wrote:
> > Paul E. McKenney wrote:
> > > On Tue, Feb 03, 2009 at 08:00:16PM +0100, Eric Dumazet wrote:
> > >> Stephen Hemminger wrote:
> > >>> This is an alternative to earlier RCU/seqcount_t version of counters.
> > >>> The counters operate as usual without locking, but when counters are rotated
> > >>> around the CPU's entries. RCU is used in two ways, first to handle the
> > >>> counter rotation, second for replace.
> > >> Is it a working patch or just a prototype ?
> > >>
> > >>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > >>>
> > >>> ---
> > >>> include/linux/netfilter/x_tables.h | 10 +++-
> > >>> net/ipv4/netfilter/arp_tables.c | 73 +++++++++++++++++++++++++++---------
> > >>> net/ipv4/netfilter/ip_tables.c | 68 ++++++++++++++++++++++++---------
> > >>> net/ipv6/netfilter/ip6_tables.c | 75 ++++++++++++++++++++++++++-----------
> > >>> net/netfilter/x_tables.c | 43 +++++++++++++++------
> > >>> 5 files changed, 197 insertions(+), 72 deletions(-)
> > >>>
> > >>> --- a/include/linux/netfilter/x_tables.h 2009-02-02 15:06:39.893751845 -0800
> > >>> +++ b/include/linux/netfilter/x_tables.h 2009-02-02 15:28:10.022574005 -0800
> > >>> @@ -353,7 +353,7 @@ struct xt_table
> > >>> unsigned int valid_hooks;
> > >>>
> > >>> /* Lock for the curtain */
> > >>> - rwlock_t lock;
> > >>> + struct mutex lock;
> > >>>
> > >>> /* Man behind the curtain... */
> > >>> struct xt_table_info *private;
> > >>> @@ -383,9 +383,15 @@ struct xt_table_info
> > >>> unsigned int hook_entry[NF_INET_NUMHOOKS];
> > >>> unsigned int underflow[NF_INET_NUMHOOKS];
> > >>>
> > >>> + /* For the dustman... */
> > >>> + union {
> > >>> + struct rcu_head rcu;
> > >>> + struct work_struct work;
> > >>> + };
> > >>> +
> > >>> /* ipt_entry tables: one per CPU */
> > >>> /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> > >>> - char *entries[1];
> > >>> + void *entries[1];
> > >>> };
> > >>>
> > >>> #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> > >>> --- a/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:06:29.684249364 -0800
> > >>> +++ b/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:14:13.256499021 -0800
> > >>> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
> > >>> mtpar.family = tgpar.family = NFPROTO_IPV4;
> > >>> tgpar.hooknum = hook;
> > >>>
> > >>> - read_lock_bh(&table->lock);
> > >>> IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> > >>> - private = table->private;
> > >>> - table_base = (void *)private->entries[smp_processor_id()];
> > >>> +
> > >>> + rcu_read_lock_bh();
> > >>> + private = rcu_dereference(table->private);
> > >>> + table_base = rcu_dereference(private->entries[smp_processor_id()]);
> > >>> +
> > >>> e = get_entry(table_base, private->hook_entry[hook]);
> > >>>
> > >>> /* For return from builtin chain */
> > >>> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> > >>> }
> > >>> } while (!hotdrop);
> > >>>
> > >>> - read_unlock_bh(&table->lock);
> > >>> + rcu_read_unlock_bh();
> > >>>
> > >>> #ifdef DEBUG_ALLOW_ALL
> > >>> return NF_ACCEPT;
> > >>> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
> > >>> return 0;
> > >>> }
> > >>>
> > >>> +static inline int
> > >>> +set_counter_to_entry(struct ipt_entry *e,
> > >>> + const struct ipt_counters total[],
> > >>> + unsigned int *i)
> > >>> +{
> > >>> + SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
> > >>> +
> > >>> + (*i)++;
> > >>> + return 0;
> > >>> +}
> > >>> +
> > >>> +
> > >>> static void
> > >>> -get_counters(const struct xt_table_info *t,
> > >>> +get_counters(struct xt_table_info *t,
> > >>> struct xt_counters counters[])
> > >>> {
> > >>> unsigned int cpu;
> > >>> unsigned int i;
> > >>> unsigned int curcpu;
> > >>> + struct ipt_entry *e;
> > >>>
> > >>> - /* Instead of clearing (by a previous call to memset())
> > >>> - * the counters and using adds, we set the counters
> > >>> - * with data used by 'current' CPU
> > >>> - * We dont care about preemption here.
> > >>> - */
> > >>> + preempt_disable();
> > >>> curcpu = raw_smp_processor_id();
> > >>> -
> > >>> + e = t->entries[curcpu];
> > >>> i = 0;
> > >>> - IPT_ENTRY_ITERATE(t->entries[curcpu],
> > >>> + IPT_ENTRY_ITERATE(e,
> > >>> t->size,
> > >>> set_entry_to_counter,
> > >> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> > >> set_entry_to_counter() might get garbage.
> > >> I suppose I already mentioned it :)
> > >>
> > >>> counters,
> > >>> &i);
> > >>>
> > >>> for_each_possible_cpu(cpu) {
> > >>> + void *p;
> > >>> +
> > >>> if (cpu == curcpu)
> > >>> continue;
> > >>> +
> > >>> + /* Swizzle the values and wait */
> > >>> + e->counters = ((struct xt_counters) { 0, 0 });
> > >> I dont see what you want to do here...
> > >>
> > >> e->counters is the counter associated with rule #0
> > >>
> > >>> + p = t->entries[cpu];
> > >>> + rcu_assign_pointer(t->entries[cpu], e);
> > >>> + synchronize_net();
> > >>
> > >> Oh well, not this synchronize_net() :)
> > >>
> > >> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> > >> on big machines (NR_CPUS >= 64)
> > >
> > > Why would this not provide the moral equivalent of atomic sampling?
> > > The code above switches to another counter set, and waits for a grace
> > > period. Shouldn't this mean that all CPUs that were incrementing the
> > > old set of counters have finished doing so, so that the aggregate count
> > > covers all CPUs that started their increments before the pointer switch?
> > > Same as acquiring a write lock, which would wait for all CPUs that
> > > started their increments before starting the write-lock acquisition.
> > > CPUs that started their increments after starting the write acquisition
> > > would not be accounted for in the total, same as the RCU approach.
> > >
> > > Steve's approach does delay reading out the counters, but it avoids
> > > delaying any CPU trying to increment the counters.
> >
> > I see your point, but this is not what Stephen implemented.
> >
> > So.. CPU will increments which counters, if not delayed ?
>
> The new set installed by the rcu_assign_pointer().
>
> > How counters will be synced again after our 'iptables -L' finished ?
>
> The usual approach would be to have three sets of counters, one currently
> being incremented, one just removed from service, and the last one holding
> the cumulative value. After a synchronize_net() following removing
> a set from service, you add in the values in the previous set removed
> from service. Then you keep the new set for the next 'iptables -L'.
>
> > "iptable -L" is not supposed to miss some counters updates (only some packets
> > might be droped at NIC level because we spend time in the collection).
> > If packets matches some rules, we really want up2date counters.
>
> No counter updates would be lost using the above method.
>
> > Maybe we need for this collection an extra "cpu", to collect
> > all increments that were done when CPUs where directed to a
> > "secondary table/counters"
>
> It should be easier to maintain a third set of counters that hold the
> accumulated counts from the earlier instances of 'iptables -L'.
>
> > > So what am I missing here?
> >
> > Well, I saw one synchronize_net() inside the for_each_possible_cpu(cpu) loop.
> > Say we have NR_CPUS=4096, how long will it takes to perform "iptables -L" ?
>
> Good point, the for_each_possible_cpu() was cut out -- I should have
> gone back and looked at the original patch.
>
> Seems like it should be possible to do a single synchronize_net()
> after swizzling all the counters...
>
> > General/intuitive idea would be :
> >
> > switch pointers to a newly allocated table (and zeroed counters)
> > wait one RCU grace period
> > collect/sum all counters of "old" table + (all cpus) into user provided table
> > restore previous table
> > wait one RCU grace period
> > disable_bh()
> > collect/sum all counters of "new and temporary" table (all cpus) and
> > reinject them into local cpu table (this cpu should not be interrupted)
> > enable_bh()
> >
> > This way, "iptables -L" is not too expensive and doesnt block packet processing at all.
>
> My thought would be:
>
> o acquire some sort of mutex.
>
> o switch counters to newly allocated (and zeroed) table (T1).
> The old table being switched out is T2.
>
> o wait one RCU grace period.
>
> o Sum T2 into a single global counter (G).
>
> o Free T2.
>
> o Copy G to a local variable.
>
> o release the mutex.
>
> o Return the value of the local variable.
>
> Then you can repeat, allocating a new table again and using the new
> value of G.
>
> Which may well be what you are saying above,
>
I was using the current CPU counter as the global counter G.
> Thanx, Paul
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-03 21:22 ` Stephen Hemminger
@ 2009-02-03 21:27 ` Rick Jones
2009-02-03 23:11 ` Paul E. McKenney
1 sibling, 0 replies; 21+ messages in thread
From: Rick Jones @ 2009-02-03 21:27 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: paulmck, Eric Dumazet, David Miller, netdev
So, is this patch stream at a point where I should try running the 32-core
netperf tests against it? And if so, against which tree should I try applying
patches?
rick jones
^ permalink raw reply [flat|nested] 21+ messages in thread
* Re: [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-03 21:22 ` Stephen Hemminger
2009-02-03 21:27 ` Rick Jones
@ 2009-02-03 23:11 ` Paul E. McKenney
2009-02-03 23:18 ` Stephen Hemminger
1 sibling, 1 reply; 21+ messages in thread
From: Paul E. McKenney @ 2009-02-03 23:11 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: Eric Dumazet, David Miller, netdev
On Tue, Feb 03, 2009 at 01:22:20PM -0800, Stephen Hemminger wrote:
> On Tue, 3 Feb 2009 13:10:00 -0800
> "Paul E. McKenney" <paulmck@linux.vnet.ibm.com> wrote:
>
> > On Tue, Feb 03, 2009 at 09:20:04PM +0100, Eric Dumazet wrote:
> > > Paul E. McKenney wrote:
> > > > On Tue, Feb 03, 2009 at 08:00:16PM +0100, Eric Dumazet wrote:
> > > >> Stephen Hemminger wrote:
> > > >>> This is an alternative to earlier RCU/seqcount_t version of counters.
> > > >>> The counters operate as usual without locking, but when counters are rotated
> > > >>> around the CPU's entries. RCU is used in two ways, first to handle the
> > > >>> counter rotation, second for replace.
> > > >> Is it a working patch or just a prototype ?
> > > >>
> > > >>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
> > > >>>
> > > >>> ---
> > > >>> include/linux/netfilter/x_tables.h | 10 +++-
> > > >>> net/ipv4/netfilter/arp_tables.c | 73 +++++++++++++++++++++++++++---------
> > > >>> net/ipv4/netfilter/ip_tables.c | 68 ++++++++++++++++++++++++---------
> > > >>> net/ipv6/netfilter/ip6_tables.c | 75 ++++++++++++++++++++++++++-----------
> > > >>> net/netfilter/x_tables.c | 43 +++++++++++++++------
> > > >>> 5 files changed, 197 insertions(+), 72 deletions(-)
> > > >>>
> > > >>> --- a/include/linux/netfilter/x_tables.h 2009-02-02 15:06:39.893751845 -0800
> > > >>> +++ b/include/linux/netfilter/x_tables.h 2009-02-02 15:28:10.022574005 -0800
> > > >>> @@ -353,7 +353,7 @@ struct xt_table
> > > >>> unsigned int valid_hooks;
> > > >>>
> > > >>> /* Lock for the curtain */
> > > >>> - rwlock_t lock;
> > > >>> + struct mutex lock;
> > > >>>
> > > >>> /* Man behind the curtain... */
> > > >>> struct xt_table_info *private;
> > > >>> @@ -383,9 +383,15 @@ struct xt_table_info
> > > >>> unsigned int hook_entry[NF_INET_NUMHOOKS];
> > > >>> unsigned int underflow[NF_INET_NUMHOOKS];
> > > >>>
> > > >>> + /* For the dustman... */
> > > >>> + union {
> > > >>> + struct rcu_head rcu;
> > > >>> + struct work_struct work;
> > > >>> + };
> > > >>> +
> > > >>> /* ipt_entry tables: one per CPU */
> > > >>> /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
> > > >>> - char *entries[1];
> > > >>> + void *entries[1];
> > > >>> };
> > > >>>
> > > >>> #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
> > > >>> --- a/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:06:29.684249364 -0800
> > > >>> +++ b/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:14:13.256499021 -0800
> > > >>> @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
> > > >>> mtpar.family = tgpar.family = NFPROTO_IPV4;
> > > >>> tgpar.hooknum = hook;
> > > >>>
> > > >>> - read_lock_bh(&table->lock);
> > > >>> IP_NF_ASSERT(table->valid_hooks & (1 << hook));
> > > >>> - private = table->private;
> > > >>> - table_base = (void *)private->entries[smp_processor_id()];
> > > >>> +
> > > >>> + rcu_read_lock_bh();
> > > >>> + private = rcu_dereference(table->private);
> > > >>> + table_base = rcu_dereference(private->entries[smp_processor_id()]);
> > > >>> +
> > > >>> e = get_entry(table_base, private->hook_entry[hook]);
> > > >>>
> > > >>> /* For return from builtin chain */
> > > >>> @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
> > > >>> }
> > > >>> } while (!hotdrop);
> > > >>>
> > > >>> - read_unlock_bh(&table->lock);
> > > >>> + rcu_read_unlock_bh();
> > > >>>
> > > >>> #ifdef DEBUG_ALLOW_ALL
> > > >>> return NF_ACCEPT;
> > > >>> @@ -892,45 +894,73 @@ set_entry_to_counter(const struct ipt_en
> > > >>> return 0;
> > > >>> }
> > > >>>
> > > >>> +static inline int
> > > >>> +set_counter_to_entry(struct ipt_entry *e,
> > > >>> + const struct ipt_counters total[],
> > > >>> + unsigned int *i)
> > > >>> +{
> > > >>> + SET_COUNTER(e->counters, total[*i].bcnt, total[*i].pcnt);
> > > >>> +
> > > >>> + (*i)++;
> > > >>> + return 0;
> > > >>> +}
> > > >>> +
> > > >>> +
> > > >>> static void
> > > >>> -get_counters(const struct xt_table_info *t,
> > > >>> +get_counters(struct xt_table_info *t,
> > > >>> struct xt_counters counters[])
> > > >>> {
> > > >>> unsigned int cpu;
> > > >>> unsigned int i;
> > > >>> unsigned int curcpu;
> > > >>> + struct ipt_entry *e;
> > > >>>
> > > >>> - /* Instead of clearing (by a previous call to memset())
> > > >>> - * the counters and using adds, we set the counters
> > > >>> - * with data used by 'current' CPU
> > > >>> - * We dont care about preemption here.
> > > >>> - */
> > > >>> + preempt_disable();
> > > >>> curcpu = raw_smp_processor_id();
> > > >>> -
> > > >>> + e = t->entries[curcpu];
> > > >>> i = 0;
> > > >>> - IPT_ENTRY_ITERATE(t->entries[curcpu],
> > > >>> + IPT_ENTRY_ITERATE(e,
> > > >>> t->size,
> > > >>> set_entry_to_counter,
> > > >> Hum, current cpu might be interrupted by NIC, since you only disabled preemption.
> > > >> set_entry_to_counter() might get garbage.
> > > >> I suppose I already mentioned it :)
> > > >>
> > > >>> counters,
> > > >>> &i);
> > > >>>
> > > >>> for_each_possible_cpu(cpu) {
> > > >>> + void *p;
> > > >>> +
> > > >>> if (cpu == curcpu)
> > > >>> continue;
> > > >>> +
> > > >>> + /* Swizzle the values and wait */
> > > >>> + e->counters = ((struct xt_counters) { 0, 0 });
> > > >> I dont see what you want to do here...
> > > >>
> > > >> e->counters is the counter associated with rule #0
> > > >>
> > > >>> + p = t->entries[cpu];
> > > >>> + rcu_assign_pointer(t->entries[cpu], e);
> > > >>> + synchronize_net();
> > > >>
> > > >> Oh well, not this synchronize_net() :)
> > > >>
> > > >> This wont provide atomic sampling of counters for whole CPUS, and introduce large delays
> > > >> on big machines (NR_CPUS >= 64)
> > > >
> > > > Why would this not provide the moral equivalent of atomic sampling?
> > > > The code above switches to another counter set, and waits for a grace
> > > > period. Shouldn't this mean that all CPUs that were incrementing the
> > > > old set of counters have finished doing so, so that the aggregate count
> > > > covers all CPUs that started their increments before the pointer switch?
> > > > Same as acquiring a write lock, which would wait for all CPUs that
> > > > started their increments before starting the write-lock acquisition.
> > > > CPUs that started their increments after starting the write acquisition
> > > > would not be accounted for in the total, same as the RCU approach.
> > > >
> > > > Steve's approach does delay reading out the counters, but it avoids
> > > > delaying any CPU trying to increment the counters.
> > >
> > > I see your point, but this is not what Stephen implemented.
> > >
> > > So.. CPU will increments which counters, if not delayed ?
> >
> > The new set installed by the rcu_assign_pointer().
> >
> > > How counters will be synced again after our 'iptables -L' finished ?
> >
> > The usual approach would be to have three sets of counters, one currently
> > being incremented, one just removed from service, and the last one holding
> > the cumulative value. After a synchronize_net() following removing
> > a set from service, you add in the values in the previous set removed
> > from service. Then you keep the new set for the next 'iptables -L'.
> >
> > > "iptable -L" is not supposed to miss some counters updates (only some packets
> > > might be droped at NIC level because we spend time in the collection).
> > > If packets matches some rules, we really want up2date counters.
> >
> > No counter updates would be lost using the above method.
> >
> > > Maybe we need for this collection an extra "cpu", to collect
> > > all increments that were done when CPUs where directed to a
> > > "secondary table/counters"
> >
> > It should be easier to maintain a third set of counters that hold the
> > accumulated counts from the earlier instances of 'iptables -L'.
> >
> > > > So what am I missing here?
> > >
> > > Well, I saw one synchronize_net() inside the for_each_possible_cpu(cpu) loop.
> > > Say we have NR_CPUS=4096, how long will it takes to perform "iptables -L" ?
> >
> > Good point, the for_each_possible_cpu() was cut out -- I should have
> > gone back and looked at the original patch.
> >
> > Seems like it should be possible to do a single synchronize_net()
> > after swizzling all the counters...
> >
> > > General/intuitive idea would be :
> > >
> > > switch pointers to a newly allocated table (and zeroed counters)
> > > wait one RCU grace period
> > > collect/sum all counters of "old" table + (all cpus) into user provided table
> > > restore previous table
> > > wait one RCU grace period
> > > disable_bh()
> > > collect/sum all counters of "new and temporary" table (all cpus) and
> > > reinject them into local cpu table (this cpu should not be interrupted)
> > > enable_bh()
> > >
> > > This way, "iptables -L" is not too expensive and doesnt block packet processing at all.
> >
> > My thought would be:
> >
> > o acquire some sort of mutex.
> >
> > o switch counters to newly allocated (and zeroed) table (T1).
> > The old table being switched out is T2.
> >
> > o wait one RCU grace period.
> >
> > o Sum T2 into a single global counter (G).
> >
> > o Free T2.
> >
> > o Copy G to a local variable.
> >
> > o release the mutex.
> >
> > o Return the value of the local variable.
> >
> > Then you can repeat, allocating a new table again and using the new
> > value of G.
> >
> > Which may well be what you are saying above,
> >
>
> I was using the current CPU counter as the global counter G.
Sounds good, then! ;-)
Thanx, Paul
^ permalink raw reply [flat|nested] 21+ messages in thread* Re: [PATCH 3/3] iptables: lock free counters (alternate version)
2009-02-03 23:11 ` Paul E. McKenney
@ 2009-02-03 23:18 ` Stephen Hemminger
0 siblings, 0 replies; 21+ messages in thread
From: Stephen Hemminger @ 2009-02-03 23:18 UTC (permalink / raw)
To: paulmck; +Cc: Eric Dumazet, David Miller, netdev
These are the "interesting bits" of what I am testing.
What it does is swap in new counters, sum, then put counter values
back onto the current cpu.
The replace case is easier since the counters are "old" at that point.
--- a/net/ipv4/netfilter/ip_tables.c 2009-02-02 15:06:29.684249364 -0800
+++ b/net/ipv4/netfilter/ip_tables.c 2009-02-03 15:08:30.230188116 -0800
@@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb,
mtpar.family = tgpar.family = NFPROTO_IPV4;
tgpar.hooknum = hook;
- read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- private = table->private;
- table_base = (void *)private->entries[smp_processor_id()];
+
+ rcu_read_lock_bh();
+ private = rcu_dereference(table->private);
+ table_base = rcu_dereference(private->entries[smp_processor_id()]);
+
e = get_entry(table_base, private->hook_entry[hook]);
/* For return from builtin chain */
@@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb,
}
} while (!hotdrop);
- read_unlock_bh(&table->lock);
+ rcu_read_unlock_bh();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -924,13 +926,62 @@ get_counters(const struct xt_table_info
counters,
&i);
}
+
+}
+
+/* Swap existing counter entries with new zeroed ones */
+static void swap_counters(struct xt_table_info *o, struct xt_table_info *n)
+{
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu) {
+ void *p = o->entries[cpu];
+ memset(n->entries[cpu], 0, n->size);
+
+ rcu_assign_pointer(o->entries[cpu], n->entries[cpu]);
+ n->entries[cpu] = p;
+ }
+
+ /* Wait until smoke has cleared */
+ synchronize_net();
+}
+
+/* We're lazy, and add to the first CPU; overflow works its fey magic
+ * and everything is OK. */
+static int
+add_counter_to_entry(struct ipt_entry *e,
+ const struct xt_counters addme[],
+ unsigned int *i)
+{
+ ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
+
+ (*i)++;
+ return 0;
+}
+
+/* Take values from counters and add them back onto the current cpu */
+static void put_counters(struct xt_table_info *t,
+ const struct xt_counters counters[])
+{
+ unsigned int i, cpu;
+
+ local_bh_disable();
+ cpu = smp_processor_id();
+ i = 0;
+ IPT_ENTRY_ITERATE(t->entries[cpu],
+ t->size,
+ add_counter_to_entry,
+ counters,
+ &i);
+ local_bh_enable();
}
static struct xt_counters * alloc_counters(struct xt_table *table)
{
unsigned int countersize;
struct xt_counters *counters;
- const struct xt_table_info *private = table->private;
+ struct xt_table_info *private = table->private;
+ struct xt_table_info *tmp;
/* We need atomic snapshot of counters: rest doesn't change
(other than comefrom, which userspace doesn't care
@@ -939,14 +990,26 @@ static struct xt_counters * alloc_counte
counters = vmalloc_node(countersize, numa_node_id());
if (counters == NULL)
- return ERR_PTR(-ENOMEM);
+ goto nomem;
+
+ tmp = xt_alloc_table_info(private->size);
+ if (!tmp)
+ goto free_counters;
- /* First, sum counters... */
- write_lock_bh(&table->lock);
+ mutex_lock(&table->lock);
+ swap_counters(private, tmp);
get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ put_counters(private, counters);
+ mutex_unlock(&table->lock);
+
+ xt_free_table_info(tmp);
return counters;
+
+ free_counters:
+ vfree(counters);
+ nomem:
+ return ERR_PTR(-ENOMEM);
}
static int
@@ -1242,7 +1305,9 @@ __do_replace(struct net *net, const char
module_put(t->me);
/* Get the old counters. */
+ synchronize_net();
get_counters(oldinfo, counters);
+
/* Decrease module usage counts and free resource */
loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,
@@ -1312,27 +1377,6 @@ do_replace(struct net *net, void __user
return ret;
}
-/* We're lazy, and add to the first CPU; overflow works its fey magic
- * and everything is OK. */
-static int
-add_counter_to_entry(struct ipt_entry *e,
- const struct xt_counters addme[],
- unsigned int *i)
-{
-#if 0
- duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n",
- *i,
- (long unsigned int)e->counters.pcnt,
- (long unsigned int)e->counters.bcnt,
- (long unsigned int)addme[*i].pcnt,
- (long unsigned int)addme[*i].bcnt);
-#endif
-
- ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt);
-
- (*i)++;
- return 0;
-}
static int
do_add_counters(struct net *net, void __user *user, unsigned int len, int compat)
@@ -1393,13 +1437,14 @@ do_add_counters(struct net *net, void __
goto free;
}
- write_lock_bh(&t->lock);
+ mutex_lock(&t->lock);
private = t->private;
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
}
+ preempt_disable();
i = 0;
/* Choose the copy that is on our node */
loc_cpu_entry = private->entries[raw_smp_processor_id()];
@@ -1408,8 +1453,9 @@ do_add_counters(struct net *net, void __
add_counter_to_entry,
paddc,
&i);
+ preempt_enable();
unlock_up_free:
- write_unlock_bh(&t->lock);
+ mutex_unlock(&t->lock);
xt_table_unlock(t);
module_put(t->me);
free:
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH 5/6] netfilter: use sequence number synchronization for counters
2009-01-30 21:57 [PATCH 0/6] iptables: eliminate read/write lock (v0.4) Stephen Hemminger
` (3 preceding siblings ...)
2009-01-30 21:57 ` [PATCH 4/6] netfilter: abstract xt_counters Stephen Hemminger
@ 2009-01-30 21:57 ` Stephen Hemminger
2009-01-30 21:57 ` [PATCH 6/6] netfilter: convert x_tables to use RCU Stephen Hemminger
5 siblings, 0 replies; 21+ messages in thread
From: Stephen Hemminger @ 2009-01-30 21:57 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: counters-seqcount.patch --]
[-- Type: text/plain, Size: 1330 bytes --]
Change how synchronization is done on the iptables counters. Use a seqcount
wrapper instead of depending on the reader/writer lock.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
--- a/net/netfilter/x_tables.c 2009-01-30 09:15:52.648542116 -0800
+++ b/net/netfilter/x_tables.c 2009-01-30 09:17:03.669061821 -0800
@@ -577,18 +577,36 @@ int xt_compat_target_to_user(struct xt_e
EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
#endif
+static DEFINE_PER_CPU(seqcount_t, xt_counter_sequence);
+
+/* Update the counters on the current CPU. preempt must be disabled. */
void xt_add_counter(struct xt_counters *c, unsigned b, unsigned p)
{
+ seqcount_t *seq = &__get_cpu_var(xt_counter_sequence);
+
+ write_seqcount_begin(seq);
c->bcnt += b;
c->pcnt += p;
+ write_seqcount_end(seq);
}
EXPORT_SYMBOL_GPL(xt_add_counter);
+/* Fetch counters on other CPU. */
void xt_sum_counter(struct xt_counters *t, int cpu,
const struct xt_counters *c)
{
- t->pcnt += c->pcnt;
- t->bcnt += c->bcnt;
+ seqcount_t *seq = &per_cpu(xt_counter_sequence, cpu);
+ unsigned start;
+ struct xt_counters v;
+
+ /* Atomic fetch of counter value */
+ do {
+ start = read_seqcount_begin(seq);
+ v = *c;
+ } while (read_seqcount_retry(seq, start));
+
+ t->pcnt += v.pcnt;
+ t->bcnt += v.bcnt;
}
EXPORT_SYMBOL_GPL(xt_sum_counter);
--
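For reference, callers would use the two helpers roughly like this (a sketch; the surrounding iteration and the rule entry e are assumptions, the real call sites come from the abstraction patch 4/6):

        /* hot path, on the local CPU with preemption already disabled */
        xt_add_counter(&e->counters, skb->len, 1);

        /* collection path, reading rule i's counters from another CPU's copy */
        xt_sum_counter(&counters[i], cpu, &e->counters);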
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH 6/6] netfilter: convert x_tables to use RCU
2009-01-30 21:57 [PATCH 0/6] iptables: eliminate read/write lock (v0.4) Stephen Hemminger
` (4 preceding siblings ...)
2009-01-30 21:57 ` [PATCH 5/6] netfilter: use sequence number synchronization for counters Stephen Hemminger
@ 2009-01-30 21:57 ` Stephen Hemminger
2009-01-31 17:27 ` Eric Dumazet
5 siblings, 1 reply; 21+ messages in thread
From: Stephen Hemminger @ 2009-01-30 21:57 UTC (permalink / raw)
To: David Miller; +Cc: netdev
[-- Attachment #1: iptables-rcu.patch --]
[-- Type: text/plain, Size: 9803 bytes --]
Replace the existing reader/writer lock with Read-Copy-Update to
eliminate the overhead of a read lock on each incoming packet.
This should reduce the overhead of iptables especially on SMP
systems.
The previous code used a reader-writer lock for two purposes.
The first was to ensure that the xt_table_info reference was not in the
process of being changed. Since xt_table_info is only freed via one
routine, it was a direct conversion to RCU.
The other use of the reader-writer lock was to block changes
to counters while they were being read. That synchronization was
addressed by the previous patch, but we still need to make sure the table info
isn't going away.
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
---
include/linux/netfilter/x_tables.h | 10 ++++++--
net/ipv4/netfilter/arp_tables.c | 16 ++++++-------
net/ipv4/netfilter/ip_tables.c | 27 +++++++++++-----------
net/ipv6/netfilter/ip6_tables.c | 16 ++++++-------
net/netfilter/x_tables.c | 45 ++++++++++++++++++++++++++-----------
5 files changed, 70 insertions(+), 44 deletions(-)
--- a/include/linux/netfilter/x_tables.h 2009-01-30 09:15:52.700542193 -0800
+++ b/include/linux/netfilter/x_tables.h 2009-01-30 09:17:25.888041887 -0800
@@ -356,8 +356,8 @@ struct xt_table
/* What hooks you will enter on */
unsigned int valid_hooks;
- /* Lock for the curtain */
- rwlock_t lock;
+ /* Lock for curtain */
+ spinlock_t lock;
/* Man behind the curtain... */
struct xt_table_info *private;
@@ -387,6 +387,12 @@ struct xt_table_info
unsigned int hook_entry[NF_INET_NUMHOOKS];
unsigned int underflow[NF_INET_NUMHOOKS];
+ /* For the dustman... */
+ union {
+ struct rcu_head rcu;
+ struct work_struct work;
+ };
+
/* ipt_entry tables: one per CPU */
/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
char *entries[1];
--- a/net/ipv4/netfilter/arp_tables.c 2009-01-30 09:15:52.636542607 -0800
+++ b/net/ipv4/netfilter/arp_tables.c 2009-01-30 09:24:52.004793273 -0800
@@ -237,8 +237,8 @@ unsigned int arpt_do_table(struct sk_buf
indev = in ? in->name : nulldevname;
outdev = out ? out->name : nulldevname;
- read_lock_bh(&table->lock);
- private = table->private;
+ rcu_read_lock_bh();
+ private = rcu_dereference(table->private);
table_base = (void *)private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
back = get_entry(table_base, private->underflow[hook]);
@@ -311,7 +311,7 @@ unsigned int arpt_do_table(struct sk_buf
e = (void *)e + e->next_offset;
}
} while (!hotdrop);
- read_unlock_bh(&table->lock);
+ rcu_read_unlock_bh();
if (hotdrop)
return NF_DROP;
@@ -733,9 +733,9 @@ static inline struct xt_counters *alloc_
return ERR_PTR(-ENOMEM);
/* First, sum counters... */
- write_lock_bh(&table->lock);
+ local_bh_enable();
get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ local_bh_disable();
return counters;
}
@@ -1149,8 +1149,8 @@ static int do_add_counters(struct net *n
goto free;
}
- write_lock_bh(&t->lock);
- private = t->private;
+ rcu_read_lock_bh();
+ private = rcu_dereference(t->private);
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
@@ -1165,7 +1165,7 @@ static int do_add_counters(struct net *n
paddc,
&i);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ rcu_read_unlock_bh();
xt_table_unlock(t);
module_put(t->me);
free:
--- a/net/ipv4/netfilter/ip_tables.c 2009-01-30 09:15:52.624542483 -0800
+++ b/net/ipv4/netfilter/ip_tables.c 2009-01-30 09:25:07.776040828 -0800
@@ -66,11 +66,12 @@ do { \
#endif
/*
- We keep a set of rules for each CPU, so we can avoid write-locking
- them in the softirq when updating the counters and therefore
- only need to read-lock in the softirq; doing a write_lock_bh() in user
- context stops packets coming through and allows user context to read
- the counters or update the rules.
+ We keep a set of rules for each CPU, so we can avoid locking
+ them in the softirq when updating the counters. We use a sequence
+ counter to keep the counters consistent and RCU to prevent
+ handle counters during replace operation. When reading the
+ counters, need to have bottom half and preempt disabled to
+ get a consistent data.
Hence the start of any table is given by get_table() below. */
@@ -347,9 +348,9 @@ ipt_do_table(struct sk_buff *skb,
mtpar.family = tgpar.family = NFPROTO_IPV4;
tgpar.hooknum = hook;
- read_lock_bh(&table->lock);
+ rcu_read_lock_bh();
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- private = table->private;
+ private = rcu_dereference(table->private);
table_base = (void *)private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -445,7 +446,7 @@ ipt_do_table(struct sk_buff *skb,
}
} while (!hotdrop);
- read_unlock_bh(&table->lock);
+ rcu_read_unlock_bh();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -944,9 +945,9 @@ static struct xt_counters * alloc_counte
return ERR_PTR(-ENOMEM);
/* First, sum counters... */
- write_lock_bh(&table->lock);
+ local_bh_disable();
get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ local_bh_enable();
return counters;
}
@@ -1394,8 +1395,8 @@ do_add_counters(struct net *net, void __
goto free;
}
- write_lock_bh(&t->lock);
- private = t->private;
+ rcu_read_lock_bh();
+ private = rcu_dereference(t->private);
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
@@ -1410,7 +1411,7 @@ do_add_counters(struct net *net, void __
paddc,
&i);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ rcu_read_unlock_bh();
xt_table_unlock(t);
module_put(t->me);
free:
--- a/net/ipv6/netfilter/ip6_tables.c 2009-01-30 09:15:52.684541784 -0800
+++ b/net/ipv6/netfilter/ip6_tables.c 2009-01-30 09:25:43.756056066 -0800
@@ -373,9 +373,9 @@ ip6t_do_table(struct sk_buff *skb,
mtpar.family = tgpar.family = NFPROTO_IPV6;
tgpar.hooknum = hook;
- read_lock_bh(&table->lock);
+ rcu_read_lock_bh();
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- private = table->private;
+ private = rcu_dereference(table->private);
table_base = (void *)private->entries[smp_processor_id()];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -474,7 +474,7 @@ ip6t_do_table(struct sk_buff *skb,
#ifdef CONFIG_NETFILTER_DEBUG
((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON;
#endif
- read_unlock_bh(&table->lock);
+ rcu_read_unlock_bh();
#ifdef DEBUG_ALLOW_ALL
return NF_ACCEPT;
@@ -973,9 +973,9 @@ static struct xt_counters *alloc_counter
return ERR_PTR(-ENOMEM);
/* First, sum counters... */
- write_lock_bh(&table->lock);
+ local_bh_disable();
get_counters(private, counters);
- write_unlock_bh(&table->lock);
+ local_bh_enable();
return counters;
}
@@ -1425,8 +1425,8 @@ do_add_counters(struct net *net, void __
goto free;
}
- write_lock_bh(&t->lock);
- private = t->private;
+ rcu_read_lock_bh();
+ private = rcu_dereference(t->private);
if (private->number != num_counters) {
ret = -EINVAL;
goto unlock_up_free;
@@ -1441,7 +1441,7 @@ do_add_counters(struct net *net, void __
paddc,
&i);
unlock_up_free:
- write_unlock_bh(&t->lock);
+ rcu_read_unlock_bh();
xt_table_unlock(t);
module_put(t->me);
free:
--- a/net/netfilter/x_tables.c 2009-01-30 09:17:03.669061821 -0800
+++ b/net/netfilter/x_tables.c 2009-01-30 09:17:25.892042053 -0800
@@ -644,18 +644,37 @@ struct xt_table_info *xt_alloc_table_inf
}
EXPORT_SYMBOL(xt_alloc_table_info);
-void xt_free_table_info(struct xt_table_info *info)
+/* callback to do free for vmalloc'd case */
+static void xt_free_table_info_work(struct work_struct *arg)
{
- int cpu;
+ struct xt_table_info *info = container_of(arg, struct xt_table_info, work);
+ unsigned int cpu;
- for_each_possible_cpu(cpu) {
- if (info->size <= PAGE_SIZE)
- kfree(info->entries[cpu]);
- else
- vfree(info->entries[cpu]);
- }
+ for_each_possible_cpu(cpu)
+ vfree(info->entries[cpu]);
kfree(info);
}
+
+static void xt_free_table_info_rcu(struct rcu_head *arg)
+{
+ struct xt_table_info *info = container_of(arg, struct xt_table_info, rcu);
+
+ if (info->size <= PAGE_SIZE) {
+ unsigned int cpu;
+ for_each_possible_cpu(cpu)
+ kfree(info->entries[cpu]);
+ kfree(info);
+ } else {
+ /* can't safely call vfree in current context */
+ INIT_WORK(&info->work, xt_free_table_info_work);
+ schedule_work(&info->work);
+ }
+}
+
+void xt_free_table_info(struct xt_table_info *info)
+{
+ call_rcu(&info->rcu, xt_free_table_info_rcu);
+}
EXPORT_SYMBOL(xt_free_table_info);
/* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */
@@ -704,20 +723,20 @@ xt_replace_table(struct xt_table *table,
struct xt_table_info *oldinfo, *private;
/* Do the substitution. */
- write_lock_bh(&table->lock);
+ spin_lock_bh(&table->lock);
private = table->private;
/* Check inside lock: is the old number correct? */
if (num_counters != private->number) {
duprintf("num_counters != table->private->number (%u/%u)\n",
num_counters, private->number);
- write_unlock_bh(&table->lock);
+ spin_unlock_bh(&table->lock);
*error = -EAGAIN;
return NULL;
}
oldinfo = private;
- table->private = newinfo;
+ rcu_assign_pointer(table->private, newinfo);
newinfo->initial_entries = oldinfo->initial_entries;
- write_unlock_bh(&table->lock);
+ spin_unlock_bh(&table->lock);
return oldinfo;
}
@@ -752,7 +771,7 @@ struct xt_table *xt_register_table(struc
/* Simplifies replace_table code. */
table->private = bootstrap;
- rwlock_init(&table->lock);
+ spin_lock_init(&table->lock);
if (!xt_replace_table(table, 0, newinfo, &ret))
goto unlock;
--
^ permalink raw reply	[flat|nested] 21+ messages in thread
* Re: [PATCH 6/6] netfilter: convert x_tables to use RCU
2009-01-30 21:57 ` [PATCH 6/6] netfilter: convert x_tables to use RCU Stephen Hemminger
@ 2009-01-31 17:27 ` Eric Dumazet
0 siblings, 0 replies; 21+ messages in thread
From: Eric Dumazet @ 2009-01-31 17:27 UTC (permalink / raw)
To: Stephen Hemminger; +Cc: David Miller, netdev
Stephen Hemminger wrote:
> Replace existing reader/writer lock with Read-Copy-Update to
> eliminate the overhead of a read lock on each incoming packet.
> This should reduce the overhead of iptables especially on SMP
> systems.
>
> The previous code used a reader-writer lock for two purposes.
> The first was to ensure that the xt_table_info reference was not in the
> process of being changed. Since xt_table_info is only freed via one
> routine, it was a direct conversion to RCU.
>
> The other use of the reader-writer lock was to block changes
> to counters while they were being read. This synchronization was
> fixed by the previous patch. But we still need to make sure the table info
> isn't going away.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
>
> ---
> include/linux/netfilter/x_tables.h | 10 ++++++--
> net/ipv4/netfilter/arp_tables.c | 16 ++++++-------
> net/ipv4/netfilter/ip_tables.c | 27 +++++++++++-----------
> net/ipv6/netfilter/ip6_tables.c | 16 ++++++-------
> net/netfilter/x_tables.c | 45 ++++++++++++++++++++++++++-----------
> 5 files changed, 70 insertions(+), 44 deletions(-)
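In short, the conversion quoted above drops the per-packet read lock in favour
of an RCU read-side section, and publishes a new ruleset with rcu_assign_pointer()
under the table spinlock. Roughly (a paraphrase of the hunks above, not
verbatim kernel code):

	/* packet path (ipt_do_table / ip6t_do_table after the patch) */
	rcu_read_lock_bh();
	private = rcu_dereference(table->private);
	table_base = (void *)private->entries[smp_processor_id()];
	/* ... walk the rules starting at private->hook_entry[hook] ... */
	rcu_read_unlock_bh();

	/* ruleset replacement (xt_replace_table after the patch) */
	spin_lock_bh(&table->lock);
	oldinfo = table->private;
	rcu_assign_pointer(table->private, newinfo);
	spin_unlock_bh(&table->lock);
	/* oldinfo is freed later, via call_rcu() in xt_free_table_info() */
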
OK, I have a last comment, Stephen.
(I tested your patches on my dev machine and got nice results)
In net/ipv4/netfilter/ip_tables.c,
get_counters() can be quite slow on large firewalls, and it is
called from alloc_counters() with BH disabled, but also from __do_replace()
without BH disabled.
So get_counters() first iterates with set_entry_to_counter() for the current cpu,
but it might be interrupted and we get bad results...
The solution is to always use seqcount_t protection, and no BH disabling at all.
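Something along these lines would do it -- purely an illustrative sketch, with
made-up names (xt_ctr_seq, xt_add_counter, xt_read_counter are not existing
kernel symbols):

	#include <linux/percpu.h>
	#include <linux/seqlock.h>
	#include <linux/netfilter/x_tables.h>

	static DEFINE_PER_CPU(seqcount_t, xt_ctr_seq);

	/* packet path: counters are only modified on the local cpu,
	 * with BH already disabled, so the write side never nests. */
	static void xt_add_counter(struct xt_counters *c, unsigned int bytes)
	{
		seqcount_t *s = &__get_cpu_var(xt_ctr_seq);

		write_seqcount_begin(s);
		c->bcnt += bytes;
		c->pcnt++;
		write_seqcount_end(s);
	}

	/* get_counters(): read one cpu's counters without torn values,
	 * retrying if we raced with an update on that cpu. */
	static void xt_read_counter(const struct xt_counters *c, int cpu,
				    struct xt_counters *tmp)
	{
		const seqcount_t *s = &per_cpu(xt_ctr_seq, cpu);
		unsigned int start;

		do {
			start = read_seqcount_begin(s);
			tmp->pcnt = c->pcnt;
			tmp->bcnt = c->bcnt;
		} while (read_seqcount_retry(s, start));
	}

With that, get_counters() could sum all cpus the same way, with no special case
for the local cpu and no need to disable BH in the callers.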
Then, I wonder if all this is valid, since alloc_counters() might be supposed
to perform an atomic snapshot of all counters. This was possible because
of a write_lock_bh(), but does it make any sense to block all packets for
such a long time ??? If this is a strong requirement, I am afraid we have
to make other changes...
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
net/ipv4/netfilter/ip_tables.c | 25 +++----------------------
1 files changed, 3 insertions(+), 22 deletions(-)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 7c50e2e..d208b25 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -900,25 +900,8 @@ get_counters(const struct xt_table_info *t,
{
unsigned int cpu;
unsigned int i;
- unsigned int curcpu;
-
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU
- * We dont care about preemption here.
- */
- curcpu = raw_smp_processor_id();
-
- i = 0;
- IPT_ENTRY_ITERATE(t->entries[curcpu],
- t->size,
- set_entry_to_counter,
- counters,
- &i);
for_each_possible_cpu(cpu) {
- if (cpu == curcpu)
- continue;
i = 0;
IPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
@@ -939,15 +922,12 @@ static struct xt_counters * alloc_counters(struct xt_table *table)
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc_node(countersize, numa_node_id());
+ counters = __vmalloc(countersize, GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
- /* First, sum counters... */
- local_bh_disable();
get_counters(private, counters);
- local_bh_enable();
return counters;
}
@@ -1209,7 +1189,8 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
void *loc_cpu_old_entry;
ret = 0;
- counters = vmalloc(num_counters * sizeof(struct xt_counters));
+ counters = __vmalloc(num_counters * sizeof(struct xt_counters),
+ GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
if (!counters) {
ret = -ENOMEM;
goto out;
^ permalink raw reply related [flat|nested] 21+ messages in thread