* [RFT 0/4] Netfilter/iptables performance improvements
@ 2009-02-18 5:19 Stephen Hemminger
2009-02-18 5:19 ` [RFT 1/4] iptables: lock free counters Stephen Hemminger
` (4 more replies)
0 siblings, 5 replies; 83+ messages in thread
From: Stephen Hemminger @ 2009-02-18 5:19 UTC (permalink / raw)
To: David Miller, Patrick McHardy, Rick Jones, Eric Dumazet
Cc: netdev, netfilter-devel
This series brings together the three suggested performance improvements:
1) RCU for ip_tables entries
2) mod_timer_noact for conntrack timer
3) eliminate tcp_lock
I took the patches for 2 & 3 and made them build and basically work.
This patch set is against Patrick's netfilter next tree, since that is
where it should end up:
git.kernel.org:/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git
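
For orientation, the heart of patch 1 is replacing the per-packet read lock
with an RCU read-side section. Condensed from the ipt_do_table() hunk in the
patch that follows (a sketch of the before/after, not a standalone build):

	/* was: read_lock_bh(&table->lock); */
	rcu_read_lock();
	private    = rcu_dereference(table->private);
	table_base = rcu_dereference(private->entries[smp_processor_id()]);

	/* ... traverse the ruleset, updating this cpu's packet/byte counters ... */

	rcu_read_unlock();
	/* was: read_unlock_bh(&table->lock); */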
--
^ permalink raw reply [flat|nested] 83+ messages in thread* [RFT 1/4] iptables: lock free counters 2009-02-18 5:19 [RFT 0/4] Netfilter/iptables performance improvements Stephen Hemminger @ 2009-02-18 5:19 ` Stephen Hemminger 2009-02-18 10:02 ` Patrick McHardy 2009-02-19 19:47 ` [PATCH] " Stephen Hemminger 2009-02-18 5:19 ` [RFT 2/4] Add mod_timer_noact Stephen Hemminger ` (3 subsequent siblings) 4 siblings, 2 replies; 83+ messages in thread From: Stephen Hemminger @ 2009-02-18 5:19 UTC (permalink / raw) To: David Miller, Patrick McHardy, Rick Jones, Eric Dumazet Cc: netdev, netfilter-devel [-- Attachment #1: iptables-swizzle.patch --] [-- Type: text/plain, Size: 18434 bytes --] The reader/writer lock in ip_tables is acquired in the critical path of processing packets and is one of the reasons just loading iptables can cause a 20% performance loss. The rwlock serves two functions: 1) it prevents changes to table state (xt_replace) while table is in use. This is now handled by doing rcu on the xt_table. When table is replaced, the new table(s) are put in and the old one table(s) are freed after RCU period. 2) it provides synchronization when accesing the counter values. This is now handled by swapping in new table_info entries for each cpu then summing the old values, and putting the result back onto one cpu. On a busy system it may cause sampling to occur at different times on each cpu, but no packet/byte counts are lost in the process. Signed-off-by: Stephen Hemminger <shemminger@vyatta.com> --- include/linux/netfilter/x_tables.h | 6 + net/ipv4/netfilter/arp_tables.c | 114 ++++++++++++++++++++++++++--------- net/ipv4/netfilter/ip_tables.c | 120 ++++++++++++++++++++++++++----------- net/ipv6/netfilter/ip6_tables.c | 119 +++++++++++++++++++++++++----------- net/netfilter/x_tables.c | 26 ++++++-- 5 files changed, 283 insertions(+), 102 deletions(-) --- a/include/linux/netfilter/x_tables.h 2009-02-17 11:04:08.112056798 -0800 +++ b/include/linux/netfilter/x_tables.h 2009-02-17 11:04:09.639778762 -0800 @@ -353,7 +353,7 @@ struct xt_table unsigned int valid_hooks; /* Lock for the curtain */ - rwlock_t lock; + struct mutex lock; /* Man behind the curtain... 
*/ struct xt_table_info *private; @@ -385,7 +385,7 @@ struct xt_table_info /* ipt_entry tables: one per CPU */ /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */ - char *entries[1]; + void *entries[1]; }; #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \ @@ -432,6 +432,8 @@ extern void xt_proto_fini(struct net *ne extern struct xt_table_info *xt_alloc_table_info(unsigned int size); extern void xt_free_table_info(struct xt_table_info *info); +extern void xt_table_entry_swap_rcu(struct xt_table_info *old, + struct xt_table_info *new); #ifdef CONFIG_COMPAT #include <net/compat.h> --- a/net/ipv4/netfilter/ip_tables.c 2009-02-17 11:04:08.064030719 -0800 +++ b/net/ipv4/netfilter/ip_tables.c 2009-02-17 11:04:09.639778762 -0800 @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb, mtpar.family = tgpar.family = NFPROTO_IPV4; tgpar.hooknum = hook; - read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb, } } while (!hotdrop); - read_unlock_bh(&table->lock); + rcu_read_unlock(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -924,13 +926,68 @@ get_counters(const struct xt_table_info counters, &i); } + +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct ipt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + IPT_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + + +static inline int +zero_entry_counter(struct ipt_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } } static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -939,14 +996,30 @@ static struct xt_counters * alloc_counte counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; + + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; + + clone_counters(info, private); - /* First, sum counters... 
*/ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ + + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); + + xt_free_table_info(info); return counters; + + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int @@ -1312,27 +1385,6 @@ do_replace(struct net *net, void __user return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ -#if 0 - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", - *i, - (long unsigned int)e->counters.pcnt, - (long unsigned int)e->counters.bcnt, - (long unsigned int)addme[*i].pcnt, - (long unsigned int)addme[*i].bcnt); -#endif - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) @@ -1393,13 +1445,14 @@ do_add_counters(struct net *net, void __ goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; @@ -1408,8 +1461,9 @@ do_add_counters(struct net *net, void __ add_counter_to_entry, paddc, &i); + preempt_enable(); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); xt_table_unlock(t); module_put(t->me); free: --- a/net/netfilter/x_tables.c 2009-02-17 11:04:08.084036668 -0800 +++ b/net/netfilter/x_tables.c 2009-02-17 11:04:09.643754342 -0800 @@ -625,6 +625,20 @@ void xt_free_table_info(struct xt_table_ } EXPORT_SYMBOL(xt_free_table_info); +void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, + struct xt_table_info *newinfo) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + void *p = oldinfo->entries[cpu]; + rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); + newinfo->entries[cpu] = p; + } + +} +EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); + /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) @@ -671,21 +685,22 @@ xt_replace_table(struct xt_table *table, struct xt_table_info *oldinfo, *private; /* Do the substitution. */ - write_lock_bh(&table->lock); + mutex_lock(&table->lock); private = table->private; /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { duprintf("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - write_unlock_bh(&table->lock); + mutex_unlock(&table->lock); *error = -EAGAIN; return NULL; } oldinfo = private; - table->private = newinfo; + rcu_assign_pointer(table->private, newinfo); newinfo->initial_entries = oldinfo->initial_entries; - write_unlock_bh(&table->lock); + mutex_unlock(&table->lock); + synchronize_net(); return oldinfo; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -719,7 +734,8 @@ struct xt_table *xt_register_table(struc /* Simplifies replace_table code. 
*/ table->private = bootstrap; - rwlock_init(&table->lock); + mutex_init(&table->lock); + if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; --- a/net/ipv4/netfilter/arp_tables.c 2009-02-17 11:04:08.076036089 -0800 +++ b/net/ipv4/netfilter/arp_tables.c 2009-02-17 11:04:09.643754342 -0800 @@ -237,9 +237,10 @@ unsigned int arpt_do_table(struct sk_buf indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - read_lock_bh(&table->lock); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -311,7 +312,8 @@ unsigned int arpt_do_table(struct sk_buf e = (void *)e + e->next_offset; } } while (!hotdrop); - read_unlock_bh(&table->lock); + + rcu_read_unlock(); if (hotdrop) return NF_DROP; @@ -714,11 +716,65 @@ static void get_counters(const struct xt } } -static inline struct xt_counters *alloc_counters(struct xt_table *table) + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct arpt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + ARPT_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + +static inline int +zero_entry_counter(struct arpt_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } +} + +static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -728,14 +784,30 @@ static inline struct xt_counters *alloc_ counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; - /* First, sum counters... 
*/ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; + + clone_counters(info, private); + + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ + + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); + + xt_free_table_info(info); return counters; + + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int copy_entries_to_user(unsigned int total_size, @@ -1075,20 +1147,6 @@ static int do_replace(struct net *net, v return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. - */ -static inline int add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { @@ -1148,13 +1206,14 @@ static int do_add_counters(struct net *n goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[smp_processor_id()]; @@ -1164,7 +1223,8 @@ static int do_add_counters(struct net *n paddc, &i); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); + xt_table_unlock(t); module_put(t->me); free: --- a/net/ipv6/netfilter/ip6_tables.c 2009-02-17 11:04:08.096042470 -0800 +++ b/net/ipv6/netfilter/ip6_tables.c 2009-02-17 11:04:09.643754342 -0800 @@ -373,10 +373,12 @@ ip6t_do_table(struct sk_buff *skb, mtpar.family = tgpar.family = NFPROTO_IPV6; tgpar.hooknum = hook; - read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ @@ -474,7 +476,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif - read_unlock_bh(&table->lock); + rcu_read_unlock(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -955,11 +957,64 @@ get_counters(const struct xt_table_info } } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ip6t_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + IP6T_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + +static inline int +zero_entry_counter(struct ip6t_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } +} + static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -968,14 +1023,28 @@ static struct xt_counters *alloc_counter counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; - /* First, sum counters... */ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; + + clone_counters(info, private); - return counters; + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ + + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); + + xt_free_table_info(info); + + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int @@ -1342,28 +1411,6 @@ do_replace(struct net *net, void __user return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static inline int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ -#if 0 - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", - *i, - (long unsigned int)e->counters.pcnt, - (long unsigned int)e->counters.bcnt, - (long unsigned int)addme[*i].pcnt, - (long unsigned int)addme[*i].bcnt); -#endif - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) @@ -1424,13 +1471,14 @@ do_add_counters(struct net *net, void __ goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; @@ -1439,8 +1487,9 @@ do_add_counters(struct net *net, void __ add_counter_to_entry, paddc, &i); + preempt_enable(); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); xt_table_unlock(t); module_put(t->me); free: -- ^ permalink raw reply [flat|nested] 83+ messages in thread
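Condensed, the counter snapshot in the new alloc_counters() runs as follows
(a summary of the hunks above, with error unwinding omitted):

	info = xt_alloc_table_info(private->size);
	clone_counters(info, private);		/* same rules, counters zeroed */

	mutex_lock(&table->lock);
	xt_table_entry_swap_rcu(private, info);	/* swap per-cpu entry pointers */
	synchronize_net();			/* wait for packet-path readers */

	get_counters(info, counters);		/* sum the now-quiescent old set */
	put_counters(private, counters);	/* fold totals back onto one cpu */
	mutex_unlock(&table->lock);

	xt_free_table_info(info);

Readers that started before the swap finish on the old per-cpu copies, so no
entry is summed while it is still being incremented and no counts are lost.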
* Re: [RFT 1/4] iptables: lock free counters
  2009-02-18  5:19 ` [RFT 1/4] iptables: lock free counters Stephen Hemminger
@ 2009-02-18 10:02   ` Patrick McHardy
  2009-02-19 19:47   ` [PATCH] " Stephen Hemminger
  1 sibling, 0 replies; 83+ messages in thread
From: Patrick McHardy @ 2009-02-18 10:02 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel

Stephen Hemminger wrote:
> @@ -1148,13 +1206,14 @@ static int do_add_counters(struct net *n
> 		goto free;
> 	}
>
> -	write_lock_bh(&t->lock);
> +	mutex_lock(&t->lock);
> 	private = t->private;
> 	if (private->number != num_counters) {
> 		ret = -EINVAL;
> 		goto unlock_up_free;
> 	}
>
> +	preempt_disable();
> 	i = 0;
> 	/* Choose the copy that is on our node */
> 	loc_cpu_entry = private->entries[smp_processor_id()];
> @@ -1164,7 +1223,8 @@ static int do_add_counters(struct net *n
> 			  paddc,
> 			  &i);
>  unlock_up_free:
> -	write_unlock_bh(&t->lock);
> +	mutex_unlock(&t->lock);
> +
> 	xt_table_unlock(t);
> 	module_put(t->me);
>  free:

This part (arptables.c) seems to be missing a preempt_enable().

Other than this, the patch looks good to me; if you want, I can
already apply this one.

^ permalink raw reply	[flat|nested] 83+ messages in thread
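For reference, the balanced form in the ip_tables version of do_add_counters()
looks like this once the rwlock (which implicitly disabled softirqs) is gone;
arp_tables needs the same closing half around its smp_processor_id() access:

	preempt_disable();
	i = 0;
	/* Choose the copy that is on our node */
	loc_cpu_entry = private->entries[raw_smp_processor_id()];
	IPT_ENTRY_ITERATE(loc_cpu_entry, private->size,
			  add_counter_to_entry, paddc, &i);
	preempt_enable();	/* the half missing from arp_tables.c */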
* [PATCH] iptables: lock free counters 2009-02-18 5:19 ` [RFT 1/4] iptables: lock free counters Stephen Hemminger 2009-02-18 10:02 ` Patrick McHardy @ 2009-02-19 19:47 ` Stephen Hemminger 2009-02-19 23:46 ` Eric Dumazet 1 sibling, 1 reply; 83+ messages in thread From: Stephen Hemminger @ 2009-02-19 19:47 UTC (permalink / raw) To: Stephen Hemminger Cc: David Miller, Patrick McHardy, Rick Jones, Eric Dumazet, netdev, netfilter-devel The reader/writer lock in ip_tables is acquired in the critical path of processing packets and is one of the reasons just loading iptables can cause a 20% performance loss. The rwlock serves two functions: 1) it prevents changes to table state (xt_replace) while table is in use. This is now handled by doing rcu on the xt_table. When table is replaced, the new table(s) are put in and the old one table(s) are freed after RCU period. 2) it provides synchronization when accesing the counter values. This is now handled by swapping in new table_info entries for each cpu then summing the old values, and putting the result back onto one cpu. On a busy system it may cause sampling to occur at different times on each cpu, but no packet/byte counts are lost in the process. Signed-off-by: Stephen Hemminger <shemminger@vyatta.com> --- Added missing preempt_enable. Patch against nf-next-2.6 git tree. include/linux/netfilter/x_tables.h | 6 + net/ipv4/netfilter/arp_tables.c | 115 +++++++++++++++++++++++++++-------- net/ipv4/netfilter/ip_tables.c | 120 ++++++++++++++++++++++++++----------- net/ipv6/netfilter/ip6_tables.c | 119 +++++++++++++++++++++++++----------- net/netfilter/x_tables.c | 26 ++++++-- 5 files changed, 284 insertions(+), 102 deletions(-) --- a/include/linux/netfilter/x_tables.h 2009-02-19 11:42:43.060110657 -0800 +++ b/include/linux/netfilter/x_tables.h 2009-02-19 11:42:58.863663575 -0800 @@ -353,7 +353,7 @@ struct xt_table unsigned int valid_hooks; /* Lock for the curtain */ - rwlock_t lock; + struct mutex lock; /* Man behind the curtain... 
*/ struct xt_table_info *private; @@ -385,7 +385,7 @@ struct xt_table_info /* ipt_entry tables: one per CPU */ /* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */ - char *entries[1]; + void *entries[1]; }; #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \ @@ -432,6 +432,8 @@ extern void xt_proto_fini(struct net *ne extern struct xt_table_info *xt_alloc_table_info(unsigned int size); extern void xt_free_table_info(struct xt_table_info *info); +extern void xt_table_entry_swap_rcu(struct xt_table_info *old, + struct xt_table_info *new); #ifdef CONFIG_COMPAT #include <net/compat.h> --- a/net/ipv4/netfilter/ip_tables.c 2009-02-19 11:42:12.968410890 -0800 +++ b/net/ipv4/netfilter/ip_tables.c 2009-02-19 11:42:58.863663575 -0800 @@ -347,10 +347,12 @@ ipt_do_table(struct sk_buff *skb, mtpar.family = tgpar.family = NFPROTO_IPV4; tgpar.hooknum = hook; - read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ @@ -445,7 +447,7 @@ ipt_do_table(struct sk_buff *skb, } } while (!hotdrop); - read_unlock_bh(&table->lock); + rcu_read_unlock(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -924,13 +926,68 @@ get_counters(const struct xt_table_info counters, &i); } + +} + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct ipt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + IPT_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + + +static inline int +zero_entry_counter(struct ipt_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + IPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } } static struct xt_counters * alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -939,14 +996,30 @@ static struct xt_counters * alloc_counte counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; + + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; + + clone_counters(info, private); - /* First, sum counters... 
*/ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ + + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); + + xt_free_table_info(info); return counters; + + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int @@ -1312,27 +1385,6 @@ do_replace(struct net *net, void __user return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. */ -static int -add_counter_to_entry(struct ipt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ -#if 0 - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", - *i, - (long unsigned int)e->counters.pcnt, - (long unsigned int)e->counters.bcnt, - (long unsigned int)addme[*i].pcnt, - (long unsigned int)addme[*i].bcnt); -#endif - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) @@ -1393,13 +1445,14 @@ do_add_counters(struct net *net, void __ goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; @@ -1408,8 +1461,9 @@ do_add_counters(struct net *net, void __ add_counter_to_entry, paddc, &i); + preempt_enable(); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); xt_table_unlock(t); module_put(t->me); free: --- a/net/netfilter/x_tables.c 2009-02-19 11:42:12.988414682 -0800 +++ b/net/netfilter/x_tables.c 2009-02-19 11:42:58.863663575 -0800 @@ -625,6 +625,20 @@ void xt_free_table_info(struct xt_table_ } EXPORT_SYMBOL(xt_free_table_info); +void xt_table_entry_swap_rcu(struct xt_table_info *oldinfo, + struct xt_table_info *newinfo) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) { + void *p = oldinfo->entries[cpu]; + rcu_assign_pointer(oldinfo->entries[cpu], newinfo->entries[cpu]); + newinfo->entries[cpu] = p; + } + +} +EXPORT_SYMBOL_GPL(xt_table_entry_swap_rcu); + /* Find table by name, grabs mutex & ref. Returns ERR_PTR() on error. */ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, const char *name) @@ -671,21 +685,22 @@ xt_replace_table(struct xt_table *table, struct xt_table_info *oldinfo, *private; /* Do the substitution. */ - write_lock_bh(&table->lock); + mutex_lock(&table->lock); private = table->private; /* Check inside lock: is the old number correct? */ if (num_counters != private->number) { duprintf("num_counters != table->private->number (%u/%u)\n", num_counters, private->number); - write_unlock_bh(&table->lock); + mutex_unlock(&table->lock); *error = -EAGAIN; return NULL; } oldinfo = private; - table->private = newinfo; + rcu_assign_pointer(table->private, newinfo); newinfo->initial_entries = oldinfo->initial_entries; - write_unlock_bh(&table->lock); + mutex_unlock(&table->lock); + synchronize_net(); return oldinfo; } EXPORT_SYMBOL_GPL(xt_replace_table); @@ -719,7 +734,8 @@ struct xt_table *xt_register_table(struc /* Simplifies replace_table code. 
*/ table->private = bootstrap; - rwlock_init(&table->lock); + mutex_init(&table->lock); + if (!xt_replace_table(table, 0, newinfo, &ret)) goto unlock; --- a/net/ipv4/netfilter/arp_tables.c 2009-02-19 11:42:43.064477910 -0800 +++ b/net/ipv4/netfilter/arp_tables.c 2009-02-19 11:42:58.863663575 -0800 @@ -261,9 +261,10 @@ unsigned int arpt_do_table(struct sk_buf indev = in ? in->name : nulldevname; outdev = out ? out->name : nulldevname; - read_lock_bh(&table->lock); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); back = get_entry(table_base, private->underflow[hook]); @@ -335,7 +336,8 @@ unsigned int arpt_do_table(struct sk_buf e = (void *)e + e->next_offset; } } while (!hotdrop); - read_unlock_bh(&table->lock); + + rcu_read_unlock(); if (hotdrop) return NF_DROP; @@ -738,11 +740,65 @@ static void get_counters(const struct xt } } -static inline struct xt_counters *alloc_counters(struct xt_table *table) + +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. */ +static int +add_counter_to_entry(struct arpt_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + ARPT_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + +static inline int +zero_entry_counter(struct arpt_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + ARPT_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } +} + +static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -752,14 +808,30 @@ static inline struct xt_counters *alloc_ counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; - /* First, sum counters... 
*/ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; + + clone_counters(info, private); + + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ + + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); + + xt_free_table_info(info); return counters; + + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int copy_entries_to_user(unsigned int total_size, @@ -1099,20 +1171,6 @@ static int do_replace(struct net *net, v return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. - */ -static inline int add_counter_to_entry(struct arpt_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) { @@ -1172,13 +1230,14 @@ static int do_add_counters(struct net *n goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[smp_processor_id()]; @@ -1187,8 +1246,10 @@ static int do_add_counters(struct net *n add_counter_to_entry, paddc, &i); + preempt_enable(); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); + xt_table_unlock(t); module_put(t->me); free: --- a/net/ipv6/netfilter/ip6_tables.c 2009-02-19 11:42:54.219410544 -0800 +++ b/net/ipv6/netfilter/ip6_tables.c 2009-02-19 11:42:58.867668311 -0800 @@ -382,10 +382,12 @@ ip6t_do_table(struct sk_buff *skb, mtpar.family = tgpar.family = NFPROTO_IPV6; tgpar.hooknum = hook; - read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - private = table->private; - table_base = (void *)private->entries[smp_processor_id()]; + + rcu_read_lock(); + private = rcu_dereference(table->private); + table_base = rcu_dereference(private->entries[smp_processor_id()]); + e = get_entry(table_base, private->hook_entry[hook]); /* For return from builtin chain */ @@ -483,7 +485,7 @@ ip6t_do_table(struct sk_buff *skb, #ifdef CONFIG_NETFILTER_DEBUG ((struct ip6t_entry *)table_base)->comefrom = NETFILTER_LINK_POISON; #endif - read_unlock_bh(&table->lock); + rcu_read_unlock(); #ifdef DEBUG_ALLOW_ALL return NF_ACCEPT; @@ -964,11 +966,64 @@ get_counters(const struct xt_table_info } } +/* We're lazy, and add to the first CPU; overflow works its fey magic + * and everything is OK. 
*/ +static int +add_counter_to_entry(struct ip6t_entry *e, + const struct xt_counters addme[], + unsigned int *i) +{ + ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); + + (*i)++; + return 0; +} + +/* Take values from counters and add them back onto the current cpu */ +static void put_counters(struct xt_table_info *t, + const struct xt_counters counters[]) +{ + unsigned int i, cpu; + + local_bh_disable(); + cpu = smp_processor_id(); + i = 0; + IP6T_ENTRY_ITERATE(t->entries[cpu], + t->size, + add_counter_to_entry, + counters, + &i); + local_bh_enable(); +} + +static inline int +zero_entry_counter(struct ip6t_entry *e, void *arg) +{ + e->counters.bcnt = 0; + e->counters.pcnt = 0; + return 0; +} + +static void +clone_counters(struct xt_table_info *newinfo, const struct xt_table_info *info) +{ + unsigned int cpu; + const void *loc_cpu_entry = info->entries[raw_smp_processor_id()]; + + memcpy(newinfo, info, offsetof(struct xt_table_info, entries)); + for_each_possible_cpu(cpu) { + memcpy(newinfo->entries[cpu], loc_cpu_entry, info->size); + IP6T_ENTRY_ITERATE(newinfo->entries[cpu], newinfo->size, + zero_entry_counter, NULL); + } +} + static struct xt_counters *alloc_counters(struct xt_table *table) { unsigned int countersize; struct xt_counters *counters; - const struct xt_table_info *private = table->private; + struct xt_table_info *private = table->private; + struct xt_table_info *info; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -977,14 +1032,28 @@ static struct xt_counters *alloc_counter counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) - return ERR_PTR(-ENOMEM); + goto nomem; - /* First, sum counters... */ - write_lock_bh(&table->lock); - get_counters(private, counters); - write_unlock_bh(&table->lock); + info = xt_alloc_table_info(private->size); + if (!info) + goto free_counters; + + clone_counters(info, private); - return counters; + mutex_lock(&table->lock); + xt_table_entry_swap_rcu(private, info); + synchronize_net(); /* Wait until smoke has cleared */ + + get_counters(info, counters); + put_counters(private, counters); + mutex_unlock(&table->lock); + + xt_free_table_info(info); + + free_counters: + vfree(counters); + nomem: + return ERR_PTR(-ENOMEM); } static int @@ -1351,28 +1420,6 @@ do_replace(struct net *net, void __user return ret; } -/* We're lazy, and add to the first CPU; overflow works its fey magic - * and everything is OK. 
*/ -static inline int -add_counter_to_entry(struct ip6t_entry *e, - const struct xt_counters addme[], - unsigned int *i) -{ -#if 0 - duprintf("add_counter: Entry %u %lu/%lu + %lu/%lu\n", - *i, - (long unsigned int)e->counters.pcnt, - (long unsigned int)e->counters.bcnt, - (long unsigned int)addme[*i].pcnt, - (long unsigned int)addme[*i].bcnt); -#endif - - ADD_COUNTER(e->counters, addme[*i].bcnt, addme[*i].pcnt); - - (*i)++; - return 0; -} - static int do_add_counters(struct net *net, void __user *user, unsigned int len, int compat) @@ -1433,13 +1480,14 @@ do_add_counters(struct net *net, void __ goto free; } - write_lock_bh(&t->lock); + mutex_lock(&t->lock); private = t->private; if (private->number != num_counters) { ret = -EINVAL; goto unlock_up_free; } + preempt_disable(); i = 0; /* Choose the copy that is on our node */ loc_cpu_entry = private->entries[raw_smp_processor_id()]; @@ -1448,8 +1496,9 @@ do_add_counters(struct net *net, void __ add_counter_to_entry, paddc, &i); + preempt_enable(); unlock_up_free: - write_unlock_bh(&t->lock); + mutex_unlock(&t->lock); xt_table_unlock(t); module_put(t->me); free: ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-02-19 19:47   ` [PATCH] " Stephen Hemminger
@ 2009-02-19 23:46     ` Eric Dumazet
  2009-02-19 23:56       ` Rick Jones
                         ` (3 more replies)
  0 siblings, 4 replies; 83+ messages in thread
From: Eric Dumazet @ 2009-02-19 23:46 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, Patrick McHardy, Rick Jones, netdev, netfilter-devel

Stephen Hemminger wrote:
> The reader/writer lock in ip_tables is acquired in the critical path of
> processing packets and is one of the reasons just loading iptables can cause
> a 20% performance loss. The rwlock serves two functions:
>
> 1) it prevents changes to table state (xt_replace) while table is in use.
>    This is now handled by doing rcu on the xt_table. When table is
>    replaced, the new table(s) are put in and the old one table(s) are freed
>    after RCU period.
>
> 2) it provides synchronization when accesing the counter values.
>    This is now handled by swapping in new table_info entries for each cpu
>    then summing the old values, and putting the result back onto one
>    cpu. On a busy system it may cause sampling to occur at different
>    times on each cpu, but no packet/byte counts are lost in the process.
>
> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>

Acked-by: Eric Dumazet <dada1@cosmosbay.com>

Successfully tested on my dual quad core machine too, but iptables only
(no ipv6 here)

BTW, my new "tbench 8" result is 2450 MB/s (it was 2150 MB/s not so long ago)

Thanks Stephen, that's very cool stuff, yet another rwlock out of the kernel :)

> ---
> Added missing preempt_enable. Patch against nf-next-2.6 git tree.
>
>  include/linux/netfilter/x_tables.h |    6 +
>  net/ipv4/netfilter/arp_tables.c    |  115 +++++++++++++++++++++++++++--------
>  net/ipv4/netfilter/ip_tables.c     |  120 ++++++++++++++++++++++++++-----------
>  net/ipv6/netfilter/ip6_tables.c    |  119 +++++++++++++++++++++++++-----------
>  net/netfilter/x_tables.c           |   26 ++++++--
>  5 files changed, 284 insertions(+), 102 deletions(-)

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-02-19 23:46     ` Eric Dumazet
@ 2009-02-19 23:56       ` Rick Jones
  2009-02-20  1:03         ` Stephen Hemminger
  2009-02-20  9:37       ` Patrick McHardy
                         ` (2 subsequent siblings)
  3 siblings, 1 reply; 83+ messages in thread
From: Rick Jones @ 2009-02-19 23:56 UTC (permalink / raw)
  To: Eric Dumazet, Stephen Hemminger
  Cc: David Miller, Patrick McHardy, netdev, netfilter-devel

Eric Dumazet wrote:
> Stephen Hemminger wrote:
>
>> The reader/writer lock in ip_tables is acquired in the critical path of
>> processing packets and is one of the reasons just loading iptables can cause
>> a 20% performance loss. The rwlock serves two functions:
>>
>> 1) it prevents changes to table state (xt_replace) while table is in use.
>>    This is now handled by doing rcu on the xt_table. When table is
>>    replaced, the new table(s) are put in and the old one table(s) are freed
>>    after RCU period.
>>
>> 2) it provides synchronization when accesing the counter values.
>>    This is now handled by swapping in new table_info entries for each cpu
>>    then summing the old values, and putting the result back onto one
>>    cpu. On a busy system it may cause sampling to occur at different
>>    times on each cpu, but no packet/byte counts are lost in the process.
>>
>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> Acked-by: Eric Dumazet <dada1@cosmosbay.com>
>
> Successfully tested on my dual quad core machine too, but iptables only (no
> ipv6 here)
>
> BTW, my new "tbench 8" result is 2450 MB/s (it was 2150 MB/s not so long ago)
>
> Thanks Stephen, that's very cool stuff, yet another rwlock out of the kernel :)

Do you folks need/want further testing against the 32-core setup?

rick jones

>> ---
>> Added missing preempt_enable. Patch against nf-next-2.6 git tree.
>>
>>  include/linux/netfilter/x_tables.h |    6 +
>>  net/ipv4/netfilter/arp_tables.c    |  115 +++++++++++++++++++++++++++--------
>>  net/ipv4/netfilter/ip_tables.c     |  120 ++++++++++++++++++++++++++-----------
>>  net/ipv6/netfilter/ip6_tables.c    |  119 +++++++++++++++++++++++++-----------
>>  net/netfilter/x_tables.c           |   26 ++++++--
>>  5 files changed, 284 insertions(+), 102 deletions(-)

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters 2009-02-19 23:56 ` Rick Jones @ 2009-02-20 1:03 ` Stephen Hemminger 2009-02-20 1:18 ` Rick Jones 0 siblings, 1 reply; 83+ messages in thread From: Stephen Hemminger @ 2009-02-20 1:03 UTC (permalink / raw) To: Rick Jones Cc: Eric Dumazet, David Miller, Patrick McHardy, netdev, netfilter-devel On Thu, 19 Feb 2009 15:56:18 -0800 Rick Jones <rick.jones2@hp.com> wrote: > Eric Dumazet wrote: > > Stephen Hemminger a écrit : > > > >>The reader/writer lock in ip_tables is acquired in the critical path of > >>processing packets and is one of the reasons just loading iptables can cause > >>a 20% performance loss. The rwlock serves two functions: > >> > >>1) it prevents changes to table state (xt_replace) while table is in use. > >> This is now handled by doing rcu on the xt_table. When table is > >> replaced, the new table(s) are put in and the old one table(s) are freed > >> after RCU period. > >> > >>2) it provides synchronization when accesing the counter values. > >> This is now handled by swapping in new table_info entries for each cpu > >> then summing the old values, and putting the result back onto one > >> cpu. On a busy system it may cause sampling to occur at different > >> times on each cpu, but no packet/byte counts are lost in the process. > >> > >>Signed-off-by: Stephen Hemminger <shemminger@vyatta.com> > > > > > > > > Acked-by: Eric Dumazet <dada1@cosmosbay.com> > > > > Sucessfully tested on my dual quad core machine too, but iptables only (no > > ipv6 here) > > > > BTW, my new "tbench 8" result is 2450 MB/s, (it was 2150 MB/s not so long ago) > > > > Thanks Stephen, thats very cool stuff, yet another rwlock out of kernel :) > > Do you folks need/want further testing against the 32-core setup? It would be good to combine all 3 (iptables-rcu, timer change, and conntrack lock) to see what the overhead change is. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters 2009-02-20 1:03 ` Stephen Hemminger @ 2009-02-20 1:18 ` Rick Jones 2009-02-20 9:42 ` Patrick McHardy 0 siblings, 1 reply; 83+ messages in thread From: Rick Jones @ 2009-02-20 1:18 UTC (permalink / raw) To: Stephen Hemminger Cc: Eric Dumazet, David Miller, Patrick McHardy, netdev, netfilter-devel >>>Thanks Stephen, thats very cool stuff, yet another rwlock out of kernel :) >> >>Do you folks need/want further testing against the 32-core setup? > > > It would be good to combine all 3 (iptables-rcu, timer change, and conntrack lock) > to see what the overhead change is. Fair enough. Is there a tree somewhere I can pull with all those in it, or do I need to go back through the emails and apply patches? rick jones ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters 2009-02-20 1:18 ` Rick Jones @ 2009-02-20 9:42 ` Patrick McHardy 2009-02-20 22:57 ` Rick Jones 0 siblings, 1 reply; 83+ messages in thread From: Patrick McHardy @ 2009-02-20 9:42 UTC (permalink / raw) To: Rick Jones Cc: Stephen Hemminger, Eric Dumazet, David Miller, netdev, netfilter-devel Rick Jones wrote: >>>> Thanks Stephen, thats very cool stuff, yet another rwlock out of >>>> kernel :) >>> >>> Do you folks need/want further testing against the 32-core setup? >> >> >> It would be good to combine all 3 (iptables-rcu, timer change, and >> conntrack lock) >> to see what the overhead change is. > > Fair enough. Is there a tree somewhere I can pull with all those in it, > or do I need to go back through the emails and apply patches? You can use my nf-next.git tree from: git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git It contains the lock free counters, as well as smaller optimizations from Eric. The last timer patch I've seen missed the actual conversion to use mod_timer_pending(), but it would be great to have some numbers on the conntrack lock changes. Thanks Rick! ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-02-20  9:42             ` Patrick McHardy
@ 2009-02-20 22:57               ` Rick Jones
  2009-02-21  0:35                 ` Rick Jones
  0 siblings, 1 reply; 83+ messages in thread
From: Rick Jones @ 2009-02-20 22:57 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Stephen Hemminger, Eric Dumazet, David Miller, netdev, netfilter-devel

>> Fair enough.  Is there a tree somewhere I can pull with all those in
>> it, or do I need to go back through the emails and apply patches?
>
> You can use my nf-next.git tree from:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git
>
> It contains the lock free counters, as well as smaller optimizations
> from Eric.

So, by the time this hits inboxes, under:

ftp://ftp.netperf.org/nf-next-2.6-results

should be a directory called "baseline" which holds the results from just a
clone of your tree from earlier today. There you will find the config file,
the log of the build and then three subdirectories:

none  - results without doing iptables --list
empty - results after doing iptables --list
full  - results after doing an iptables-restore of a config from the
        "iptables" file also up there

In each will be the netperf results in csv format, and four different
caliper (using the perfmon interface) profiles:

"cycles" uses a profile which is able to take samples with interrupts
disabled

"fprof" is a plain flat profile that does not see things happening with
interrupts disabled - comparing an fprof to cycles is sometimes interesting

"dcache" tries to take cache miss profiles. iirc that uses the data ear in
the Itanium PMU to do its thing - I cannot recall the effect of interrupt
disabling there

"scgprof" is a sampled call graph profile - likely as not with interrupt
limitations similar to those of an fprof profile.

> The last timer patch I've seen missed the actual conversion
> to use mod_timer_pending(), but it would be great to have some numbers
> on the conntrack lock changes. Thanks Rick!

I will go back through my email now and try to find the conntrack lock
changes and apply them to the tree and turn the crank.

happy benchmarking

rick jones

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters 2009-02-20 22:57 ` Rick Jones @ 2009-02-21 0:35 ` Rick Jones 0 siblings, 0 replies; 83+ messages in thread From: Rick Jones @ 2009-02-21 0:35 UTC (permalink / raw) To: Patrick McHardy Cc: Stephen Hemminger, Eric Dumazet, David Miller, netdev, netfilter-devel Rick Jones wrote: > So, by the time this hits inboxes, under: > > ftp://ftp.netperf.org/nf-next-2.6-results > ... > I will go back through my email now and try to find the conntrack lock > changes and apply them to the tree and turn the crank. Under the base URL above there is now a "conntrack" subdir with the usual "none," "empty," and "full" subdirs. This is with the patch from message ID <20090219140303.4329f860@extreme> titled "Re: [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking" which my mail client says has a date of 02/19/09 14:03. On the plus side, only one of the 64 concurrent netperfs died during the "full" test compared with more than 10 without the patch. Also, there were no soft lockups reported as there were without the patch. The rwlock time is gone, naturally, replaced with boatloads of spinlock contention. Hopefully the scgprof profile will help show the source. Perhaps there is yet another patch I should have applied :) happy benchmarking, rick jones ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-02-19 23:46     ` Eric Dumazet
  2009-02-19 23:56       ` Rick Jones
@ 2009-02-20  9:37       ` Patrick McHardy
  2009-02-20 18:10         ` [PATCH] iptables: xt_hashlimit fix Eric Dumazet
  2009-02-27 14:02         ` [PATCH] iptables: lock free counters Eric Dumazet
  3 siblings, 0 replies; 83+ messages in thread
From: Patrick McHardy @ 2009-02-20 9:37 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, David Miller, Rick Jones, netdev, netfilter-devel

Eric Dumazet wrote:
> Stephen Hemminger wrote:
>> The reader/writer lock in ip_tables is acquired in the critical path of
>> processing packets and is one of the reasons just loading iptables can cause
>> a 20% performance loss. The rwlock serves two functions:
>>
>> 1) it prevents changes to table state (xt_replace) while table is in use.
>>    This is now handled by doing rcu on the xt_table. When table is
>>    replaced, the new table(s) are put in and the old one table(s) are freed
>>    after RCU period.
>>
>> 2) it provides synchronization when accesing the counter values.
>>    This is now handled by swapping in new table_info entries for each cpu
>>    then summing the old values, and putting the result back onto one
>>    cpu. On a busy system it may cause sampling to occur at different
>>    times on each cpu, but no packet/byte counts are lost in the process.
>>
>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> Acked-by: Eric Dumazet <dada1@cosmosbay.com>
>
> Successfully tested on my dual quad core machine too, but iptables only (no ipv6 here)
>
> BTW, my new "tbench 8" result is 2450 MB/s (it was 2150 MB/s not so long ago)
>
> Thanks Stephen, that's very cool stuff, yet another rwlock out of the kernel :)

Applied, thanks everyone. I've also added Eric's tbench results
to the changelog.

^ permalink raw reply	[flat|nested] 83+ messages in thread
* [PATCH] iptables: xt_hashlimit fix
  2009-02-19 23:46     ` Eric Dumazet
  2009-02-19 23:56       ` Rick Jones
  2009-02-20  9:37       ` Patrick McHardy
@ 2009-02-20 18:10       ` Eric Dumazet
  2009-02-20 18:33         ` Jan Engelhardt
  2009-02-24 14:31         ` Patrick McHardy
  2009-02-27 14:02       ` [PATCH] iptables: lock free counters Eric Dumazet
  3 siblings, 2 replies; 83+ messages in thread
From: Eric Dumazet @ 2009-02-20 18:10 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Stephen Hemminger, David Miller, Rick Jones, netdev, netfilter-devel

Eric Dumazet wrote:
> Stephen Hemminger wrote:
>> The reader/writer lock in ip_tables is acquired in the critical path of
>> processing packets and is one of the reasons just loading iptables can cause
>> a 20% performance loss. The rwlock serves two functions:
>>
>> 1) it prevents changes to table state (xt_replace) while table is in use.
>>    This is now handled by doing rcu on the xt_table. When table is
>>    replaced, the new table(s) are put in and the old one table(s) are freed
>>    after RCU period.
>>
>> 2) it provides synchronization when accesing the counter values.
>>    This is now handled by swapping in new table_info entries for each cpu
>>    then summing the old values, and putting the result back onto one
>>    cpu. On a busy system it may cause sampling to occur at different
>>    times on each cpu, but no packet/byte counts are lost in the process.
>>
>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> Acked-by: Eric Dumazet <dada1@cosmosbay.com>
>
> Successfully tested on my dual quad core machine too, but iptables only (no ipv6 here)
>
> BTW, my new "tbench 8" result is 2450 MB/s (it was 2150 MB/s not so long ago)
>
> Thanks Stephen, that's very cool stuff, yet another rwlock out of the kernel :)

Damn, this broke xt_hashlimit, version=0.

Look at net/netfilter/xt_hashlimit.c, line 706:

	/* Ugly hack: For SMP, we only want to use one set */
	r->u.master = r;

and include/linux/netfilter/xt_hashlimit.h:

struct xt_hashlimit_info {
	char name [IFNAMSIZ];		/* name */
	struct hashlimit_cfg cfg;

	/* Used internally by the kernel */
	struct xt_hashlimit_htable *hinfo;
	union {
		void *ptr;
		struct xt_hashlimit_info *master;
	} u;
};

So it appears some modules are using pointers to themselves, what a hack :(
We probably need an audit of other modules.
(net/netfilter/xt_statistic.c, net/netfilter/xt_quota.c,
net/netfilter/xt_limit.c ...)

Unfortunately I won't have time to do this in the following days, any volunteer?

Thank you

[PATCH] netfilter: xt_hashlimit fix

Commit 784544739a25c30637397ace5489eeb6e15d7d49
(netfilter: iptables: lock free counters) broke the xt_hashlimit
netfilter module:

This module was storing a pointer inside its xt_hashlimit_info, and
this pointer is not relocated when we temporarily switch tables
(iptables -L). This hack is not needed at all (probably a leftover
from ancient times), as each cpu should and can access its own copy.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> --- diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index 2482055..a5b5369 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -565,8 +565,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, static bool hashlimit_mt_v0(const struct sk_buff *skb, const struct xt_match_param *par) { - const struct xt_hashlimit_info *r = - ((const struct xt_hashlimit_info *)par->matchinfo)->u.master; + const struct xt_hashlimit_info *r = par->matchinfo; struct xt_hashlimit_htable *hinfo = r->hinfo; unsigned long now = jiffies; struct dsthash_ent *dh; @@ -702,8 +701,6 @@ static bool hashlimit_mt_check_v0(const struct xt_mtchk_param *par) } mutex_unlock(&hlimit_mutex); - /* Ugly hack: For SMP, we only want to use one set */ - r->u.master = r; return true; } ^ permalink raw reply related [flat|nested] 83+ messages in thread
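To make the failure mode concrete: u.master is set at checkentry time to an
address inside one cpu's rule blob, and alloc_counters() now clones the
blobs, swaps them in and frees the old set, leaving the clones pointing at
freed memory (an illustration pieced together from the code above, not a
new API):

	/* checkentry, before the fix: address inside this cpu's blob */
	r->u.master = r;

	/* iptables -L: blobs are cloned, swapped via RCU, old set freed */

	/* packet path, now running on a cloned blob: */
	r = ((const struct xt_hashlimit_info *)par->matchinfo)->u.master;
	/* -> still points into the freed blob: use-after-free */

Dropping the indirection is safe because after the lock free counters
change every per-cpu copy carries the same configuration and hinfo pointer.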
* Re: [PATCH] iptables: xt_hashlimit fix
  2009-02-20 18:10         ` [PATCH] iptables: xt_hashlimit fix Eric Dumazet
@ 2009-02-20 18:33           ` Jan Engelhardt
  2009-02-28  1:54             ` Jan Engelhardt
  2009-02-24 14:31           ` Patrick McHardy
  1 sibling, 1 reply; 83+ messages in thread
From: Jan Engelhardt @ 2009-02-20 18:33 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, Stephen Hemminger, David Miller, Rick Jones,
	netdev, netfilter-devel

On Friday 2009-02-20 19:10, Eric Dumazet wrote:
>
> Damn, this broke xt_hashlimit, version=0.
> Look at net/netfilter/xt_hashlimit.c, line 706:
>
>	/* Ugly hack: For SMP, we only want to use one set */
>	r->u.master = r;
>
> So it appears some modules are using pointers to themselves, what a hack :(
> We probably need an audit of other modules.

xt_limit and xt_statistic are affected; I'll happily fix that up.

> Commit 784544739a25c30637397ace5489eeb6e15d7d49
> (netfilter: iptables: lock free counters) broke the xt_hashlimit
> netfilter module:
> This module was storing a pointer inside its xt_hashlimit_info, and
> this pointer is not relocated when we temporarily switch tables
> (iptables -L).

Patch ok.

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: xt_hashlimit fix
  2009-02-20 18:33               ` Jan Engelhardt
@ 2009-02-28  1:54                 ` Jan Engelhardt
  2009-02-28  6:56                   ` Eric Dumazet
  0 siblings, 1 reply; 83+ messages in thread
From: Jan Engelhardt @ 2009-02-28 1:54 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, Stephen Hemminger, David Miller, Rick Jones,
	netdev, netfilter-devel

On Friday 2009-02-20 19:33, Jan Engelhardt wrote:
> On Friday 2009-02-20 19:10, Eric Dumazet wrote:
>>
>> Damn, this broke xt_hashlimit, version=0.
>> Look at file "net/netfilter/xt_hashlimit.c" line 706:
>>
>>	/* Ugly hack: For SMP, we only want to use one set */
>>	r->u.master = r;
>>
>> So, it appears some modules are using pointers to themselves, what a hack :(
>> We probably need an audit of other modules.
>
> xt_limit and xt_statistic are affected; I'll happily fix that up.

Please have a look!

---8<---
parent b2bd9ab764d65237232c4aad5ab8d5d8b5714f72 (v2.6.29-rc6-31-gb2bd9ab)
commit 3e7ee1dcc808b8eec82eddfa0436f78e31d2004a
Author: Jan Engelhardt <jengelh@medozas.de>
Date:   Sat Feb 28 02:49:28 2009 +0100

    netfilter: xtables: avoid pointer to self

Signed-off-by: Jan Engelhardt <jengelh@medozas.de>
---
 include/linux/netfilter/xt_limit.h     |    9 +++--
 include/linux/netfilter/xt_statistic.h |    7 ++--
 net/netfilter/xt_limit.c               |   40 +++++++++++++++++------
 net/netfilter/xt_statistic.c           |   28 +++++++++++++---
 4 files changed, 61 insertions(+), 23 deletions(-)

diff --git a/include/linux/netfilter/xt_limit.h b/include/linux/netfilter/xt_limit.h
index b3ce653..fda222c 100644
--- a/include/linux/netfilter/xt_limit.h
+++ b/include/linux/netfilter/xt_limit.h
@@ -4,6 +4,8 @@
 /* timings are in milliseconds. */
 #define XT_LIMIT_SCALE 10000
 
+struct xt_limit_priv;
+
 /* 1/10,000 sec period => max of 10,000/sec.  Min rate is then 429490
    seconds, or one every 59 hours. */
 struct xt_rateinfo {
@@ -11,11 +13,10 @@ struct xt_rateinfo {
 	u_int32_t burst;	/* Period multiplier for upper limit. */
 
 	/* Used internally by the kernel */
-	unsigned long prev;
-	u_int32_t credit;
+	unsigned long prev;	/* moved to xt_limit_priv */
+	u_int32_t credit;	/* moved to xt_limit_priv */
 	u_int32_t credit_cap, cost;
 
-	/* Ugly, ugly fucker. */
-	struct xt_rateinfo *master;
+	struct xt_limit_priv *master;
 };
 #endif /*_XT_RATE_H*/
diff --git a/include/linux/netfilter/xt_statistic.h b/include/linux/netfilter/xt_statistic.h
index 3d38bc9..8f521ab 100644
--- a/include/linux/netfilter/xt_statistic.h
+++ b/include/linux/netfilter/xt_statistic.h
@@ -13,6 +13,8 @@ enum xt_statistic_flags {
 };
 #define XT_STATISTIC_MASK 0x1
 
+struct xt_statistic_priv;
+
 struct xt_statistic_info {
 	u_int16_t mode;
 	u_int16_t flags;
@@ -23,11 +25,10 @@ struct xt_statistic_info {
 		struct {
 			u_int32_t every;
 			u_int32_t packet;
-			/* Used internally by the kernel */
-			u_int32_t count;
+			u_int32_t count; /* unused */
 		} nth;
 	} u;
-	struct xt_statistic_info *master __attribute__((aligned(8)));
+	struct xt_statistic_priv *master __attribute__((aligned(8)));
 };
 
 #endif /* _XT_STATISTIC_H */
diff --git a/net/netfilter/xt_limit.c b/net/netfilter/xt_limit.c
index c908d69..2e8089e 100644
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -14,6 +14,11 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_limit.h>
 
+struct xt_limit_priv {
+	unsigned long prev;
+	uint32_t credit;
+};
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Herve Eychenne <rv@wallfire.org>");
 MODULE_DESCRIPTION("Xtables: rate-limit match");
@@ -60,18 +65,18 @@ static DEFINE_SPINLOCK(limit_lock);
 static bool
 limit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	struct xt_rateinfo *r =
-		((const struct xt_rateinfo *)par->matchinfo)->master;
+	const struct xt_rateinfo *r = par->matchinfo;
+	struct xt_limit_priv *priv = r->master;
 	unsigned long now = jiffies;
 
 	spin_lock_bh(&limit_lock);
-	r->credit += (now - xchg(&r->prev, now)) * CREDITS_PER_JIFFY;
-	if (r->credit > r->credit_cap)
-		r->credit = r->credit_cap;
+	priv->credit += (now - xchg(&priv->prev, now)) * CREDITS_PER_JIFFY;
+	if (priv->credit > r->credit_cap)
+		priv->credit = r->credit_cap;
 
-	if (r->credit >= r->cost) {
+	if (priv->credit >= r->cost) {
 		/* We're not limited. */
-		r->credit -= r->cost;
+		priv->credit -= r->cost;
 		spin_unlock_bh(&limit_lock);
 		return true;
 	}
@@ -95,6 +100,7 @@ user2credits(u_int32_t user)
 static bool limit_mt_check(const struct xt_mtchk_param *par)
 {
 	struct xt_rateinfo *r = par->matchinfo;
+	struct xt_limit_priv *priv;
 
 	/* Check for overflow. */
 	if (r->burst == 0
@@ -104,19 +110,30 @@ static bool limit_mt_check(const struct xt_mtchk_param *par)
 		return false;
 	}
 
-	/* For SMP, we only want to use one set of counters. */
-	r->master = r;
+	priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+	if (priv == NULL)
+		return false;
+
+	/* For SMP, we only want to use one set of state. */
+	r->master = priv;
 	if (r->cost == 0) {
 		/* User avg in seconds * XT_LIMIT_SCALE: convert to jiffies *
		   128. */
-		r->prev = jiffies;
-		r->credit = user2credits(r->avg * r->burst); /* Credits full. */
+		priv->prev = jiffies;
+		priv->credit = user2credits(r->avg * r->burst); /* Credits full. */
 		r->credit_cap = user2credits(r->avg * r->burst); /* Credits full. */
 		r->cost = user2credits(r->avg);
 	}
 	return true;
 }
 
+static void limit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	const struct xt_rateinfo *info = par->matchinfo;
+
+	kfree(info->master);
+}
+
 #ifdef CONFIG_COMPAT
 struct compat_xt_rateinfo {
 	u_int32_t avg;
@@ -167,6 +184,7 @@ static struct xt_match limit_mt_reg __read_mostly = {
 	.family     = NFPROTO_UNSPEC,
 	.match      = limit_mt,
 	.checkentry = limit_mt_check,
+	.destroy    = limit_mt_destroy,
 	.matchsize  = sizeof(struct xt_rateinfo),
 #ifdef CONFIG_COMPAT
 	.compatsize = sizeof(struct compat_xt_rateinfo),
diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c
index 0d75141..d8c0f8f 100644
--- a/net/netfilter/xt_statistic.c
+++ b/net/netfilter/xt_statistic.c
@@ -16,6 +16,10 @@
 #include <linux/netfilter/xt_statistic.h>
 #include <linux/netfilter/x_tables.h>
 
+struct xt_statistic_priv {
+	uint32_t count;
+};
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Patrick McHardy <kaber@trash.net>");
 MODULE_DESCRIPTION("Xtables: statistics-based matching (\"Nth\", random)");
@@ -27,7 +31,7 @@ static DEFINE_SPINLOCK(nth_lock);
 static bool
 statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 {
-	struct xt_statistic_info *info = (void *)par->matchinfo;
+	const struct xt_statistic_info *info = par->matchinfo;
 	bool ret = info->flags & XT_STATISTIC_INVERT;
 
 	switch (info->mode) {
@@ -36,10 +40,9 @@ statistic_mt(const struct sk_buff *skb, const struct xt_match_param *par)
 			ret = !ret;
 		break;
 	case XT_STATISTIC_MODE_NTH:
-		info = info->master;
 		spin_lock_bh(&nth_lock);
-		if (info->u.nth.count++ == info->u.nth.every) {
-			info->u.nth.count = 0;
+		if (info->master->count++ == info->u.nth.every) {
+			info->master->count = 0;
 			ret = !ret;
 		}
 		spin_unlock_bh(&nth_lock);
@@ -56,16 +59,31 @@ static bool statistic_mt_check(const struct xt_mtchk_param *par)
 	if (info->mode > XT_STATISTIC_MODE_MAX ||
 	    info->flags & ~XT_STATISTIC_MASK)
 		return false;
-	info->master = info;
+
+	info->master = kzalloc(sizeof(*info->master), GFP_KERNEL);
+	if (info->master == NULL) {
+		printk(KERN_ERR KBUILD_MODNAME ": Out of memory\n");
+		return false;
+	}
+	info->master->count = info->u.nth.count;
+
	return true;
 }
 
+static void statistic_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	const struct xt_statistic_info *info = par->matchinfo;
+
+	kfree(info->master);
+}
+
 static struct xt_match xt_statistic_mt_reg __read_mostly = {
 	.name       = "statistic",
 	.revision   = 0,
 	.family     = NFPROTO_UNSPEC,
 	.match      = statistic_mt,
 	.checkentry = statistic_mt_check,
+	.destroy    = statistic_mt_destroy,
 	.matchsize  = sizeof(struct xt_statistic_info),
 	.me         = THIS_MODULE,
 };
--
# Created with git-export-patch

^ permalink raw reply related	[flat|nested] 83+ messages in thread
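Both conversions above follow the same pattern: mutable run-time state moves out of the matchinfo blob (which table replacement copies per cpu) into a single heap object that every copy reaches through the same pointer value, allocated in ->checkentry and freed in ->destroy. A stripped-down userspace sketch of that lifecycle (all names here are hypothetical, for illustration only):

#include <stdbool.h>
#include <stdlib.h>
#include <string.h>

struct my_priv {		/* shared mutable state: one object per rule */
	unsigned long count;
};

struct my_info {		/* match data: memcpy()ed once per cpu */
	int cfg;
	struct my_priv *master;	/* all copies carry the same pointer value */
};

/* ->checkentry equivalent: allocate the one shared state object */
static bool my_check(struct my_info *info)
{
	info->master = calloc(1, sizeof(*info->master));
	return info->master != NULL;
}

/* ->destroy equivalent: free it exactly once, when the rule goes away */
static void my_destroy(struct my_info *info)
{
	free(info->master);
}

int main(void)
{
	struct my_info rule, percpu_copy;

	if (!my_check(&rule))
		return 1;
	memcpy(&percpu_copy, &rule, sizeof(rule));

	/* unlike a self-pointer, the copied pointer still reaches the
	 * shared object, so both copies see the same state: */
	percpu_copy.master->count++;
	my_destroy(&rule);
	return 0;
}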
* Re: [PATCH] iptables: xt_hashlimit fix
  2009-02-28  1:54                 ` Jan Engelhardt
@ 2009-02-28  6:56                   ` Eric Dumazet
  2009-02-28  8:22                     ` Jan Engelhardt
  0 siblings, 1 reply; 83+ messages in thread
From: Eric Dumazet @ 2009-02-28 6:56 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Patrick McHardy, Stephen Hemminger, David Miller, Rick Jones,
	netdev, netfilter-devel

Jan Engelhardt a écrit :
> On Friday 2009-02-20 19:33, Jan Engelhardt wrote:
>> On Friday 2009-02-20 19:10, Eric Dumazet wrote:
>>> Damn, this broke xt_hashlimit, version=0.
>>> [...]
>>> So, it appears some modules are using pointers to themselves, what a hack :(
>>> We probably need an audit of other modules.
>> xt_limit and xt_statistic are affected; I'll happily fix that up.
>
> Please have a look!
>
> ---8<---
> parent b2bd9ab764d65237232c4aad5ab8d5d8b5714f72 (v2.6.29-rc6-31-gb2bd9ab)
> commit 3e7ee1dcc808b8eec82eddfa0436f78e31d2004a
> Author: Jan Engelhardt <jengelh@medozas.de>
> Date:   Sat Feb 28 02:49:28 2009 +0100
>
>     netfilter: xtables: avoid pointer to self
>
> Signed-off-by: Jan Engelhardt <jengelh@medozas.de>

Seems good to me! Thanks Jan!

Reviewed-by: Eric Dumazet <dada1@cosmosbay.com>

Are you sure xt_quota doesn't need the same tweak, or should we not care
about changes done to quota state while temporary tables are installed
(iptables -L)?

> [Jan's patch quoted in full above; trimmed]
^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: xt_hashlimit fix
  2009-02-28  6:56                   ` Eric Dumazet
@ 2009-02-28  8:22                     ` Jan Engelhardt
  0 siblings, 0 replies; 83+ messages in thread
From: Jan Engelhardt @ 2009-02-28 8:22 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, Stephen Hemminger, David Miller, Rick Jones,
	netdev, netfilter-devel

On Saturday 2009-02-28 07:56, Eric Dumazet wrote:
>> commit 3e7ee1dcc808b8eec82eddfa0436f78e31d2004a
>> Author: Jan Engelhardt <jengelh@medozas.de>
>> Date:   Sat Feb 28 02:49:28 2009 +0100
>>
>>     netfilter: xtables: avoid pointer to self
>
> Seems good to me! Thanks Jan!
>
> Reviewed-by: Eric Dumazet <dada1@cosmosbay.com>
>
> Are you sure xt_quota doesn't need the same tweak, or should we not care
> about changes done to quota state while temporary tables are installed
> (iptables -L)?

Oh right, xt_quota... thanks for noticing! Patch will follow.

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: xt_hashlimit fix
  2009-02-20 18:10             ` [PATCH] iptables: xt_hashlimit fix Eric Dumazet
  2009-02-20 18:33               ` Jan Engelhardt
@ 2009-02-24 14:31               ` Patrick McHardy
  1 sibling, 0 replies; 83+ messages in thread
From: Patrick McHardy @ 2009-02-24 14:31 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, David Miller, Rick Jones, netdev, netfilter-devel

Eric Dumazet wrote:
> Damn, this broke xt_hashlimit, version=0.
>
> ...
> So, it appears some modules are using pointers to themselves, what a hack :(

Indeed. This is unfortunately necessary in some cases to make sure
that modules using global state actually use the global state instead
of the per-CPU copies.

> We probably need an audit of other modules.
>
> (net/netfilter/xt_statistic.c, net/netfilter/xt_quota.c,
> net/netfilter/xt_limit.c ...)

This is fine in the case of hashlimit, since the match data is
read-only. In the case of statistic and quota, I think we still need
the shared state.

> Unfortunately I won't have time to do this in the following days, any volunteer?
>
> Thank you
>
> [PATCH] netfilter: xt_hashlimit fix
>
> Commit 784544739a25c30637397ace5489eeb6e15d7d49
> (netfilter: iptables: lock free counters) broke the xt_hashlimit netfilter module:
>
> This module was storing a pointer inside its xt_hashlimit_info, and this pointer
> is not relocated when we temporarily switch tables (iptables -L).
>
> This hack is not needed at all (probably a leftover from
> ancient times), as each cpu can and should access its own copy.

Applied, thanks.

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-02-19 23:46           ` Eric Dumazet
                               ` (2 preceding siblings ...)
  2009-02-20 18:10             ` [PATCH] iptables: xt_hashlimit fix Eric Dumazet
@ 2009-02-27 14:02             ` Eric Dumazet
  2009-02-27 16:08               ` [PATCH] rcu: increment quiescent state counter in ksoftirqd() Eric Dumazet
  2009-03-02 10:55               ` [PATCH] iptables: lock free counters Patrick McHardy
  3 siblings, 2 replies; 83+ messages in thread
From: Eric Dumazet @ 2009-02-27 14:02 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, Patrick McHardy, Rick Jones, netdev,
	netfilter-devel, Paul E. McKenney

Eric Dumazet a écrit :
> Stephen Hemminger a écrit :
>> The reader/writer lock in ip_tables is acquired in the critical path of
>> processing packets and is one of the reasons just loading iptables can cause
>> a 20% performance loss. The rwlock serves two functions:
>> [...]
>> Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
>
> Acked-by: Eric Dumazet <dada1@cosmosbay.com>
>
> Successfully tested on my dual quad core machine too, but iptables only (no ipv6 here)
>
> BTW, my new "tbench 8" result is 2450 MB/s (it was 2150 MB/s not so long ago)
>
> Thanks Stephen, that's very cool stuff, yet another rwlock out of the kernel :)

While testing multicast flooding stuff, I found that "iptables -nvL" can
have a *very* slow response time on my dual quad core machine...

LatencyTOP version 0.5       (C) 2008 Intel Corporation

Cause                                                 Maximum     Percentage
synchronize_rcu synchronize_net do_ipt_get_ctl nf_    1878.6 msec  3.1 %
Scheduler: waiting for cpu                             160.3 msec 13.6 %
do_get_write_access journal_get_write_access __ext     11.0 msec  0.0 %
do_get_write_access journal_get_write_access __ext      7.7 msec  0.0 %
poll_schedule_timeout do_select core_sys_select sy      4.9 msec  0.0 %
do_wait sys_wait4 sys_waitpid sysenter_do_call          3.4 msec  0.1 %
call_usermodehelper_exec request_module netlink_cr      1.6 msec  0.0 %
__skb_recv_datagram skb_recv_datagram raw_recvmsg       1.5 msec  0.0 %
do_wait sys_wait4 sysenter_do_call                      0.7 msec  0.0 %

# time iptables -nvL
Chain INPUT (policy ACCEPT 416M packets, 64G bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain FORWARD (policy ACCEPT 0 packets, 0 bytes)
 pkts bytes target     prot opt in     out     source               destination

Chain OUTPUT (policy ACCEPT 401M packets, 62G bytes)
 pkts bytes target     prot opt in     out     source               destination

real	0m1.810s
user	0m0.000s
sys	0m0.001s

CONFIG_NO_HZ=y
CONFIG_HZ_1000=y
CONFIG_HZ=1000

One cpu is 100% handling softirqs, could it be the problem?
Cpu0  :  1.0%us, 14.7%sy,  0.0%ni, 83.3%id,  0.0%wa,  0.0%hi,   1.0%si,  0.0%st
Cpu1  :  3.6%us, 23.2%sy,  0.0%ni, 71.6%id,  0.0%wa,  0.0%hi,   1.7%si,  0.0%st
Cpu2  :  0.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi, 100.0%si,  0.0%st
Cpu3  :  2.7%us, 23.9%sy,  0.0%ni, 71.1%id,  0.7%wa,  0.0%hi,   1.7%si,  0.0%st
Cpu4  :  1.3%us, 14.3%sy,  0.0%ni, 83.3%id,  0.0%wa,  0.0%hi,   1.0%si,  0.0%st
Cpu5  :  1.0%us, 14.2%sy,  0.0%ni, 83.4%id,  0.0%wa,  0.0%hi,   1.3%si,  0.0%st
Cpu6  :  0.3%us,  7.0%sy,  0.0%ni, 92.4%id,  0.0%wa,  0.0%hi,   0.3%si,  0.0%st
Cpu7  :  0.7%us,  8.0%sy,  0.0%ni, 90.0%id,  0.7%wa,  0.0%hi,   0.7%si,  0.0%st

^ permalink raw reply	[flat|nested] 83+ messages in thread
* [PATCH] rcu: increment quiescent state counter in ksoftirqd()
  2009-02-27 14:02             ` [PATCH] iptables: lock free counters Eric Dumazet
@ 2009-02-27 16:08               ` Eric Dumazet
  2009-02-27 16:34                 ` Paul E. McKenney
  0 siblings, 1 reply; 83+ messages in thread
From: Eric Dumazet @ 2009-02-27 16:08 UTC (permalink / raw)
  To: Paul E. McKenney
  Cc: Stephen Hemminger, David Miller, Patrick McHardy, Rick Jones,
	netdev, netfilter-devel, linux kernel

Eric Dumazet a écrit :
> While testing multicast flooding stuff, I found that "iptables -nvL" can
> have a *very* slow response time on my dual quad core machine...
>
> # time iptables -nvL
> [...]
> real	0m1.810s   <<<< HERE >>>>
> user	0m0.000s
> sys	0m0.001s
>
> CONFIG_NO_HZ=y
> CONFIG_HZ_1000=y
> CONFIG_HZ=1000
>
> One cpu is 100% handling softirqs, could it be the problem?
>
> Cpu2  :  0.0%us,  0.0%sy,  0.0%ni,  0.0%id,  0.0%wa,  0.0%hi, 100.0%si,  0.0%st
> [other cpus mostly idle; trimmed]

Hi Paul

I found the following patch helps if one cpu is looping inside ksoftirqd().

synchronize_rcu() now completes in 40 ms instead of 1800 ms.

Thank you

[PATCH] rcu: increment quiescent state counter in ksoftirqd()

If a machine is flooded by network frames, a cpu can loop 100% of its time
inside ksoftirqd() without calling schedule(). This can delay the RCU grace
period to insane values.

Adding a rcu_qsctr_inc() call in ksoftirqd() solves this problem.
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
---
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de..9041ea7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -626,6 +626,7 @@ static int ksoftirqd(void * __bind_cpu)
 			preempt_enable_no_resched();
 			cond_resched();
 			preempt_disable();
+			rcu_qsctr_inc((long)__bind_cpu);
 		}
 		preempt_enable();
 		set_current_state(TASK_INTERRUPTIBLE);

^ permalink raw reply related	[flat|nested] 83+ messages in thread
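A side note on the mechanics, since it explains both the stall and the one-line fix: classic RCU declares a grace period over once every cpu has passed through a quiescent state, which it detects by watching a per-cpu counter advance. A cpu that spins in softirq forever never advances its counter, so synchronize_rcu() just waits. A toy userspace model of that counting follows; it is an illustration of the idea, not the kernel's implementation:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

#define NCPUS 4
static atomic_ulong qsctr[NCPUS];	/* per-cpu quiescent state counter */

static void rcu_qsctr_inc(int cpu)	/* "I passed a quiescent state" */
{
	atomic_fetch_add(&qsctr[cpu], 1);
}

/* grace period: wait until every cpu's counter moves past a snapshot */
static void toy_synchronize_rcu(void)
{
	unsigned long snap[NCPUS];

	for (int i = 0; i < NCPUS; i++)
		snap[i] = atomic_load(&qsctr[i]);
	for (int i = 0; i < NCPUS; i++)
		while (atomic_load(&qsctr[i]) == snap[i])
			;	/* spins forever if cpu i never reports */
}

static void *cpu_loop(void *arg)
{
	int cpu = (int)(long)arg;

	for (;;) {
		/* ... process softirq work ... */
		usleep(1000);
		rcu_qsctr_inc(cpu);	/* the line the patch adds */
	}
	return NULL;
}

int main(void)
{
	pthread_t t[NCPUS];

	for (long i = 0; i < NCPUS; i++)
		pthread_create(&t[i], NULL, cpu_loop, (void *)i);
	toy_synchronize_rcu();
	puts("grace period completed");
	return 0;
}

Commenting out the rcu_qsctr_inc() call in one thread makes toy_synchronize_rcu() hang, which is exactly the 1800 ms stall (bounded only by the scheduler) that Eric measured.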
* Re: [PATCH] rcu: increment quiescent state counter in ksoftirqd()
  2009-02-27 16:08               ` [PATCH] rcu: increment quiescent state counter in ksoftirqd() Eric Dumazet
@ 2009-02-27 16:34                 ` Paul E. McKenney
  0 siblings, 0 replies; 83+ messages in thread
From: Paul E. McKenney @ 2009-02-27 16:34 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, David Miller, Patrick McHardy, Rick Jones,
	netdev, netfilter-devel, linux kernel

On Fri, Feb 27, 2009 at 05:08:04PM +0100, Eric Dumazet wrote:
> [earlier thread context trimmed]
>
> Hi Paul
>
> I found the following patch helps if one cpu is looping inside ksoftirqd().
>
> synchronize_rcu() now completes in 40 ms instead of 1800 ms.
>
> Thank you
>
> [PATCH] rcu: increment quiescent state counter in ksoftirqd()
>
> If a machine is flooded by network frames, a cpu can loop 100% of its time
> inside ksoftirqd() without calling schedule().
> This can delay the RCU grace period to insane values.
>
> Adding a rcu_qsctr_inc() call in ksoftirqd() solves this problem.

Good catch!!!  This regression was a result of the recent change from
"schedule()" to "cond_resched()", which got rid of that quiescent state
in the common case where a reschedule is not needed.

Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>

> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
> [patch quoted in full above; trimmed]

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-02-27 14:02             ` [PATCH] iptables: lock free counters Eric Dumazet
  2009-02-27 16:08               ` [PATCH] rcu: increment quiescent state counter in ksoftirqd() Eric Dumazet
@ 2009-03-02 10:55               ` Patrick McHardy
  2009-03-02 17:47                 ` Eric Dumazet
  1 sibling, 1 reply; 83+ messages in thread
From: Patrick McHardy @ 2009-03-02 10:55 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, David Miller, Rick Jones, netdev,
	netfilter-devel, Paul E. McKenney

Eric Dumazet wrote:
> # time iptables -nvL
> Chain INPUT (policy ACCEPT 416M packets, 64G bytes)
>  pkts bytes target     prot opt in     out     source               destination
>
> Chain FORWARD (policy ACCEPT 0 packets, 0 bytes)
>  pkts bytes target     prot opt in     out     source               destination
>
> Chain OUTPUT (policy ACCEPT 401M packets, 62G bytes)
>  pkts bytes target     prot opt in     out     source               destination
>
> real	0m1.810s
> user	0m0.000s
> sys	0m0.001s

That's really slow ...

> CONFIG_NO_HZ=y
> CONFIG_HZ_1000=y
> CONFIG_HZ=1000
>
> One cpu is 100% handling softirqs, could it be the problem?

Is this fixed by your RCU quiescent state fix?

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-03-02 10:55               ` [PATCH] iptables: lock free counters Patrick McHardy
@ 2009-03-02 17:47                 ` Eric Dumazet
  2009-03-02 21:56                   ` Patrick McHardy
  0 siblings, 1 reply; 83+ messages in thread
From: Eric Dumazet @ 2009-03-02 17:47 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Stephen Hemminger, David Miller, Rick Jones, netdev,
	netfilter-devel, Paul E. McKenney

Patrick McHardy a écrit :
> Eric Dumazet wrote:
>> # time iptables -nvL
>> [...]
>> real	0m1.810s
>> user	0m0.000s
>> sys	0m0.001s
>
> That's really slow ...
>
>> CONFIG_NO_HZ=y
>> CONFIG_HZ_1000=y
>> CONFIG_HZ=1000
>>
>> One cpu is 100% handling softirqs, could it be the problem?
>
> Is this fixed by your RCU quiescent state fix?

Yes it is :)

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-03-02 17:47                 ` Eric Dumazet
@ 2009-03-02 21:56                   ` Patrick McHardy
  2009-03-02 22:02                     ` Stephen Hemminger
  0 siblings, 1 reply; 83+ messages in thread
From: Patrick McHardy @ 2009-03-02 21:56 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Stephen Hemminger, David Miller, Rick Jones, netdev,
	netfilter-devel, Paul E. McKenney

Eric Dumazet wrote:
> Patrick McHardy a écrit :
>> Eric Dumazet wrote:
>>> real	0m1.810s
>>> user	0m0.000s
>>> sys	0m0.001s
>> That's really slow ...
>>
>>> CONFIG_NO_HZ=y
>>> CONFIG_HZ_1000=y
>>> CONFIG_HZ=1000
>>>
>>> One cpu is 100% handling softirqs, could it be the problem?
>> Is this fixed by your RCU quiescent state fix?
>
> Yes it is :)

Great, thanks :)

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-03-02 21:56                   ` Patrick McHardy
@ 2009-03-02 22:02                     ` Stephen Hemminger
  2009-03-02 22:07                       ` Patrick McHardy
  2009-03-02 22:27                       ` Eric Dumazet
  0 siblings, 2 replies; 83+ messages in thread
From: Stephen Hemminger @ 2009-03-02 22:02 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Eric Dumazet, David Miller, Rick Jones, netdev, netfilter-devel,
	Paul E. McKenney

On Mon, 02 Mar 2009 22:56:39 +0100
Patrick McHardy <kaber@trash.net> wrote:

> Eric Dumazet wrote:
>> Patrick McHardy a écrit :
>>> [timing results quoted above; trimmed]
>>> Is this fixed by your RCU quiescent state fix?
>>
>> Yes it is :)
>
> Great, thanks :)

I wonder if the RCU quiescent state fix should go into 2.6.29, because it
fixes other issues like route-change RCU latency under DoS attack.

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-03-02 22:02                     ` Stephen Hemminger
@ 2009-03-02 22:07                       ` Patrick McHardy
  2009-03-02 22:17                         ` Paul E. McKenney
  0 siblings, 1 reply; 83+ messages in thread
From: Patrick McHardy @ 2009-03-02 22:07 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, David Miller, Rick Jones, netdev, netfilter-devel,
	Paul E. McKenney

Stephen Hemminger wrote:
> On Mon, 02 Mar 2009 22:56:39 +0100
> Patrick McHardy <kaber@trash.net> wrote:
>> [trimmed]
>
> I wonder if the RCU quiescent state fix should go into 2.6.29, because it
> fixes other issues like route-change RCU latency under DoS attack.

From what I can tell, it should.

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-03-02 22:07                       ` Patrick McHardy
@ 2009-03-02 22:17                         ` Paul E. McKenney
  0 siblings, 0 replies; 83+ messages in thread
From: Paul E. McKenney @ 2009-03-02 22:17 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Stephen Hemminger, Eric Dumazet, David Miller, Rick Jones,
	netdev, netfilter-devel

On Mon, Mar 02, 2009 at 11:07:18PM +0100, Patrick McHardy wrote:
> Stephen Hemminger wrote:
>> I wonder if the RCU quiescent state fix should go into 2.6.29, because it
>> fixes other issues like route-change RCU latency under DoS attack.
>
> From what I can tell, it should.

I agree.

							Thanx, Paul

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] iptables: lock free counters
  2009-03-02 22:02                     ` Stephen Hemminger
  2009-03-02 22:07                       ` Patrick McHardy
@ 2009-03-02 22:27                       ` Eric Dumazet
  1 sibling, 0 replies; 83+ messages in thread
From: Eric Dumazet @ 2009-03-02 22:27 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Patrick McHardy, David Miller, Rick Jones, netdev,
	netfilter-devel, Paul E. McKenney

Stephen Hemminger a écrit :
> On Mon, 02 Mar 2009 22:56:39 +0100
> Patrick McHardy <kaber@trash.net> wrote:
>> [trimmed]
>
> I wonder if the RCU quiescent state fix should go into 2.6.29, because it
> fixes other issues like route-change RCU latency under DoS attack.

Yes, probably, and in stable versions too, since this problem is quite old...

^ permalink raw reply	[flat|nested] 83+ messages in thread
* [RFT 2/4] Add mod_timer_noact
  2009-02-18  5:19 [RFT 0/4] Netfilter/iptables performance improvements Stephen Hemminger
  2009-02-18  5:19 ` [RFT 1/4] iptables: lock free counters Stephen Hemminger
@ 2009-02-18  5:19 ` Stephen Hemminger
  2009-02-18  9:20   ` Ingo Molnar
  2009-02-18 10:29   ` [RFT 2/4] Add mod_timer_noact Patrick McHardy
  2009-02-18  5:19 ` [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock Stephen Hemminger
  ` (2 subsequent siblings)
  4 siblings, 2 replies; 83+ messages in thread
From: Stephen Hemminger @ 2009-02-18 5:19 UTC (permalink / raw)
  To: David Miller, Patrick McHardy, Rick Jones, Eric Dumazet
  Cc: netdev, netfilter-devel, tglx, Martin Josefsson

[-- Attachment #1: mod_timer_noact.patch --]
[-- Type: text/plain, Size: 4828 bytes --]

Introduce mod_timer_noact(), which is intended, for example, to replace the
calls to del_timer()/add_timer() in __nf_ct_refresh_acct(). It works like
mod_timer() but doesn't activate or modify the timeout of an inactive timer,
which is the behaviour we want in order to be able to use timers as a means
of synchronization in nf_conntrack.

A later patch will modify __nf_ct_refresh_acct() to use mod_timer_noact(),
which will then save one spin_lock_irqsave() / spin_unlock_irqrestore() pair
per conntrack timer update. This will also get rid of the race we currently
have, without adding more locking in nf_conntrack.

Signed-off-by: Martin Josefsson <gandalf@wlug.westbo.se>

---
 include/linux/timer.h |    8 ++++++--
 kernel/relay.c        |    2 +-
 kernel/timer.c        |   40 +++++++++++++++++++++++++++++++++++-----
 3 files changed, 42 insertions(+), 8 deletions(-)

--- a/include/linux/timer.h	2009-02-17 10:55:33.427785986 -0800
+++ b/include/linux/timer.h	2009-02-17 11:04:10.291844534 -0800
@@ -25,6 +25,9 @@ struct timer_list {
 
 extern struct tvec_base boot_tvec_bases;
 
+#define TIMER_ACT	1
+#define TIMER_NOACT	0
+
 #define TIMER_INITIALIZER(_function, _expires, _data) {	\
 		.entry = { .prev = TIMER_ENTRY_STATIC },	\
 		.function = (_function),			\
@@ -86,8 +89,9 @@ static inline int timer_pending(const st
 extern void add_timer_on(struct timer_list *timer, int cpu);
 extern int del_timer(struct timer_list * timer);
-extern int __mod_timer(struct timer_list *timer, unsigned long expires);
+extern int __mod_timer(struct timer_list *timer, unsigned long expires, int activate);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
+extern int mod_timer_noact(struct timer_list *timer, unsigned long expires);
 
 /*
  * The jiffies value which is added to now, when there is no timer
@@ -163,7 +167,7 @@ static inline void timer_stats_timer_cle
 static inline void add_timer(struct timer_list *timer)
 {
 	BUG_ON(timer_pending(timer));
-	__mod_timer(timer, timer->expires);
+	__mod_timer(timer, timer->expires, TIMER_ACT);
 }
 
 #ifdef CONFIG_SMP
--- a/kernel/timer.c	2009-02-17 10:55:33.403580297 -0800
+++ b/kernel/timer.c	2009-02-17 11:04:10.291844534 -0800
@@ -589,7 +589,7 @@ static struct tvec_base *lock_timer_base
 	}
 }
 
-int __mod_timer(struct timer_list *timer, unsigned long expires)
+int __mod_timer(struct timer_list *timer, unsigned long expires, int activate)
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
@@ -603,7 +603,8 @@ int __mod_timer(struct timer_list *timer
 	if (timer_pending(timer)) {
 		detach_timer(timer, 0);
 		ret = 1;
-	}
+	} else if (activate == TIMER_NOACT)
+		goto out_unlock;
 
 	debug_timer_activate(timer);
 
@@ -629,8 +630,9 @@ int __mod_timer(struct timer_list *timer
 
 	timer->expires = expires;
 	internal_add_timer(base, timer);
-	spin_unlock_irqrestore(&base->lock, flags);
 
+out_unlock:
+	spin_unlock_irqrestore(&base->lock, flags);
 	return ret;
 }
 
@@ -699,11 +701,39 @@ int mod_timer(struct timer_list *timer,
 	if (timer->expires == expires && timer_pending(timer))
 		return 1;
 
-	return __mod_timer(timer, expires);
+	return __mod_timer(timer, expires, TIMER_ACT);
 }
 
 EXPORT_SYMBOL(mod_timer);
 
+/***
+ * mod_timer_noact - modify a timer's timeout
+ * @timer: the timer to be modified
+ *
+ * mod_timer_noact works like mod_timer except that it doesn't activate an
+ * inactive timer; instead it returns without updating timer->expires.
+ *
+ * The function returns whether it has modified a pending timer or not.
+ * (ie. mod_timer_noact() of an inactive timer returns 0, mod_timer_noact() of
+ * an active timer returns 1.)
+ */
+int mod_timer_noact(struct timer_list *timer, unsigned long expires)
+{
+	BUG_ON(!timer->function);
+
+	/*
+	 * This is a common optimization triggered by the
+	 * networking code - if the timer is re-modified
+	 * to be the same thing then just return:
+	 */
+	if (timer->expires == expires && timer_pending(timer))
+		return 1;
+
+	return __mod_timer(timer, expires, TIMER_NOACT);
+}
+
+EXPORT_SYMBOL(mod_timer_noact);
+
 /**
  * del_timer - deactive a timer.
  * @timer: the timer to be deactivated
@@ -1268,7 +1298,7 @@ signed long __sched schedule_timeout(sig
 	expire = timeout + jiffies;
 
 	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire);
+	__mod_timer(&timer, expire, TIMER_ACT);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 
--- a/kernel/relay.c	2009-02-17 10:55:33.416279439 -0800
+++ b/kernel/relay.c	2009-02-17 11:04:10.291844534 -0800
@@ -750,7 +750,7 @@ size_t relay_switch_subbuf(struct rchan_
 		 * from the scheduler (trying to re-grab
 		 * rq->lock), so defer it.
 		 */
-		__mod_timer(&buf->timer, jiffies + 1);
+		__mod_timer(&buf->timer, jiffies + 1, TIMER_ACT);
 	}
 
 	old = buf->data;
--

^ permalink raw reply	[flat|nested] 83+ messages in thread
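To make the intended use concrete: the conntrack refresh path wants "push the timeout out, but only if the timer is still pending", because an already-inactive timer means the conntrack is on its way to destruction and must not be re-armed. A kernel-style sketch of what the later 3/4 patch presumably does with the new helper (an assumption for illustration, not the actual patch text):

/* sketch only: refresh a conntrack's timeout without del_timer()/add_timer() */
static void ct_refresh_timeout(struct nf_conn *ct, unsigned long extra_jiffies)
{
	/* Single base->lock round-trip. If the timer is pending, its
	 * deadline is pushed out; if it is already inactive (conntrack
	 * being destroyed), nothing happens and 0 is returned, so the
	 * teardown in progress is never raced against. */
	mod_timer_noact(&ct->timeout, jiffies + extra_jiffies);
}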
* Re: [RFT 2/4] Add mod_timer_noact
  2009-02-18  5:19 ` [RFT 2/4] Add mod_timer_noact Stephen Hemminger
@ 2009-02-18  9:20   ` Ingo Molnar
  2009-02-18  9:30     ` David Miller
  2009-02-18 10:07     ` Patrick McHardy
  1 sibling, 2 replies; 83+ messages in thread
From: Ingo Molnar @ 2009-02-18 9:20 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, Patrick McHardy, Rick Jones, Eric Dumazet, netdev,
	netfilter-devel, tglx, Martin Josefsson

* Stephen Hemminger <shemminger@vyatta.com> wrote:

> Introduce mod_timer_noact(), which is intended, for example, to replace the
> calls to del_timer()/add_timer() in __nf_ct_refresh_acct(). It works like
> mod_timer() but doesn't activate or modify the timeout of an inactive timer,
> which is the behaviour we want in order to be able to use timers as a means
> of synchronization in nf_conntrack.
>
> A later patch will modify __nf_ct_refresh_acct() to use mod_timer_noact(),
> which will then save one spin_lock_irqsave() / spin_unlock_irqrestore() pair
> per conntrack timer update. This will also get rid of the race we currently
> have, without adding more locking in nf_conntrack.
>
> Signed-off-by: Martin Josefsson <gandalf@wlug.westbo.se>
>
> ---
>  include/linux/timer.h |    8 ++++++--
>  kernel/relay.c        |    2 +-
>  kernel/timer.c        |   40 +++++++++++++++++++++++++++++++++++-----
>  3 files changed, 42 insertions(+), 8 deletions(-)
>
> --- a/include/linux/timer.h	2009-02-17 10:55:33.427785986 -0800
> +++ b/include/linux/timer.h	2009-02-17 11:04:10.291844534 -0800
> @@ -25,6 +25,9 @@ struct timer_list {
>
>  extern struct tvec_base boot_tvec_bases;
>
> +#define TIMER_ACT	1
> +#define TIMER_NOACT	0

Ugly flaggery.

> -extern int __mod_timer(struct timer_list *timer, unsigned long expires);
> +extern int __mod_timer(struct timer_list *timer, unsigned long expires, int activate);

This is not really acceptable, it slows down every single add_timer()
and mod_timer() call in the kernel with a flag that has one specific
value in all but your case. There's more than 2000 such callsites in
the kernel.

Why don't you use something like this instead:

	if (del_timer(timer))
		add_timer(timer);

?

	Ingo

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 9:20 ` Ingo Molnar @ 2009-02-18 9:30 ` David Miller 2009-02-18 11:01 ` Ingo Molnar 2009-02-18 10:07 ` Patrick McHardy 1 sibling, 1 reply; 83+ messages in thread From: David Miller @ 2009-02-18 9:30 UTC (permalink / raw) To: mingo Cc: shemminger, kaber, rick.jones2, dada1, netdev, netfilter-devel, tglx, gandalf From: Ingo Molnar <mingo@elte.hu> Date: Wed, 18 Feb 2009 10:20:41 +0100 > Why dont you use something like this instead: > > if (del_timer(timer)) > add_timer(timer); > > ? Why don't you read his commit message? At least show him that much respect if you're going to be against his patch. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 9:30 ` David Miller @ 2009-02-18 11:01 ` Ingo Molnar 2009-02-18 11:39 ` Jarek Poplawski ` (2 more replies) 0 siblings, 3 replies; 83+ messages in thread From: Ingo Molnar @ 2009-02-18 11:01 UTC (permalink / raw) To: David Miller Cc: shemminger, kaber, rick.jones2, dada1, netdev, netfilter-devel, tglx, gandalf, linux-kernel * David Miller <davem@davemloft.net> wrote: > From: Ingo Molnar <mingo@elte.hu> > Date: Wed, 18 Feb 2009 10:20:41 +0100 > > > Why dont you use something like this instead: > > > > if (del_timer(timer)) > > add_timer(timer); > > > > ? > > Why don't you read his commit message? Uhm, of course i have read this piece of non-info: | Introduce mod_timer_noact() which for example is to replace | the calls to del_timer()/add_timer() in | __nf_ct_refresh_acct(). It works like mod_timer() but doesn't | activate or modify the timeout of an inactive timer which is | the behaviour we want in order to be able to use timers as a | means of synchronization in nf_conntrack. It does not mention the overhead to the regular timer interfaces at all, nor does it explain the reasons for this change adequately. And that's why i'm asking, why is the sequence i suggested above inadequate? If del_timer(timer) returns 1 it means the timer was active - and we call add_timer() only in that case. I.e. we dont activate or modify the timeout of an inactive timer. It can _only_ make a difference in the narrow special case of code using the timer list lock as serialization: but that is a pretty poor solution in this proposed form as it slows down the other 2000 users of timers for no good reason. The changelog was completely silent about that overhead aspect (which is small but real), and i refuse to believe that this effect was not realized. In other words, the changelog is useless and even borderline deceptive. Not a good sign if you are trying to get a patch accepted to the kernel. Furthermore, no performance figures were posted along with this modification - it only stated that these are "performance improvements". Especially in cases where a change slows down the common case the showing of a very substantial performance benefit is a must-have, before a patch is considered for upstream merging. You might be aware of that and you might have planned to provide such info in the future, but the changelog and the submission does not show any realization of this necessity, so i'm asking for that here out of caution, to make sure it's done. In fact, the submission incorrectly stated: | This patch set is against Patrick's netfilter next tree since | it is where it should end up. | | git.kernel.org:/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git This is wrong, the "netfilter next tree" is not where the "Add mod_timer_noact" change should end up, and you should ask your contributors to submit changes to other subsystems to their respective maintainer trees - the timer tree in this case. > At least show him that much respect if you're going to be > against his patch. Firstly, let me make clear what happened here. Deep buried inside a networking patchset, Cc:-ed to the netdev and netfilter lists only, a core kernel change is embedded that in essence modifies 2000 callsites of the generic kernel. The patch was not Cc:-ed to lkml. Secondly, all i'm doing here is reviewing patches of subsystems i maintain, so please stop attacking me for doing my job. I noticed it because i read a lot of lists, but still, this was not done transparently at all. 
Please show minimal respect to Linux and post core kernel patches to
lkml, and ask your sub-maintainers to do likewise. If there's someone
who has a moral basis for flaming here, it's me, not you.

So, please, at minimum, follow this well-established protocol of
contribution:

 - Post timer patches to lkml (the mailing list mentioned in the
   MAINTAINERS file), just like you expect networking patches to be
   posted to netdev. It's basic courtesy and not doing so is at
   minimum a double standard.

 - Declare negative performance impact to the common case very
   prominently in the changelog, and include analysis about why it's
   worth paying the price.

 - Include measurements that show clear positive performance impact at
   the new usage site - which offsets the negative micro-costs that
   every other usage site pays.

 - Require your sub-contributors to write meaningful changelogs that
   mention every substantial effect of a change, especially when they
   change core kernel facilities. For example:

       Impact: add new API, slow down old APIs a tiny bit

   would have alerted people straight away. I had to read the actual
   patch to figure out this key information.

I'm also utterly puzzled by your apparent desire to flame me. This
patch is wrong on so many levels that it's not even funny - and you as
a long-time kernel contributor should have realized that straight
away. Instead you forced me into wasting time on this rather long
email (and you also forced the very unnecessary public embarrassment
of a contributor), for what should have been an 'oops, right, will
fix' routine matter.

	Thanks,

		Ingo

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 11:01 ` Ingo Molnar @ 2009-02-18 11:39 ` Jarek Poplawski 2009-02-18 12:37 ` Ingo Molnar 2009-02-18 12:33 ` Patrick McHardy 2009-02-18 21:39 ` David Miller 2 siblings, 1 reply; 83+ messages in thread From: Jarek Poplawski @ 2009-02-18 11:39 UTC (permalink / raw) To: Ingo Molnar Cc: David Miller, shemminger, kaber, rick.jones2, dada1, netdev, netfilter-devel, tglx, gandalf, linux-kernel On 18-02-2009 12:01, Ingo Molnar wrote: ... > that straight away. Instead you forced me into wasting time on > this rather long email (and you also forced the very unnecessary > public embarrasment of a contributor), for what should have been > an 'oops, right, will fix' routine matter. No problem! But next time use this shorter routine, please... Thanks, Jarek P. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 11:39 ` Jarek Poplawski @ 2009-02-18 12:37 ` Ingo Molnar 0 siblings, 0 replies; 83+ messages in thread From: Ingo Molnar @ 2009-02-18 12:37 UTC (permalink / raw) To: Jarek Poplawski Cc: David Miller, shemminger, kaber, rick.jones2, dada1, netdev, netfilter-devel, tglx, gandalf, linux-kernel * Jarek Poplawski <jarkao2@gmail.com> wrote: > On 18-02-2009 12:01, Ingo Molnar wrote: > ... > > that straight away. Instead you forced me into wasting time on > > this rather long email (and you also forced the very unnecessary > > public embarrasment of a contributor), for what should have been > > an 'oops, right, will fix' routine matter. > > No problem! But next time use this shorter routine, please... Correct, the "oops, right, will fix" should have come as a reply to my mail, obviously - i did not submit the patch after all. Instead i got this accusatory mail from davem which certainly did not help bring the issue forward ... Thanks, Ingo ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact
  2009-02-18 11:01         ` Ingo Molnar
  2009-02-18 11:39           ` Jarek Poplawski
@ 2009-02-18 12:33           ` Patrick McHardy
  2009-02-18 21:39           ` David Miller
  2 siblings, 0 replies; 83+ messages in thread
From: Patrick McHardy @ 2009-02-18 12:33 UTC (permalink / raw)
  To: Ingo Molnar
  Cc: David Miller, shemminger, rick.jones2, dada1, netdev,
	netfilter-devel, tglx, gandalf, linux-kernel

Ingo Molnar wrote:
> In other words, the changelog is useless and even borderline
> deceptive. Not a good sign if you are trying to get a patch
> accepted to the kernel.
>
> Furthermore, no performance figures were posted along with this
> modification - it only stated that these are "performance
> improvements". Especially in cases where a change slows down the
> common case, the showing of a very substantial performance
> benefit is a must-have before a patch is considered for
> upstream merging.

I think this is mainly a misunderstanding: Stephen posted these patches
as RFT so that Rick and Eric could run benchmarks; they were not
intended for merging at this time.

> In fact, the submission incorrectly stated:
>
> | This patch set is against Patrick's netfilter next tree since
> | it is where it should end up.
> |
> | git.kernel.org:/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git
>
> This is wrong, the "netfilter next tree" is not where the "Add
> mod_timer_noact" change should end up, and you should ask your
> contributors to submit changes to other subsystems to their
> respective maintainer trees - the timer tree in this case.

Absolutely, I wouldn't have taken it, and Dave wouldn't have taken
it from me, so no cause for alarm :)

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 11:01 ` Ingo Molnar 2009-02-18 11:39 ` Jarek Poplawski 2009-02-18 12:33 ` Patrick McHardy @ 2009-02-18 21:39 ` David Miller 2009-02-18 21:51 ` Ingo Molnar 2 siblings, 1 reply; 83+ messages in thread From: David Miller @ 2009-02-18 21:39 UTC (permalink / raw) To: mingo Cc: shemminger, kaber, rick.jones2, dada1, netdev, netfilter-devel, tglx, gandalf, linux-kernel From: Ingo Molnar <mingo@elte.hu> Date: Wed, 18 Feb 2009 12:01:44 +0100 > * David Miller <davem@davemloft.net> wrote: > > | Introduce mod_timer_noact() which for example is to replace > | the calls to del_timer()/add_timer() in > | __nf_ct_refresh_acct(). It works like mod_timer() but doesn't > | activate or modify the timeout of an inactive timer which is > | the behaviour we want in order to be able to use timers as a > | means of synchronization in nf_conntrack. > > It does not mention the overhead to the regular timer interfaces > at all, nor does it explain the reasons for this change > adequately. You (conveniently) skipped this part of his commit message, so I guess this is the part you didn't read very carefully: A later patch will modify __nf_ct_refresh_acct() to use mod_timer_noact() which will then save one spin_lock_irqsave() / spin_lock_irqrestore() pair per conntrack timer update. This will also get rid of the race we currently have without adding more locking in nf_conntrack. The whole point is to avoid two spin_lock_irqsave() sequences, thus taking the timer locks twice. So Ingo, when you say in response: Why don't you use? if (del_timer()) add_timer(); you really look foolish and, in fact, disrespectful to Stephen. This was my objection to your email, it proved that you didn't really read his changelog message. He explained perfectly well what the final goal was of his changes. And you have this knee-jerk reaction quite often. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 21:39 ` David Miller @ 2009-02-18 21:51 ` Ingo Molnar 2009-02-18 22:04 ` David Miller 0 siblings, 1 reply; 83+ messages in thread From: Ingo Molnar @ 2009-02-18 21:51 UTC (permalink / raw) To: David Miller Cc: shemminger, kaber, rick.jones2, dada1, netdev, netfilter-devel, tglx, gandalf, linux-kernel * David Miller <davem@davemloft.net> wrote: > From: Ingo Molnar <mingo@elte.hu> > Date: Wed, 18 Feb 2009 12:01:44 +0100 > > > * David Miller <davem@davemloft.net> wrote: > > > > | Introduce mod_timer_noact() which for example is to replace > > | the calls to del_timer()/add_timer() in > > | __nf_ct_refresh_acct(). It works like mod_timer() but doesn't > > | activate or modify the timeout of an inactive timer which is > > | the behaviour we want in order to be able to use timers as a > > | means of synchronization in nf_conntrack. > > > > It does not mention the overhead to the regular timer interfaces > > at all, nor does it explain the reasons for this change > > adequately. > > You (conveniently) skipped this part of his commit message, so > I guess this is the part you didn't read very carefully: > > A later patch will modify __nf_ct_refresh_acct() to use > mod_timer_noact() which will then save one spin_lock_irqsave() > / spin_lock_irqrestore() pair per conntrack timer update. This > will also get rid of the race we currently have without adding > more locking in nf_conntrack. > > The whole point is to avoid two spin_lock_irqsave() sequences, thus > taking the timer locks twice. > > So Ingo, when you say in response: > > Why don't you use? > > if (del_timer()) > add_timer(); > > you really look foolish and, in fact, disrespectful to Stephen. > > This was my objection to your email, it proved that you didn't > really read his changelog message. He explained perfectly well > what the final goal was of his changes. > > And you have this knee-jerk reaction quite often. You accusing me of knee-jerk reaction is the joke of the century ;-) Anyway, it's all handled, you just need to read the rest of the thread. Thanks, Ingo ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 21:51 ` Ingo Molnar @ 2009-02-18 22:04 ` David Miller 2009-02-18 22:42 ` Peter Zijlstra 0 siblings, 1 reply; 83+ messages in thread From: David Miller @ 2009-02-18 22:04 UTC (permalink / raw) To: mingo Cc: shemminger, kaber, rick.jones2, dada1, netdev, netfilter-devel, tglx, gandalf, linux-kernel From: Ingo Molnar <mingo@elte.hu> Date: Wed, 18 Feb 2009 22:51:40 +0100 > Anyway, it's all handled, you just need to read the rest of the > thread. I did read the entire thread before replying; my objection to your original posting still stands. And as others have pointed out you also failed to recognize the context of the patch posting. It was part of a sequence of patches for people to test some experimental netfilter performance optimizations. "RFT" was prefixed to every patch subject line, if any more indication was necessary. Yet you object that the patches are against the networking and netfilter trees. Again, your reactions were knee-jerk, by every definition of the term. I know how you work Ingo, you want to be fast and efficient. But often, your "fast and efficient" is "careless", and this wastes everyone else's time and in the final analysis makes you "slow". ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 22:04 ` David Miller @ 2009-02-18 22:42 ` Peter Zijlstra 2009-02-18 22:47 ` David Miller 0 siblings, 1 reply; 83+ messages in thread From: Peter Zijlstra @ 2009-02-18 22:42 UTC (permalink / raw) To: David Miller Cc: mingo, shemminger, kaber, rick.jones2, dada1, netdev, netfilter-devel, tglx, gandalf, linux-kernel On Wed, 2009-02-18 at 14:04 -0800, David Miller wrote: > And as others have pointed out you also failed to recognize > the context of the patch posting. It was part of a sequence > of patches for people to test some experimental netfilter > performance optimizations. "RFT" was prefixed to every patch > subject line, if any more indication was necessary. Be that as it may, it's a maintainer seeing a patch against his subsystem, reviewing it (albeit early -- we should all want to get around to reviewing that early) and asking for some clarification. The fact is, Steve's changelog was very unclear to people not intimately familiar with the problem space. Asking for some clarification just isn't weird in any way. > Yet you object that the patches are against the networking > and netfilter trees. > > Again, your reactions were knee-jerk, by every definition of the > term. > > I know how you work Ingo, you want to be fast and efficient. > But often, your "fast and efficient" is "careless", and this > wastes everyone else's time and in the final analysis makes > you "slow". Can we please leave it at this? The technical issue seems to be dealt with. You and Ingo seem to have a gift for rubbing each other the wrong way; it would be grand if you could both try to be a little forgiving and just focus on the code/technical issues which make Linux what it is: technically excellent ;-) ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 22:42 ` Peter Zijlstra @ 2009-02-18 22:47 ` David Miller 2009-02-18 22:56 ` Stephen Hemminger 0 siblings, 1 reply; 83+ messages in thread From: David Miller @ 2009-02-18 22:47 UTC (permalink / raw) To: peterz Cc: mingo, shemminger, kaber, rick.jones2, dada1, netdev, netfilter-devel, tglx, gandalf, linux-kernel From: Peter Zijlstra <peterz@infradead.org> Date: Wed, 18 Feb 2009 23:42:27 +0100 > Can we please leave it at this? The technical issue seems to be dealt > with. You and Ingo seem to have a gift for rubbing each other the wrong way; > it would be grand if you could both try to be a little forgiving and > just focus on the code/technical issues which make Linux what it is: > technically excellent ;-) Like it or not, open source development is as much about people and their personalities as it is about technical issues. So every time someone says to concentrate on the technical issues and get past the personalities, they really are missing the point, and at best are being naive. The Linux kernel has been shaped as much by overtly emotional discourse and personal interaction as by any technical achievement. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 22:47 ` David Miller @ 2009-02-18 22:56 ` Stephen Hemminger 0 siblings, 0 replies; 83+ messages in thread From: Stephen Hemminger @ 2009-02-18 22:56 UTC (permalink / raw) To: David Miller Cc: peterz, mingo, kaber, rick.jones2, dada1, netdev, netfilter-devel, tglx, gandalf, linux-kernel On Wed, 18 Feb 2009 14:47:41 -0800 (PST) David Miller <davem@davemloft.net> wrote: > From: Peter Zijlstra <peterz@infradead.org> > Date: Wed, 18 Feb 2009 23:42:27 +0100 > > > Can we please leave it at this? The technical issue seems to be dealt > > with. You and Ingo seem to have a gift for rubbing each other the wrong way; > > it would be grand if you could both try to be a little forgiving and > > just focus on the code/technical issues which make Linux what it is: > > technically excellent ;-) > > Like it or not, open source development is as much about people > and their personalities as it is about technical issues. > > So every time someone says to concentrate on the technical > issues and get past the personalities, they really are missing > the point, and at best are being naive. > > The Linux kernel has been shaped as much by overtly emotional discourse and > personal interaction as by any technical achievement. Everyone, please read and internalize what Matt had to say. He is right: the community needs to learn how to review: http://mdzlog.wordpress.com/2008/12/24/constructive-criticism/ ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 9:20 ` Ingo Molnar 2009-02-18 9:30 ` David Miller @ 2009-02-18 10:07 ` Patrick McHardy 2009-02-18 12:05 ` [patch] timers: add mod_timer_pending() Ingo Molnar 1 sibling, 1 reply; 83+ messages in thread From: Patrick McHardy @ 2009-02-18 10:07 UTC (permalink / raw) To: Ingo Molnar Cc: Stephen Hemminger, David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel, tglx, Martin Josefsson Ingo Molnar wrote: >> -extern int __mod_timer(struct timer_list *timer, unsigned long expires); >> +extern int __mod_timer(struct timer_list *timer, unsigned long expires, int activate); > > This is not really acceptable, it slows down every single > add_timer() and mod_timer() call in the kernel with a flag that > has one specific value in all but your case. There's more than > 2000 such callsites in the kernel. > > Why dont you use something like this instead: > > if (del_timer(timer)) > add_timer(timer); We need to avoid having a timer that was deleted by one CPU getting re-added by another, but want to avoid taking the conntrack lock for every timer update. The timer-internal locking is enough for this as long as we have a mod_timer variant that forwards a timer, but doesn't activate it in case it isn't active already. ^ permalink raw reply [flat|nested] 83+ messages in thread
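For readers following the thread, a minimal sketch of the pattern Patrick describes, written against the mod_timer_noact() API from patch 2/4 (the helper name and surrounding code here are illustrative, not taken from the posted patches):

	#include <linux/jiffies.h>
	#include <linux/timer.h>

	/* Forward a conntrack timeout without taking nf_conntrack_lock.
	 *
	 * The obvious
	 *	if (del_timer(timeout))
	 *		add_timer(timeout);
	 * is racy: another CPU can delete the timer in the window between
	 * the two calls (e.g. on the conntrack destroy path) and then see
	 * it re-armed.  mod_timer_noact() does the pending check and the
	 * re-add under the timer base lock, so a deleted timer stays dead. */
	static void ct_timeout_refresh(struct timer_list *timeout,
				       unsigned long extra_jiffies)
	{
		mod_timer_noact(timeout, jiffies + extra_jiffies);
	}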
* [patch] timers: add mod_timer_pending() 2009-02-18 10:07 ` Patrick McHardy @ 2009-02-18 12:05 ` Ingo Molnar 2009-02-18 12:33 ` Patrick McHardy 2009-02-18 17:00 ` Oleg Nesterov 0 siblings, 2 replies; 83+ messages in thread From: Ingo Molnar @ 2009-02-18 12:05 UTC (permalink / raw) To: Patrick McHardy, Oleg Nesterov, Peter Zijlstra Cc: Stephen Hemminger, David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel, tglx, Martin Josefsson * Patrick McHardy <kaber@trash.net> wrote: > Ingo Molnar wrote: >>> -extern int __mod_timer(struct timer_list *timer, unsigned long expires); >>> +extern int __mod_timer(struct timer_list *timer, unsigned long expires, int activate); >> >> This is not really acceptable, it slows down every single add_timer() >> and mod_timer() call in the kernel with a flag that has one specific >> value in all but your case. There's more than 2000 such callsites in >> the kernel. >> >> Why dont you use something like this instead: >> >> if (del_timer(timer)) >> add_timer(timer); > > We need to avoid having a timer that was deleted by one CPU > getting re-added by another, but want to avoid taking the > conntrack lock for every timer update. The timer-internal > locking is enough for this as long as we have a mod_timer > variant that forwards a timer, but doesn't activate it in > case it isn't active already. that makes sense - but the implementation is still somewhat ugly. How about the one below instead? Not tested. One open question is this construct in mod_timer(): + /* + * This is a common optimization triggered by the + * networking code - if the timer is re-modified + * to be the same thing then just return: + */ + if (timer->expires == expires && timer_pending(timer)) + return 1; We've had this for ages, but it seems rather SMP-unsafe. timer_pending(), if used in an unserialized fashion, can be any random value in theory - there's no internal serialization here anywhere. We could end up with incorrectly not re-activating a timer in mod_timer() for example - have such things never been observed in practice? So the original patch which added this to mod_timer_noact() was racy i think, and we cannot preserve this optimization outside of the timer list lock. (we could do it inside of it.) Ingo -------------------> Subject: timers: add mod_timer_pending() From: Ingo Molnar <mingo@elte.hu> Date: Wed, 18 Feb 2009 12:23:29 +0100 Impact: new timer API Based on an idea from Stephen Hemminger: introduce mod_timer_pending() which is a mod_timer() offspring that is an invariant on already removed timers. (regular mod_timer() re-activates non-pending timers.) This is useful for the networking code in that it can allow unserialized mod_timer_pending() timer-forwarding calls, but a single del_timer*() will stop the timer from being reactivated again. Also while at it: - optimize the regular mod_timer() path some more, the timer-stat and a debug check was needlessly duplicated in __mod_timer(). - make the exports come straight after the function, as most other exports in timer.c already did. - eliminate __mod_timer() as an external API, change the users to mod_timer(). The regular mod_timer() code path is not impacted significantly, due to inlining optimizations and due to the simplifications - but performance testing would be nice nevertheless. 
Based-on-patch-from: Stephen Hemminger <shemminger@vyatta.com> Signed-off-by: Ingo Molnar <mingo@elte.hu> --- arch/powerpc/platforms/cell/spufs/sched.c | 2 drivers/infiniband/hw/ipath/ipath_driver.c | 6 - include/linux/timer.h | 22 ----- kernel/relay.c | 2 kernel/timer.c | 110 +++++++++++++++++++---------- 5 files changed, 80 insertions(+), 62 deletions(-) Index: linux/arch/powerpc/platforms/cell/spufs/sched.c =================================================================== --- linux.orig/arch/powerpc/platforms/cell/spufs/sched.c +++ linux/arch/powerpc/platforms/cell/spufs/sched.c @@ -508,7 +508,7 @@ static void __spu_add_to_rq(struct spu_c list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]); set_bit(ctx->prio, spu_prio->bitmap); if (!spu_prio->nr_waiting++) - __mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); + mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK); } } Index: linux/drivers/infiniband/hw/ipath/ipath_driver.c =================================================================== --- linux.orig/drivers/infiniband/hw/ipath/ipath_driver.c +++ linux/drivers/infiniband/hw/ipath/ipath_driver.c @@ -2715,7 +2715,7 @@ static void ipath_hol_signal_up(struct i * to prevent HoL blocking, then start the HoL timer that * periodically continues, then stop procs, so they can detect * link down if they want, and do something about it. - * Timer may already be running, so use __mod_timer, not add_timer. + * Timer may already be running, so use mod_timer, not add_timer. */ void ipath_hol_down(struct ipath_devdata *dd) { @@ -2724,7 +2724,7 @@ void ipath_hol_down(struct ipath_devdata dd->ipath_hol_next = IPATH_HOL_DOWNCONT; dd->ipath_hol_timer.expires = jiffies + msecs_to_jiffies(ipath_hol_timeout_ms); - __mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); } /* @@ -2763,7 +2763,7 @@ void ipath_hol_event(unsigned long opaqu else { dd->ipath_hol_timer.expires = jiffies + msecs_to_jiffies(ipath_hol_timeout_ms); - __mod_timer(&dd->ipath_hol_timer, + mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires); } } Index: linux/include/linux/timer.h =================================================================== --- linux.orig/include/linux/timer.h +++ linux/include/linux/timer.h @@ -161,8 +161,8 @@ static inline int timer_pending(const st extern void add_timer_on(struct timer_list *timer, int cpu); extern int del_timer(struct timer_list * timer); -extern int __mod_timer(struct timer_list *timer, unsigned long expires); extern int mod_timer(struct timer_list *timer, unsigned long expires); +extern int mod_timer_pending(struct timer_list *timer, unsigned long expires); /* * The jiffies value which is added to now, when there is no timer @@ -221,25 +221,7 @@ static inline void timer_stats_timer_cle } #endif -/** - * add_timer - start a timer - * @timer: the timer to be added - * - * The kernel will do a ->function(->data) callback from the - * timer interrupt at the ->expires point in the future. The - * current time is 'jiffies'. - * - * The timer's ->expires, ->function (and if the handler uses it, ->data) - * fields must be set prior calling this function. - * - * Timers with an ->expires field in the past will be executed in the next - * timer tick. 
- */ -static inline void add_timer(struct timer_list *timer) -{ - BUG_ON(timer_pending(timer)); - __mod_timer(timer, timer->expires); -} +extern void add_timer(struct timer_list *timer); #ifdef CONFIG_SMP extern int try_to_del_timer_sync(struct timer_list *timer); Index: linux/kernel/relay.c =================================================================== --- linux.orig/kernel/relay.c +++ linux/kernel/relay.c @@ -748,7 +748,7 @@ size_t relay_switch_subbuf(struct rchan_ * from the scheduler (trying to re-grab * rq->lock), so defer it. */ - __mod_timer(&buf->timer, jiffies + 1); + mod_timer(&buf->timer, jiffies + 1); } old = buf->data; Index: linux/kernel/timer.c =================================================================== --- linux.orig/kernel/timer.c +++ linux/kernel/timer.c @@ -600,11 +600,14 @@ static struct tvec_base *lock_timer_base } } -int __mod_timer(struct timer_list *timer, unsigned long expires) +static inline int +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { struct tvec_base *base, *new_base; unsigned long flags; - int ret = 0; + int ret; + + ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); @@ -614,6 +617,9 @@ int __mod_timer(struct timer_list *timer if (timer_pending(timer)) { detach_timer(timer, 0); ret = 1; + } else { + if (pending_only) + goto out_unlock; } debug_timer_activate(timer); @@ -640,42 +646,28 @@ int __mod_timer(struct timer_list *timer timer->expires = expires; internal_add_timer(base, timer); + +out_unlock: spin_unlock_irqrestore(&base->lock, flags); return ret; } -EXPORT_SYMBOL(__mod_timer); - /** - * add_timer_on - start a timer on a particular CPU - * @timer: the timer to be added - * @cpu: the CPU to start it on + * mod_timer_pending - modify a pending timer's timeout + * @timer: the pending timer to be modified + * @expires: new timeout in jiffies * - * This is not very scalable on SMP. Double adds are not possible. + * mod_timer_pending() is the same for pending timers as mod_timer(), + * but will not re-activate and modify already deleted timers. + * + * It is useful for unserialized use of timers. */ -void add_timer_on(struct timer_list *timer, int cpu) +int mod_timer_pending(struct timer_list *timer, unsigned long expires) { - struct tvec_base *base = per_cpu(tvec_bases, cpu); - unsigned long flags; - - timer_stats_timer_set_start_info(timer); - BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer_set_base(timer, base); - debug_timer_activate(timer); - internal_add_timer(base, timer); - /* - * Check whether the other CPU is idle and needs to be - * triggered to reevaluate the timer wheel when nohz is - * active. We are protected against the other CPU fiddling - * with the timer by holding the timer base lock. This also - * makes sure that a CPU on the way to idle can not evaluate - * the timer wheel. 
- */ - wake_up_idle_cpu(cpu); - spin_unlock_irqrestore(&base->lock, flags); + return __mod_timer(timer, expires, true); } +EXPORT_SYMBOL(mod_timer_pending); /** * mod_timer - modify a timer's timeout @@ -699,9 +691,6 @@ void add_timer_on(struct timer_list *tim */ int mod_timer(struct timer_list *timer, unsigned long expires) { - BUG_ON(!timer->function); - - timer_stats_timer_set_start_info(timer); /* * This is a common optimization triggered by the * networking code - if the timer is re-modified @@ -710,12 +699,62 @@ int mod_timer(struct timer_list *timer, if (timer->expires == expires && timer_pending(timer)) return 1; - return __mod_timer(timer, expires); + return __mod_timer(timer, expires, false); } - EXPORT_SYMBOL(mod_timer); /** + * add_timer - start a timer + * @timer: the timer to be added + * + * The kernel will do a ->function(->data) callback from the + * timer interrupt at the ->expires point in the future. The + * current time is 'jiffies'. + * + * The timer's ->expires, ->function (and if the handler uses it, ->data) + * fields must be set prior calling this function. + * + * Timers with an ->expires field in the past will be executed in the next + * timer tick. + */ +void add_timer(struct timer_list *timer) +{ + BUG_ON(timer_pending(timer)); + mod_timer(timer, timer->expires); +} +EXPORT_SYMBOL(add_timer); + +/** + * add_timer_on - start a timer on a particular CPU + * @timer: the timer to be added + * @cpu: the CPU to start it on + * + * This is not very scalable on SMP. Double adds are not possible. + */ +void add_timer_on(struct timer_list *timer, int cpu) +{ + struct tvec_base *base = per_cpu(tvec_bases, cpu); + unsigned long flags; + + timer_stats_timer_set_start_info(timer); + BUG_ON(timer_pending(timer) || !timer->function); + spin_lock_irqsave(&base->lock, flags); + timer_set_base(timer, base); + debug_timer_activate(timer); + internal_add_timer(base, timer); + /* + * Check whether the other CPU is idle and needs to be + * triggered to reevaluate the timer wheel when nohz is + * active. We are protected against the other CPU fiddling + * with the timer by holding the timer base lock. This also + * makes sure that a CPU on the way to idle can not evaluate + * the timer wheel. + */ + wake_up_idle_cpu(cpu); + spin_unlock_irqrestore(&base->lock, flags); +} + +/** * del_timer - deactive a timer. * @timer: the timer to be deactivated * @@ -744,7 +783,6 @@ int del_timer(struct timer_list *timer) return ret; } - EXPORT_SYMBOL(del_timer); #ifdef CONFIG_SMP @@ -778,7 +816,6 @@ out: return ret; } - EXPORT_SYMBOL(try_to_del_timer_sync); /** @@ -816,7 +853,6 @@ int del_timer_sync(struct timer_list *ti cpu_relax(); } } - EXPORT_SYMBOL(del_timer_sync); #endif @@ -1314,7 +1350,7 @@ signed long __sched schedule_timeout(sig expire = timeout + jiffies; setup_timer_on_stack(&timer, process_timeout, (unsigned long)current); - __mod_timer(&timer, expire); + __mod_timer(&timer, expire, false); schedule(); del_singleshot_timer_sync(&timer); ^ permalink raw reply [flat|nested] 83+ messages in thread
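To summarize the semantics the patch above ends up with, a sketch (the helper and the WARN_ON usage are illustrative only, assuming the timer has not yet fired):

	#include <linux/jiffies.h>
	#include <linux/timer.h>

	/* mod_timer() arms the timer whether or not it is pending;
	 * mod_timer_pending() only forwards a timer that is still pending,
	 * so a timer already deleted by another CPU stays deleted. */
	static void timer_api_example(struct timer_list *t)
	{
		mod_timer(t, jiffies + 10 * HZ);	/* t is now pending */
		del_timer(t);				/* t is now inactive */

		/* refuses to resurrect it: returns 0, t stays inactive */
		WARN_ON(mod_timer_pending(t, jiffies + 10 * HZ) != 0);
	}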
* Re: [patch] timers: add mod_timer_pending() 2009-02-18 12:05 ` [patch] timers: add mod_timer_pending() Ingo Molnar @ 2009-02-18 12:33 ` Patrick McHardy 2009-02-18 12:50 ` Ingo Molnar 2009-02-18 17:00 ` Oleg Nesterov 1 sibling, 1 reply; 83+ messages in thread From: Patrick McHardy @ 2009-02-18 12:33 UTC (permalink / raw) To: Ingo Molnar Cc: Oleg Nesterov, Peter Zijlstra, Stephen Hemminger, David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel, tglx, Martin Josefsson Ingo Molnar wrote: > * Patrick McHardy <kaber@trash.net> wrote: > >> We need to avoid having a timer that was deleted by one CPU >> getting re-added by another, but want to avoid taking the >> conntrack lock for every timer update. The timer-internal >> locking is enough for this as long as we have a mod_timer >> variant that forwards a timer, but doesn't activate it in >> case it isn't active already. > > that makes sense - but the implementation is still somewhat > ugly. How about the one below instead? Not tested. This seems to fulfill our needs. I also like the mod_timer_pending() name better than mod_timer_noact(). > One open question is this construct in mod_timer(): > > + /* > + * This is a common optimization triggered by the > + * networking code - if the timer is re-modified > + * to be the same thing then just return: > + */ > + if (timer->expires == expires && timer_pending(timer)) > + return 1; > > We've had this for ages, but it seems rather SMP-unsafe. > timer_pending(), if used in an unserialized fashion, can be any > random value in theory - there's no internal serialization here > anywhere. > > We could end up with incorrectly not re-activating a timer in > mod_timer() for example - have such things never been observed > in practice? Yes, it seems racy if done for timers that might get activated. For forwarding only without activation it seems OK, in that case the timer_pending check doesn't seem necessary at all. > So the original patch which added this to mod_timer_noact() was > racy i think, and we cannot preserve this optimization outside > of the timer list lock. (we could do it inside of it.) ^ permalink raw reply [flat|nested] 83+ messages in thread
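The "inside the lock" variant of the same-expires optimization that Ingo and Patrick refer to would look roughly like this in __mod_timer() (a sketch, not a posted patch):

	base = lock_timer_base(timer, &flags);

	/* Same-expires fast path, now race-free: both the pending state
	 * and timer->expires are stable while base->lock is held. */
	if (timer_pending(timer) && timer->expires == expires) {
		ret = 1;
		goto out_unlock;
	}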
* Re: [patch] timers: add mod_timer_pending() 2009-02-18 12:33 ` Patrick McHardy @ 2009-02-18 12:50 ` Ingo Molnar 2009-02-18 12:54 ` Patrick McHardy 0 siblings, 1 reply; 83+ messages in thread From: Ingo Molnar @ 2009-02-18 12:50 UTC (permalink / raw) To: Patrick McHardy Cc: Oleg Nesterov, Peter Zijlstra, Stephen Hemminger, David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel, tglx, Martin Josefsson, linux-kernel * Patrick McHardy <kaber@trash.net> wrote: > Ingo Molnar wrote: >> * Patrick McHardy <kaber@trash.net> wrote: >> >>> We need to avoid having a timer that was deleted by one CPU >>> getting re-added by another, but want to avoid taking the >>> conntrack lock for every timer update. The timer-internal >>> locking is enough for this as long as we have a mod_timer >>> variant that forwards a timer, but doesn't activate it in >>> case it isn't active already. >> >> that makes sense - but the implementation is still somewhat ugly. How >> about the one below instead? Not tested. > > This seems to fulfill our needs. I also like the mod_timer_pending() > name better than mod_timer_noact(). > >> One open question is this construct in mod_timer(): >> >> + /* >> + * This is a common optimization triggered by the >> + * networking code - if the timer is re-modified >> + * to be the same thing then just return: >> + */ >> + if (timer->expires == expires && timer_pending(timer)) >> + return 1; >> >> We've had this for ages, but it seems rather SMP-unsafe. >> timer_pending(), if used in an unserialized fashion, can be any random >> value in theory - there's no internal serialization here anywhere. >> >> We could end up with incorrectly not re-activating a timer in >> mod_timer() for example - have such things never been observed in >> practice? > > Yes, it seems racy if done for timers that might get > activated. For forwarding only without activation it seems OK, > in that case the timer_pending check doesn't seem necessary at > all. ok. To accelerate matters i've committed the new API patch into a new standalone topic branch: tip:timers/new-apis. Unless there are objections or test failures, you (or Stephen or David) can then git-pull it into the networking tree via the Git coordinates below - and you'll get this single commit in a surgical manner - no other timer changes are included. Doing so has the advantage of: - You not having to wait a kernel cycle for the API to go upstream. - You can also push it upstream without waiting for the timer tree. (the timer tree and the networking tree will share the exact same commit) - It will also all merge cleanly with the timer tree in linux-next, etc. I'd suggest to do it in about a week, to make sure any after effects have trickled down and to make sure the topic has become append-only. You can ping Thomas and me about testing/review status then, whenever you want to do the pull. Ingo -------------> You can pull the latest timers/new-apis git tree from: git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip.git timers/new-apis Thanks, Ingo ------------------> Ingo Molnar (1): timers: add mod_timer_pending() arch/powerpc/platforms/cell/spufs/sched.c | 2 +- drivers/infiniband/hw/ipath/ipath_driver.c | 6 +- include/linux/timer.h | 22 +----- kernel/relay.c | 2 +- kernel/timer.c | 110 ++++++++++++++++++--------- 5 files changed, 80 insertions(+), 62 deletions(-) ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [patch] timers: add mod_timer_pending() 2009-02-18 12:50 ` Ingo Molnar @ 2009-02-18 12:54 ` Patrick McHardy 2009-02-18 13:47 ` Ingo Molnar 0 siblings, 1 reply; 83+ messages in thread From: Patrick McHardy @ 2009-02-18 12:54 UTC (permalink / raw) To: Ingo Molnar Cc: Oleg Nesterov, Peter Zijlstra, Stephen Hemminger, David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel, tglx, Martin Josefsson, linux-kernel Ingo Molnar wrote: > To accelerate matters i've committed the new API patch into a > new standalone topic branch: tip:timers/new-apis. > > Unless there are objections or test failures, you (or Stephen or > David) can then git-pull it into the networking tree via the Git > coordinates below - and you'll get this single commit in a > surgical manner - no other timer changes are included. > > Doing so has the advantage of: > > - You not having to wait a kernel cycle for the API to go > upstream. > > - You can also push it upstream without waiting for the timer > tree. (the timer tree and the networking tree will share the > exact same commit) > > - It will also all merge cleanly with the timer tree in > linux-next, etc. > > I'd suggest to do it in about a week, to make sure any after > effects have trickled down and to make sure the topic has become > append-only. You can ping Thomas and me about testing/review > status then, whenever you want to do the pull. Thanks Ingo. I'll wait for Stephen to rebase his patches on top of your change and the test results and will let you know. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [patch] timers: add mod_timer_pending() 2009-02-18 12:54 ` Patrick McHardy @ 2009-02-18 13:47 ` Ingo Molnar 0 siblings, 0 replies; 83+ messages in thread From: Ingo Molnar @ 2009-02-18 13:47 UTC (permalink / raw) To: Patrick McHardy Cc: Oleg Nesterov, Peter Zijlstra, Stephen Hemminger, David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel, tglx, Martin Josefsson, linux-kernel * Patrick McHardy <kaber@trash.net> wrote: > Ingo Molnar wrote: >> To accelerate matters i've committed the new API patch into a new >> standalone topic branch: tip:timers/new-apis. >> >> Unless there are objections or test failures, you (or Stephen or >> David) can then git-pull it into the networking tree via the Git >> coordinates below - and you'll get this single commit in a surgical >> manner - no other timer changes are included. >> >> Doing so has the advantage of: >> >> - You not having to wait a kernel cycle for the API to go >> upstream. >> >> - You can also push it upstream without waiting for the timer tree. >> (the timer tree and the networking tree will share the exact same >> commit) >> >> - It will also all merge cleanly with the timer tree in linux-next, >> etc. >> >> I'd suggest to do it in about a week, to make sure any after effects >> have trickled down and to make sure the topic has become append-only. >> You can ping Thomas and me about testing/review status then, whenever >> you want to do the pull. > > Thanks Ingo. I'll wait for Stephen to rebase his patches on > top of your change and the test results and will let you know. Stress-testing here in the last ~2 hours on eight x86 test-boxes showed no problems so far. Ingo ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [patch] timers: add mod_timer_pending() 2009-02-18 12:05 ` [patch] timers: add mod_timer_pending() Ingo Molnar 2009-02-18 12:33 ` Patrick McHardy @ 2009-02-18 17:00 ` Oleg Nesterov 2009-02-18 18:23 ` Ingo Molnar 1 sibling, 1 reply; 83+ messages in thread From: Oleg Nesterov @ 2009-02-18 17:00 UTC (permalink / raw) To: Ingo Molnar Cc: Patrick McHardy, Peter Zijlstra, Stephen Hemminger, David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel, tglx, Martin Josefsson On 02/18, Ingo Molnar wrote: > > Based on an idea from Stephen Hemminger: introduce > mod_timer_pending() which is a mod_timer() offspring > that is an invariant on already removed timers. This also can be used by workqueues, see http://marc.info/?l=linux-kernel&m=122209752020413 but can't we add another helper? Because, > +static inline int > +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) > { > struct tvec_base *base, *new_base; > unsigned long flags; > - int ret = 0; > + int ret; > + > + ret = 0; > > timer_stats_timer_set_start_info(timer); > BUG_ON(!timer->function); > @@ -614,6 +617,9 @@ int __mod_timer(struct timer_list *timer > if (timer_pending(timer)) { > detach_timer(timer, 0); > ret = 1; > + } else { > + if (pending_only) > + goto out_unlock; This can change the base (CPU) of the pending timer. How about int __update_timer(struct timer_list *timer, unsigned long expires) { struct tvec_base *base; unsigned long flags; int ret = 0; base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { detach_timer(timer, 0); timer->expires = expires; internal_add_timer(base, timer); ret = 1; } spin_unlock_irqrestore(&base->lock, flags); return ret; } ? Unlike __mod_timer(..., bool pending_only), it preserves the CPU on which the timer is pending. Or, perhaps, we can modify __mod_timer() further, static inline int __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) { struct tvec_base *base; unsigned long flags; int ret; ret = 0; timer_stats_timer_set_start_info(timer); BUG_ON(!timer->function); base = lock_timer_base(timer, &flags); if (timer_pending(timer)) { detach_timer(timer, 0); ret = 1; } else if (pending_only) goto out_unlock; debug_timer_activate(timer); if (!pending_only) { struct tvec_base *new_base = __get_cpu_var(tvec_bases); if (base != new_base) { /* * We are trying to schedule the timer on the local CPU. * However we can't change timer's base while it is running, * otherwise del_timer_sync() can't detect that the timer's * handler yet has not finished. This also guarantees that * the timer is serialized wrt itself. */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ timer_set_base(timer, NULL); spin_unlock(&base->lock); base = new_base; spin_lock(&base->lock); timer_set_base(timer, base); } } } timer->expires = expires; internal_add_timer(base, timer); out_unlock: spin_unlock_irqrestore(&base->lock, flags); return ret; } What do you all think? Oleg. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [patch] timers: add mod_timer_pending() 2009-02-18 17:00 ` Oleg Nesterov @ 2009-02-18 18:23 ` Ingo Molnar 2009-02-18 18:58 ` Oleg Nesterov 0 siblings, 1 reply; 83+ messages in thread From: Ingo Molnar @ 2009-02-18 18:23 UTC (permalink / raw) To: Oleg Nesterov Cc: Patrick McHardy, Peter Zijlstra, Stephen Hemminger, David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel, tglx, Martin Josefsson * Oleg Nesterov <oleg@redhat.com> wrote: > On 02/18, Ingo Molnar wrote: > > > > Based on an idea from Stephen Hemminger: introduce > > mod_timer_pending() which is a mod_timer() offspring > > that is an invariant on already removed timers. > > This also can be used by workqueues, see > > http://marc.info/?l=linux-kernel&m=122209752020413 > > but can't we add another helper? Because, > > > +static inline int > > +__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) > > { > > struct tvec_base *base, *new_base; > > unsigned long flags; > > - int ret = 0; > > + int ret; > > + > > + ret = 0; > > > > timer_stats_timer_set_start_info(timer); > > BUG_ON(!timer->function); > > @@ -614,6 +617,9 @@ int __mod_timer(struct timer_list *timer > > if (timer_pending(timer)) { > > detach_timer(timer, 0); > > ret = 1; > > + } else { > > + if (pending_only) > > + goto out_unlock; > > This can change the base (CPU) of the pending timer. > > How about > > int __update_timer(struct timer_list *timer, unsigned long expires) > { > struct tvec_base *base; > unsigned long flags; > int ret = 0; > > base = lock_timer_base(timer, &flags); > if (timer_pending(timer)) { > detach_timer(timer, 0); > timer->expires = expires; > internal_add_timer(base, timer); > ret = 1; > } > spin_unlock_irqrestore(&base->lock, flags); > > return ret; > } > > ? > > Unlike __mod_timer(..., bool pending_only), it preserves the CPU on > which the timer is pending. > > Or, perhaps, we can modify __mod_timer() further, > > static inline int > __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only) > { > struct tvec_base *base; > unsigned long flags; > int ret; > > ret = 0; > > timer_stats_timer_set_start_info(timer); > BUG_ON(!timer->function); > > base = lock_timer_base(timer, &flags); > > if (timer_pending(timer)) { > detach_timer(timer, 0); > ret = 1; > } else if (pending_only) > goto out_unlock; > } > > debug_timer_activate(timer); > > if (!pending_only) { > struct tvec_base *new_base = __get_cpu_var(tvec_bases); > > if (base != new_base) { > /* > * We are trying to schedule the timer on the local CPU. > * However we can't change timer's base while it is running, > * otherwise del_timer_sync() can't detect that the timer's > * handler yet has not finished. This also guarantees that > * the timer is serialized wrt itself. > */ > if (likely(base->running_timer != timer)) { > /* See the comment in lock_timer_base() */ > timer_set_base(timer, NULL); > spin_unlock(&base->lock); > base = new_base; > spin_lock(&base->lock); > timer_set_base(timer, base); > } > } > } > > timer->expires = expires; > internal_add_timer(base, timer); > > out_unlock: > spin_unlock_irqrestore(&base->lock, flags); > > return ret; > } > > What do you all think? if then i'd put it into a separate commit. I think the auto-migration of all the mod_timer() variants is a scalability feature: if for example a networking socket's main user migrates to another CPU, then the timer 'follows' it - even if the timer never actually expires (which is quite common for high-speed high-reliability networking transports). 
By keeping it on the same CPU we'd allow the timer's and the task's affinity to differ. Agreed? Ingo ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [patch] timers: add mod_timer_pending() 2009-02-18 18:23 ` Ingo Molnar @ 2009-02-18 18:58 ` Oleg Nesterov 2009-02-18 19:24 ` Ingo Molnar 0 siblings, 1 reply; 83+ messages in thread From: Oleg Nesterov @ 2009-02-18 18:58 UTC (permalink / raw) To: Ingo Molnar Cc: Patrick McHardy, Peter Zijlstra, Stephen Hemminger, David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel, tglx, Martin Josefsson On 02/18, Ingo Molnar wrote: > > * Oleg Nesterov <oleg@redhat.com> wrote: > > > Unlike __mod_timer(..., bool pending_only), it preserves the CPU on > > which the timer is pending. > > > > Or, perhaps, we can modify __mod_timer() further, > > if then i'd put it into a separate commit. > > I think the auto-migration of all the mod_timer() variants is a > scalability feature: if for example a networking socket's main > user migrates to another CPU, then the timer 'follows' it - even > if the timer never actually expires (which is quite common for > high-speed high-reliability networking transports). OK. But sometimes it is better (or necessary) to prevent the migration. Since you have already changed __mod_timer(), it would be ugly to add yet another helper. Perhaps we should turn "bool pending_only" into "int flags" right now? This is minor, and perhaps we will never need the TIMER_DONT_MIGRATE flag. But if we ever need it, then we have to audit all callers. Oleg. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [patch] timers: add mod_timer_pending() 2009-02-18 18:58 ` Oleg Nesterov @ 2009-02-18 19:24 ` Ingo Molnar 0 siblings, 0 replies; 83+ messages in thread From: Ingo Molnar @ 2009-02-18 19:24 UTC (permalink / raw) To: Oleg Nesterov Cc: Patrick McHardy, Peter Zijlstra, Stephen Hemminger, David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel, tglx, Martin Josefsson * Oleg Nesterov <oleg@redhat.com> wrote: > On 02/18, Ingo Molnar wrote: > > > > * Oleg Nesterov <oleg@redhat.com> wrote: > > > > > Unlike __mod_timer(..., bool pending_only), it preserves the CPU on > > > which the timer is pending. > > > > > > Or, perhaps, we can modify __mod_timer() further, > > > > if then i'd put it into a separate commit. > > > > I think the auto-migration of all the mod_timer() variants is a > > scalability feature: if for example a networking socket's main > > user migrates to another CPU, then the timer 'follows' it - even > > if the timer never actually expires (which is quite common for > > high-speed high-reliability networking transports). > > OK. > > But sometimes it is better (or necessary) to prevent the > migration. Since you have already changed __mod_timer(), it > would be ugly to add yet another helper. Perhaps we should > turn "bool pending_only" into "int flags" right now? > > This is minor, and perhaps we will never need the > TIMER_DONT_MIGRATE flag. But if we ever need it, then we have to > audit all callers. hm, dunno - such unused flags are generally frowned upon, especially if they influence the code flow in a dynamic way. In fact i tried to avoid this flag here too - but __mod_timer() is too small, the flag is used in the middle, and two separate helpers would have made the code look worse. Ingo ^ permalink raw reply [flat|nested] 83+ messages in thread
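For illustration, the "int flags" signature Oleg floats would be on the order of the following; TIMER_PENDING_ONLY and TIMER_DONT_MIGRATE are hypothetical names from this subthread, nothing here was merged:

	/* hypothetical flag bits for __mod_timer() */
	#define TIMER_PENDING_ONLY	0x1	/* don't activate an inactive timer */
	#define TIMER_DONT_MIGRATE	0x2	/* keep the timer on its current base */

	int mod_timer_pending(struct timer_list *timer, unsigned long expires)
	{
		return __mod_timer(timer, expires, TIMER_PENDING_ONLY);
	}

	/* hypothetical: forward a timer without migrating it to the
	 * caller's CPU, preserving its base as Oleg's __update_timer() does */
	int mod_timer_pinned(struct timer_list *timer, unsigned long expires)
	{
		return __mod_timer(timer, expires, TIMER_DONT_MIGRATE);
	}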
* Re: [RFT 2/4] Add mod_timer_noact 2009-02-18 5:19 ` [RFT 2/4] Add mod_timer_noact Stephen Hemminger 2009-02-18 9:20 ` Ingo Molnar @ 2009-02-18 10:29 ` Patrick McHardy 1 sibling, 0 replies; 83+ messages in thread From: Patrick McHardy @ 2009-02-18 10:29 UTC (permalink / raw) To: Stephen Hemminger Cc: David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel, tglx, Martin Josefsson Stephen Hemminger wrote: > +/*** > + * mod_timer_noact - modify a timer's timeout > + * @timer: the timer to be modified > + * > + * mod_timer_noact works like mod_timer except that it doesn't activate an > + * inactive timer, instead it returns without updating timer->expires. > + * > + * The function returns whether it has modified a pending timer or not. > + * (ie. mod_timer_noact() of an inactive timer returns 0, mod_timer_noact() of > + * an active timer returns 1.) > + */ > +int mod_timer_noact(struct timer_list *timer, unsigned long expires) > +{ > + BUG_ON(!timer->function); > + > + /* > + * This is a common optimization triggered by the > + * networking code - if the timer is re-modified > + * to be the same thing then just return: > + */ > + if (timer->expires == expires && timer_pending(timer)) > + return 1; This doesn't seem right: since it uses TIMER_NOACT below, there's no point in checking for timer_pending(), I think. > + > + return __mod_timer(timer, expires, TIMER_NOACT); > +} > + > +EXPORT_SYMBOL(mod_timer_noact); > + ^ permalink raw reply [flat|nested] 83+ messages in thread
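With the fast path dropped as Patrick suggests, mod_timer_noact() reduces to a thin wrapper (a sketch of the suggested fix):

	int mod_timer_noact(struct timer_list *timer, unsigned long expires)
	{
		BUG_ON(!timer->function);

		/* No unserialized timer_pending()/expires fast path here:
		 * with TIMER_NOACT the pending check is redone under
		 * base->lock anyway, and doing it unlocked is racy. */
		return __mod_timer(timer, expires, TIMER_NOACT);
	}
	EXPORT_SYMBOL(mod_timer_noact);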
* [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock 2009-02-18 5:19 [RFT 0/4] Netfilter/iptables performance improvements Stephen Hemminger 2009-02-18 5:19 ` [RFT 1/4] iptables: lock free counters Stephen Hemminger 2009-02-18 5:19 ` [RFT 2/4] Add mod_timer_noact Stephen Hemminger @ 2009-02-18 5:19 ` Stephen Hemminger 2009-02-18 9:54 ` Patrick McHardy ` (2 more replies) 2009-02-18 5:19 ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking Stephen Hemminger 2009-02-18 8:30 ` [RFT 0/4] Netfilter/iptables performance improvements Eric Dumazet 4 siblings, 3 replies; 83+ messages in thread From: Stephen Hemminger @ 2009-02-18 5:19 UTC (permalink / raw) To: David Miller, Patrick McHardy, Rick Jones, Eric Dumazet Cc: netdev, tglx, netfilter-devel, Martin Josefsson [-- Attachment #1: nf_ct_refresh_acct-locking.patch --] [-- Type: text/plain, Size: 1549 bytes --] Now that we are using mod_timer_noact() for timer updates there's no need to hold the global lock during the timer update since the actual timeout update is now protected by the timer locking. Signed-off-by: Martin Josefsson <gandalf@wlug.westbo.se> --- net/netfilter/nf_conntrack_core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) --- a/net/netfilter/nf_conntrack_core.c 2009-02-17 10:55:33.370882059 -0800 +++ b/net/netfilter/nf_conntrack_core.c 2009-02-17 13:48:25.080060712 -0800 @@ -793,13 +793,12 @@ void __nf_ct_refresh_acct(struct nf_conn NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); NF_CT_ASSERT(skb); - spin_lock_bh(&nf_conntrack_lock); - /* Only update if this is not a fixed timeout */ if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) goto acct; - /* If not in hash table, timer will not be active yet */ + /* If not in hash table, timer will not be active yet, + we are the only one able to see it. */ if (!nf_ct_is_confirmed(ct)) { ct->timeout.expires = extra_jiffies; event = IPCT_REFRESH; @@ -821,16 +820,16 @@ acct: if (do_acct) { struct nf_conn_counter *acct; + spin_lock_bh(&nf_conntrack_lock); acct = nf_conn_acct_find(ct); if (acct) { acct[CTINFO2DIR(ctinfo)].packets++; acct[CTINFO2DIR(ctinfo)].bytes += skb->len - skb_network_offset(skb); } + spin_unlock_bh(&nf_conntrack_lock); } - spin_unlock_bh(&nf_conntrack_lock); - /* must be unlocked when calling event cache */ if (event) nf_conntrack_event_cache(event, ct); -- ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock 2009-02-18 5:19 ` [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock Stephen Hemminger @ 2009-02-18 9:54 ` Patrick McHardy 2009-02-18 11:05 ` Jarek Poplawski 2009-02-18 14:01 ` Eric Dumazet 2 siblings, 0 replies; 83+ messages in thread From: Patrick McHardy @ 2009-02-18 9:54 UTC (permalink / raw) To: Stephen Hemminger Cc: David Miller, Rick Jones, Eric Dumazet, netdev, tglx, netfilter-devel, Martin Josefsson Stephen Hemminger wrote: This looks good, thanks for not letting those patches die :) ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock 2009-02-18 5:19 ` [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock Stephen Hemminger 2009-02-18 9:54 ` Patrick McHardy @ 2009-02-18 11:05 ` Jarek Poplawski 2009-02-18 11:08 ` Patrick McHardy 2009-02-18 14:01 ` Eric Dumazet 2 siblings, 1 reply; 83+ messages in thread From: Jarek Poplawski @ 2009-02-18 11:05 UTC (permalink / raw) To: Stephen Hemminger Cc: David Miller, Patrick McHardy, Rick Jones, Eric Dumazet, netdev, tglx, netfilter-devel, Martin Josefsson On 18-02-2009 06:19, Stephen Hemminger wrote: > Now that we are using mod_timer_noact() for timer updates there's no need to Hmm... so where exactly are we using this mod_timer_noact() now? Jarek P. > hold the global lock during the timer update since the actual timeout update > is now protected by the timer locking. > > Signed-off-by: Martin Josefsson <gandalf@wlug.westbo.se> > > --- > net/netfilter/nf_conntrack_core.c | 9 ++++----- > 1 file changed, 4 insertions(+), 5 deletions(-) > > --- a/net/netfilter/nf_conntrack_core.c 2009-02-17 10:55:33.370882059 -0800 > +++ b/net/netfilter/nf_conntrack_core.c 2009-02-17 13:48:25.080060712 -0800 > @@ -793,13 +793,12 @@ void __nf_ct_refresh_acct(struct nf_conn > NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); > NF_CT_ASSERT(skb); > > - spin_lock_bh(&nf_conntrack_lock); > - > /* Only update if this is not a fixed timeout */ > if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) > goto acct; > > - /* If not in hash table, timer will not be active yet */ > + /* If not in hash table, timer will not be active yet, > + we are the only one able to see it. */ > if (!nf_ct_is_confirmed(ct)) { > ct->timeout.expires = extra_jiffies; > event = IPCT_REFRESH; > @@ -821,16 +820,16 @@ acct: > if (do_acct) { > struct nf_conn_counter *acct; > > + spin_lock_bh(&nf_conntrack_lock); > acct = nf_conn_acct_find(ct); > if (acct) { > acct[CTINFO2DIR(ctinfo)].packets++; > acct[CTINFO2DIR(ctinfo)].bytes += > skb->len - skb_network_offset(skb); > } > + spin_unlock_bh(&nf_conntrack_lock); > } > > - spin_unlock_bh(&nf_conntrack_lock); > - > /* must be unlocked when calling event cache */ > if (event) > nf_conntrack_event_cache(event, ct); ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock 2009-02-18 11:05 ` Jarek Poplawski @ 2009-02-18 11:08 ` Patrick McHardy 0 siblings, 0 replies; 83+ messages in thread From: Patrick McHardy @ 2009-02-18 11:08 UTC (permalink / raw) To: Jarek Poplawski Cc: Stephen Hemminger, David Miller, Rick Jones, Eric Dumazet, netdev, tglx, netfilter-devel, Martin Josefsson Jarek Poplawski wrote: > On 18-02-2009 06:19, Stephen Hemminger wrote: >> Now that we are using mod_timer_noact() for timer updates there's no need to > > Hmm... so where exactly are we using this mod_timer_noact() now? Hehe, good point: the conversion to actually use it seems to be missing :) ^ permalink raw reply [flat|nested] 83+ messages in thread
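The missing conversion would sit in __nf_ct_refresh_acct(), roughly as follows (a sketch based on the mod_timer_noact commit message; this hunk was not part of the posted series):

	} else {
		unsigned long newtime = jiffies + extra_jiffies;

		/* Only update the timeout if the new timeout is at least
		 * HZ jiffies from the old one.  mod_timer_noact() leaves
		 * an already-deleted timer alone, so the old locked
		 * del_timer()/add_timer() dance is no longer needed. */
		if (newtime - ct->timeout.expires >= HZ &&
		    mod_timer_noact(&ct->timeout, newtime))
			event = IPCT_REFRESH;
	}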
* Re: [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock 2009-02-18 5:19 ` [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock Stephen Hemminger 2009-02-18 9:54 ` Patrick McHardy 2009-02-18 11:05 ` Jarek Poplawski @ 2009-02-18 14:01 ` Eric Dumazet 2009-02-18 14:04 ` Patrick McHardy 2 siblings, 1 reply; 83+ messages in thread From: Eric Dumazet @ 2009-02-18 14:01 UTC (permalink / raw) To: Stephen Hemminger Cc: David Miller, Patrick McHardy, Rick Jones, netdev, tglx, netfilter-devel, Martin Josefsson Stephen Hemminger wrote: > Now that we are using mod_timer_noact() for timer updates there's no need to > hold the global lock during the timer update since the actual timeout update > is now protected by the timer locking. > > Signed-off-by: Martin Josefsson <gandalf@wlug.westbo.se> > > --- > net/netfilter/nf_conntrack_core.c | 9 ++++----- > 1 file changed, 4 insertions(+), 5 deletions(-) > > --- a/net/netfilter/nf_conntrack_core.c 2009-02-17 10:55:33.370882059 -0800 > +++ b/net/netfilter/nf_conntrack_core.c 2009-02-17 13:48:25.080060712 -0800 > @@ -793,13 +793,12 @@ void __nf_ct_refresh_acct(struct nf_conn > NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); > NF_CT_ASSERT(skb); > > - spin_lock_bh(&nf_conntrack_lock); > - > /* Only update if this is not a fixed timeout */ > if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) > goto acct; > > - /* If not in hash table, timer will not be active yet */ > + /* If not in hash table, timer will not be active yet, > + we are the only one able to see it. */ > if (!nf_ct_is_confirmed(ct)) { > ct->timeout.expires = extra_jiffies; > event = IPCT_REFRESH; > @@ -821,16 +820,16 @@ acct: > if (do_acct) { > struct nf_conn_counter *acct; > > + spin_lock_bh(&nf_conntrack_lock); > acct = nf_conn_acct_find(ct); > if (acct) { > acct[CTINFO2DIR(ctinfo)].packets++; > acct[CTINFO2DIR(ctinfo)].bytes += > skb->len - skb_network_offset(skb); > } > + spin_unlock_bh(&nf_conntrack_lock); > } > > - spin_unlock_bh(&nf_conntrack_lock); > - > /* must be unlocked when calling event cache */ > if (event) > nf_conntrack_event_cache(event, ct); > Unfortunately, this patch changes nothing, as most of the time, do_acct is true. We need fine-grained locking for the accounting part as well. spin_lock_bh(&ct->some_lock); acct = nf_conn_acct_find(ct); if (acct) { acct[CTINFO2DIR(ctinfo)].packets++; acct[CTINFO2DIR(ctinfo)].bytes += skb->len - skb_network_offset(skb); } spin_unlock_bh(&ct->some_lock); ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock 2009-02-18 14:01 ` Eric Dumazet @ 2009-02-18 14:04 ` Patrick McHardy 2009-02-18 14:22 ` Eric Dumazet 0 siblings, 1 reply; 83+ messages in thread From: Patrick McHardy @ 2009-02-18 14:04 UTC (permalink / raw) To: Eric Dumazet Cc: Stephen Hemminger, David Miller, Rick Jones, netdev, tglx, netfilter-devel, Martin Josefsson Eric Dumazet wrote: > Unfortunately, this patch changes nothing, as most of the time, do_acct is true. > > We need fine-grained locking for the accounting part as well. > > spin_lock_bh(&ct->some_lock); > acct = nf_conn_acct_find(ct); > if (acct) { > acct[CTINFO2DIR(ctinfo)].packets++; > acct[CTINFO2DIR(ctinfo)].bytes += > skb->len - skb_network_offset(skb); > } > spin_unlock_bh(&ct->some_lock); > Its currently still enabled by default, but we intend to change that. After that I guess almost nobody will have it enabled. ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock 2009-02-18 14:04 ` Patrick McHardy @ 2009-02-18 14:22 ` Eric Dumazet 2009-02-18 14:27 ` Patrick McHardy 0 siblings, 1 reply; 83+ messages in thread From: Eric Dumazet @ 2009-02-18 14:22 UTC (permalink / raw) To: Patrick McHardy Cc: Stephen Hemminger, David Miller, Rick Jones, netdev, tglx, netfilter-devel, Martin Josefsson Patrick McHardy wrote: > Eric Dumazet wrote: >> Unfortunately, this patch changes nothing, as most of the time, do_acct >> is true. >> >> We need fine-grained locking for the accounting part as well. >> >> spin_lock_bh(&ct->some_lock); >> acct = nf_conn_acct_find(ct); >> if (acct) { >> acct[CTINFO2DIR(ctinfo)].packets++; >> acct[CTINFO2DIR(ctinfo)].bytes += >> skb->len - skb_network_offset(skb); >> } >> spin_unlock_bh(&ct->some_lock); >> > > Its currently still enabled by default, but we intend to change that. > After that I guess almost nobody will have it enabled. > > Really? I find this accounting stuff really useful and always enable it :) ^ permalink raw reply [flat|nested] 83+ messages in thread
* Re: [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock 2009-02-18 14:22 ` Eric Dumazet @ 2009-02-18 14:27 ` Patrick McHardy 0 siblings, 0 replies; 83+ messages in thread From: Patrick McHardy @ 2009-02-18 14:27 UTC (permalink / raw) To: Eric Dumazet Cc: Stephen Hemminger, David Miller, Rick Jones, netdev, tglx, netfilter-devel, Martin Josefsson Eric Dumazet wrote: > Patrick McHardy wrote: >> Eric Dumazet wrote: >>> Unfortunately, this patch changes nothing, as most of the time, do_acct >>> is true. >>> >>> We need fine-grained locking for the accounting part as well. >>> >>> spin_lock_bh(&ct->some_lock); >>> acct = nf_conn_acct_find(ct); >>> if (acct) { >>> acct[CTINFO2DIR(ctinfo)].packets++; >>> acct[CTINFO2DIR(ctinfo)].bytes += >>> skb->len - skb_network_offset(skb); >>> } >>> spin_unlock_bh(&ct->some_lock); >>> >> Its currently still enabled by default, but we intend to change that. >> After that I guess almost nobody will have it enabled. >> > Really? I find this accounting stuff really useful and always enable it :) You usually need extra userspace daemons to make something useful out of the data and I doubt many people are running them. It doesn't hurt to optimize it anyway, of course :) But I'm somewhat doubtful that we're actually seeing lock contention here. One thing we could do with your lock hash change is to perform the counter updates while holding those locks; that avoids taking a different lock just for the counters. The only reason why it's done in nf_ct_refresh is that it was already taking the conntrack lock, but if that's no longer the case, there's no reason to keep it there. ^ permalink raw reply [flat|nested] 83+ messages in thread
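A sketch of what Patrick suggests, assuming Eric's hashed-lock change lands (nf_conntrack_hash_lock() is an illustrative name for looking up the per-bucket lock, not an existing helper):

	/* Update accounting under the per-bucket conntrack lock instead
	 * of a separate lock taken just for the counters. */
	static void ct_account(struct nf_conn *ct,
			       enum ip_conntrack_info ctinfo,
			       const struct sk_buff *skb)
	{
		spinlock_t *lock = nf_conntrack_hash_lock(ct); /* hypothetical */
		struct nf_conn_counter *acct;

		spin_lock_bh(lock);
		acct = nf_conn_acct_find(ct);
		if (acct) {
			acct[CTINFO2DIR(ctinfo)].packets++;
			acct[CTINFO2DIR(ctinfo)].bytes +=
				skb->len - skb_network_offset(skb);
		}
		spin_unlock_bh(lock);
	}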
* [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking 2009-02-18 5:19 [RFT 0/4] Netfilter/iptables performance improvements Stephen Hemminger ` (2 preceding siblings ...) 2009-02-18 5:19 ` [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock Stephen Hemminger @ 2009-02-18 5:19 ` Stephen Hemminger 2009-02-18 9:56 ` Patrick McHardy 2009-02-18 8:30 ` [RFT 0/4] Netfilter/iptables performance improvements Eric Dumazet 4 siblings, 1 reply; 83+ messages in thread From: Stephen Hemminger @ 2009-02-18 5:19 UTC (permalink / raw) To: David Miller, Patrick McHardy, Rick Jones, Eric Dumazet Cc: netdev, netfilter-devel [-- Attachment #1: tcp_conntrack_lock.patch --] [-- Type: text/plain, Size: 9853 bytes --] TCP connection tracking suffers of huge contention on a global rwlock, is used for protecting the tcp conntracking state. As each tcp conntrack state have no relations between each others, we can switch to fine grained lock, using a spinlock per "struct ip_ct_tcp" tcp_print_conntrack() dont need to lock anything to read ct->proto.tcp.state, so speedup /proc/net/ip_conntrack as well. Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Reported-by: Rick Jones <rick.jones2@hp.com> --- include/linux/netfilter/nf_conntrack_tcp.h | 1 include/net/netfilter/nf_conntrack_helper.h | 2 - include/net/netfilter/nf_conntrack_l4proto.h | 3 -- net/netfilter/nf_conntrack_netlink.c | 6 ++-- net/netfilter/nf_conntrack_proto_dccp.c | 2 - net/netfilter/nf_conntrack_proto_sctp.c | 2 - net/netfilter/nf_conntrack_proto_tcp.c | 37 ++++++++++++--------------- 7 files changed, 25 insertions(+), 28 deletions(-) --- a/include/linux/netfilter/nf_conntrack_tcp.h 2009-02-17 11:07:16.884086452 -0800 +++ b/include/linux/netfilter/nf_conntrack_tcp.h 2009-02-17 11:07:31.643846743 -0800 @@ -50,6 +50,7 @@ struct ip_ct_tcp_state { struct ip_ct_tcp { + spinlock_t lock; struct ip_ct_tcp_state seen[2]; /* connection parameters per direction */ u_int8_t state; /* state of the connection (enum tcp_conntrack) */ /* For detecting stale connections */ --- a/net/netfilter/nf_conntrack_proto_tcp.c 2009-02-17 11:07:16.870763455 -0800 +++ b/net/netfilter/nf_conntrack_proto_tcp.c 2009-02-17 11:21:57.528485882 -0800 @@ -26,9 +26,6 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_log.h> -/* Protects ct->proto.tcp */ -static DEFINE_RWLOCK(tcp_lock); - /* "Be conservative in what you do, be liberal in what you accept from others." If it's non-zero, we mark only out of window RST segments as INVALID. */ @@ -297,9 +294,7 @@ static int tcp_print_conntrack(struct se { enum tcp_conntrack state; - read_lock_bh(&tcp_lock); state = ct->proto.tcp.state; - read_unlock_bh(&tcp_lock); return seq_printf(s, "%s ", tcp_conntrack_names[state]); } @@ -705,14 +700,15 @@ void nf_conntrack_tcp_update(const struc end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); - write_lock_bh(&tcp_lock); + spin_lock_bh(&ct->proto.tcp.lock); /* * We have to worry for the ack in the reply packet only... 
*/ if (after(end, ct->proto.tcp.seen[dir].td_end)) ct->proto.tcp.seen[dir].td_end = end; ct->proto.tcp.last_end = end; - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->proto.tcp.lock); + pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, @@ -821,7 +817,7 @@ static int tcp_packet(struct nf_conn *ct th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); - write_lock_bh(&tcp_lock); + spin_lock_bh(&ct->proto.tcp.lock); old_state = ct->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); @@ -851,7 +847,7 @@ static int tcp_packet(struct nf_conn *ct && ct->proto.tcp.last_index == TCP_RST_SET)) { /* Attempt to reopen a closed/aborted connection. * Delete this connection and look up again. */ - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->proto.tcp.lock); /* Only repeat if we can actually remove the timer. * Destruction may already be in progress in process @@ -887,7 +883,7 @@ static int tcp_packet(struct nf_conn *ct * that the client cannot but retransmit its SYN and * thus initiate a clean new session. */ - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->proto.tcp.lock); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: killing out of sync session "); @@ -900,7 +896,7 @@ static int tcp_packet(struct nf_conn *ct ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->proto.tcp.lock); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid packet ignored "); @@ -909,7 +905,7 @@ static int tcp_packet(struct nf_conn *ct /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->proto.tcp.lock); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); @@ -940,7 +936,7 @@ static int tcp_packet(struct nf_conn *ct if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, skb, dataoff, th, pf)) { - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->proto.tcp.lock); return -NF_ACCEPT; } in_window: @@ -969,7 +965,7 @@ static int tcp_packet(struct nf_conn *ct timeout = nf_ct_tcp_timeout_unacknowledged; else timeout = tcp_timeouts[new_state]; - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->proto.tcp.lock); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct); if (new_state != old_state) @@ -1022,6 +1018,7 @@ static bool tcp_new(struct nf_conn *ct, pr_debug("nf_ct_tcp: invalid new deleting.\n"); return false; } + spin_lock_init(&ct->proto.tcp.lock); if (new_state == TCP_CONNTRACK_SYN_SENT) { /* SYN packet */ @@ -1087,12 +1084,12 @@ static bool tcp_new(struct nf_conn *ct, #include <linux/netfilter/nfnetlink_conntrack.h> static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; - read_lock_bh(&tcp_lock); + spin_lock_bh(&ct->proto.tcp.lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; @@ -1112,14 +1109,14 @@ static int tcp_to_nlattr(struct sk_buff tmp.flags = ct->proto.tcp.seen[1].flags; NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp); - read_unlock_bh(&tcp_lock); + 
spin_unlock_bh(&ct->proto.tcp.lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: - read_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->proto.tcp.lock); return -1; } @@ -1150,7 +1147,7 @@ static int nlattr_to_tcp(struct nlattr * nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) return -EINVAL; - write_lock_bh(&tcp_lock); + spin_lock_bh(&ct->proto.tcp.lock); if (tb[CTA_PROTOINFO_TCP_STATE]) ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); @@ -1177,7 +1174,7 @@ static int nlattr_to_tcp(struct nlattr * ct->proto.tcp.seen[1].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->proto.tcp.lock); return 0; } --- a/include/net/netfilter/nf_conntrack_helper.h 2009-02-17 11:21:31.207534629 -0800 +++ b/include/net/netfilter/nf_conntrack_helper.h 2009-02-17 11:21:40.928500800 -0800 @@ -34,7 +34,7 @@ struct nf_conntrack_helper void (*destroy)(struct nf_conn *ct); - int (*to_nlattr)(struct sk_buff *skb, const struct nf_conn *ct); + int (*to_nlattr)(struct sk_buff *skb, struct nf_conn *ct); unsigned int expect_class_max; }; --- a/include/net/netfilter/nf_conntrack_l4proto.h 2009-02-17 11:31:28.323255150 -0800 +++ b/include/net/netfilter/nf_conntrack_l4proto.h 2009-02-17 11:31:45.175004794 -0800 @@ -62,8 +62,7 @@ struct nf_conntrack_l4proto int (*print_conntrack)(struct seq_file *s, const struct nf_conn *); /* convert protoinfo to nfnetink attributes */ - int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct); + int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct); /* convert nfnetlink attributes to protoinfo */ int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct); --- a/net/netfilter/nf_conntrack_netlink.c 2009-02-17 11:22:33.636503637 -0800 +++ b/net/netfilter/nf_conntrack_netlink.c 2009-02-17 11:33:09.030758630 -0800 @@ -143,7 +143,7 @@ nla_put_failure: } static inline int -ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; @@ -168,7 +168,7 @@ nla_put_failure: } static inline int -ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_helpinfo(struct sk_buff *skb, struct nf_conn *ct) { struct nlattr *nest_helper; const struct nf_conn_help *help = nfct_help(ct); @@ -350,7 +350,7 @@ nla_put_failure: static int ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; --- a/net/netfilter/nf_conntrack_proto_dccp.c 2009-02-17 11:22:33.726792709 -0800 +++ b/net/netfilter/nf_conntrack_proto_dccp.c 2009-02-17 11:32:33.262772938 -0800 @@ -612,7 +612,7 @@ static int dccp_print_conntrack(struct s #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlattr *nest_parms; --- a/net/netfilter/nf_conntrack_proto_sctp.c 2009-02-17 11:22:33.824548630 -0800 +++ b/net/netfilter/nf_conntrack_proto_sctp.c 2009-02-17 11:32:44.047257293 -0800 @@ -469,7 +469,7 @@ static bool sctp_new(struct nf_conn *ct, #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlattr *nest_parms; -- ^ permalink raw reply 
[flat|nested] 83+ messages in thread
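To see why the conversion in this patch (the global tcp_lock rwlock
replaced by a spinlock inside each "struct ip_ct_tcp", initialized in
tcp_new()) buys anything, it helps to strip the idea down. Below is a
toy userspace sketch, with pthread locks standing in for kernel ones and
all names made up: two flows, two locks, so the two workers never
contend, whereas under one central lock every update from every CPU
would serialize on the same cache line:

/* Build with: cc -O2 perconn_sketch.c -o perconn_sketch -lpthread */
#include <pthread.h>
#include <stdio.h>

struct conn {
	pthread_spinlock_t lock;	/* per-connection, as in this patch */
	int state;			/* stands in for the TCP state machine */
};

static void conn_init(struct conn *ct)
{
	pthread_spin_init(&ct->lock, PTHREAD_PROCESS_PRIVATE);
	ct->state = 0;
}

/* Packet path: lock only the connection actually being updated. */
static void conn_update(struct conn *ct)
{
	pthread_spin_lock(&ct->lock);
	ct->state++;
	pthread_spin_unlock(&ct->lock);
}

static void *worker(void *arg)
{
	for (int i = 0; i < 1000000; i++)
		conn_update(arg);
	return NULL;
}

int main(void)
{
	struct conn a, b;
	pthread_t t1, t2;

	conn_init(&a);
	conn_init(&b);
	pthread_create(&t1, NULL, worker, &a);	/* flow 1 on one CPU */
	pthread_create(&t2, NULL, worker, &b);	/* flow 2 on another */
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	printf("a.state=%d b.state=%d\n", a.state, b.state);
	return 0;
}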
* Re: [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking 2009-02-18 5:19 ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking Stephen Hemminger @ 2009-02-18 9:56 ` Patrick McHardy 2009-02-18 14:17 ` Eric Dumazet 2009-02-18 21:55 ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking David Miller 0 siblings, 2 replies; 83+ messages in thread From: Patrick McHardy @ 2009-02-18 9:56 UTC (permalink / raw) To: Stephen Hemminger Cc: David Miller, Rick Jones, Eric Dumazet, netdev, netfilter-devel Stephen Hemminger wrote: > @@ -50,6 +50,7 @@ struct ip_ct_tcp_state { > > struct ip_ct_tcp > { > + spinlock_t lock; > struct ip_ct_tcp_state seen[2]; /* connection parameters per direction */ > u_int8_t state; /* state of the connection (enum tcp_conntrack) */ > /* For detecting stale connections */ Eric already posted a patch to use an array of locks, which is a better approach IMO since it keeps the size of the conntrack entries down. ^ permalink raw reply [flat|nested] 83+ messages in thread
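Patrick's size argument, made concrete: an embedded lock grows every
single entry, while an external array is a fixed cost no matter how many
connections are tracked. A quick userspace sketch follows; the fields
are stand-ins, not the real ip_ct_tcp layout, and glibc's
pthread_spinlock_t is 4 bytes, whereas a kernel spinlock_t with
lockdep/debugging enabled can be an order of magnitude larger, which is
where the concern really bites:

#include <pthread.h>
#include <stdio.h>

/* A rough stand-in for per-connection TCP tracking state. */
struct tcp_track_plain {
	char seen[2][24];		/* per-direction window state */
	unsigned char state;
};

struct tcp_track_embedded {
	pthread_spinlock_t lock;	/* the proposed per-entry lock */
	char seen[2][24];
	unsigned char state;
};

int main(void)
{
	printf("per-entry state, no lock:  %zu bytes\n",
	       sizeof(struct tcp_track_plain));
	printf("per-entry state with lock: %zu bytes\n",
	       sizeof(struct tcp_track_embedded));
	/* With N tracked connections, the embedded lock costs
	 * N * (difference) bytes; a hashed array costs a constant. */
	return 0;
}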
* Re: [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking 2009-02-18 9:56 ` Patrick McHardy @ 2009-02-18 14:17 ` Eric Dumazet 2009-02-19 22:03 ` Stephen Hemminger 2009-03-28 16:55 ` [PATCH] netfilter: finer grained nf_conn locking Eric Dumazet 2009-02-18 21:55 ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking David Miller 1 sibling, 2 replies; 83+ messages in thread From: Eric Dumazet @ 2009-02-18 14:17 UTC (permalink / raw) To: Patrick McHardy Cc: Stephen Hemminger, David Miller, Rick Jones, netdev, netfilter-devel Patrick McHardy a écrit : > Stephen Hemminger wrote: > >> @@ -50,6 +50,7 @@ struct ip_ct_tcp_state { >> >> struct ip_ct_tcp >> { >> + spinlock_t lock; >> struct ip_ct_tcp_state seen[2]; /* connection parameters per >> direction */ >> u_int8_t state; /* state of the connection (enum >> tcp_conntrack) */ >> /* For detecting stale connections */ > > Eric already posted a patch to use an array of locks, which is > a better approach IMO since it keeps the size of the conntrack > entries down. Yes, we probably can use an array for short lived lock sections. The extra cost to compute the lock address is quite small, if the array size is known at compile time. Stephen we might also use this same array to protect the nf_conn_acct_find(ct) thing as well (I am referring to your RFT 3/4 patch : Use mod_timer_noact to remove nf_conntrack_lock) Here is a repost of patch Patrick is referring to : [PATCH] netfilter: Get rid of central rwlock in tcp conntracking TCP connection tracking suffers of huge contention on a global rwlock, used to protect tcp conntracking state. As each tcp conntrack state have no relations between each others, we can switch to fine grained lock. Using an array of spinlocks avoids enlarging size of connection tracking structures, yet giving reasonable fanout. tcp_print_conntrack() doesnt need to lock anything to read ct->proto.tcp.state, so speedup /proc/net/ip_conntrack as well. nf_conntrack_hash_rnd_initted & nf_conntrack_hash_rnd declared read_mostly Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Reported-by: Rick Jones <rick.jones2@hp.com> --- include/net/netfilter/nf_conntrack.h | 32 +++++++++++++++++++++ net/netfilter/nf_conntrack_core.c | 10 ++++-- net/netfilter/nf_conntrack_proto_tcp.c | 34 ++++++++++------------- 3 files changed, 53 insertions(+), 23 deletions(-) diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h index 2e0c536..288aff5 100644 --- a/include/net/netfilter/nf_conntrack.h +++ b/include/net/netfilter/nf_conntrack.h @@ -129,6 +129,38 @@ struct nf_conn struct rcu_head rcu; }; +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ + defined(CONFIG_PROVE_LOCKING) + +/* + * We reserve 16 locks per cpu (typical cache line size is 64 bytes) + * maxed to 512 locks so that ct_hlock[] uses at most 2048 bytes of memory. 
+ * (on lockdep we have a quite big spinlock_t, so keep the size down there) + */ +#ifdef CONFIG_LOCKDEP +#define CT_HASH_LOCK_SZ 64 +#elif NR_CPUS >= 32 +#define CT_HASH_LOCK_SZ 512 +#else +#define CT_HASH_LOCK_SZ (roundup_pow_of_two(NR_CPUS) * 16) +#endif + +extern spinlock_t ct_hlock[CT_HASH_LOCK_SZ]; + +#else +#define CT_HASH_LOCK_SZ 0 +#endif +static inline spinlock_t *ct_lock_addr(const struct nf_conn *ct) +{ + if (CT_HASH_LOCK_SZ) { + unsigned long hash = (unsigned long)ct; + hash ^= hash >> 16; + hash ^= hash >> 8; + return &ct_hlock[hash % CT_HASH_LOCK_SZ]; + } + return NULL; +} + static inline struct nf_conn * nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash) { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 90ce9dd..68822d8 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -61,9 +61,9 @@ struct nf_conn nf_conntrack_untracked __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_untracked); static struct kmem_cache *nf_conntrack_cachep __read_mostly; - -static int nf_conntrack_hash_rnd_initted; -static unsigned int nf_conntrack_hash_rnd; +spinlock_t ct_hlock[CT_HASH_LOCK_SZ]; +static int nf_conntrack_hash_rnd_initted __read_mostly; +static unsigned int nf_conntrack_hash_rnd __read_mostly; static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, unsigned int size, unsigned int rnd) @@ -1141,7 +1141,7 @@ module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, static int nf_conntrack_init_init_net(void) { int max_factor = 8; - int ret; + int i, ret; /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ @@ -1174,6 +1174,8 @@ static int nf_conntrack_init_init_net(void) ret = -ENOMEM; goto err_cache; } + for (i = 0; i < CT_HASH_LOCK_SZ; i++) + spin_lock_init(&ct_hlock[i]); ret = nf_conntrack_proto_init(); if (ret < 0) diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index a1edb9c..247e82f 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -26,9 +26,6 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_log.h> -/* Protects ct->proto.tcp */ -static DEFINE_RWLOCK(tcp_lock); - /* "Be conservative in what you do, be liberal in what you accept from others." If it's non-zero, we mark only out of window RST segments as INVALID. */ @@ -297,9 +294,7 @@ static int tcp_print_conntrack(struct seq_file *s, const struct nf_conn *ct) { enum tcp_conntrack state; - read_lock_bh(&tcp_lock); state = ct->proto.tcp.state; - read_unlock_bh(&tcp_lock); return seq_printf(s, "%s ", tcp_conntrack_names[state]); } @@ -705,14 +700,14 @@ void nf_conntrack_tcp_update(const struct sk_buff *skb, end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); - write_lock_bh(&tcp_lock); + spin_lock_bh(ct_lock_addr(ct)); /* * We have to worry for the ack in the reply packet only... 
*/ if (after(end, ct->proto.tcp.seen[dir].td_end)) ct->proto.tcp.seen[dir].td_end = end; ct->proto.tcp.last_end = end; - write_unlock_bh(&tcp_lock); + spin_unlock_bh(ct_lock_addr(ct)); pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, @@ -821,7 +816,7 @@ static int tcp_packet(struct nf_conn *ct, th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); - write_lock_bh(&tcp_lock); + spin_lock_bh(ct_lock_addr(ct)); old_state = ct->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); @@ -851,7 +846,7 @@ static int tcp_packet(struct nf_conn *ct, && ct->proto.tcp.last_index == TCP_RST_SET)) { /* Attempt to reopen a closed/aborted connection. * Delete this connection and look up again. */ - write_unlock_bh(&tcp_lock); + spin_unlock_bh(ct_lock_addr(ct)); /* Only repeat if we can actually remove the timer. * Destruction may already be in progress in process @@ -887,7 +882,7 @@ static int tcp_packet(struct nf_conn *ct, * that the client cannot but retransmit its SYN and * thus initiate a clean new session. */ - write_unlock_bh(&tcp_lock); + spin_unlock_bh(ct_lock_addr(ct)); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: killing out of sync session "); @@ -900,7 +895,7 @@ static int tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - write_unlock_bh(&tcp_lock); + spin_unlock_bh(ct_lock_addr(ct)); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid packet ignored "); @@ -909,7 +904,7 @@ static int tcp_packet(struct nf_conn *ct, /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); - write_unlock_bh(&tcp_lock); + spin_unlock_bh(ct_lock_addr(ct)); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); @@ -940,7 +935,7 @@ static int tcp_packet(struct nf_conn *ct, if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, skb, dataoff, th, pf)) { - write_unlock_bh(&tcp_lock); + spin_unlock_bh(ct_lock_addr(ct)); return -NF_ACCEPT; } in_window: @@ -969,7 +964,7 @@ static int tcp_packet(struct nf_conn *ct, timeout = nf_ct_tcp_timeout_unacknowledged; else timeout = tcp_timeouts[new_state]; - write_unlock_bh(&tcp_lock); + spin_unlock_bh(ct_lock_addr(ct)); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct); if (new_state != old_state) @@ -1022,6 +1017,7 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, pr_debug("nf_ct_tcp: invalid new deleting.\n"); return false; } + spin_lock_init(ct_lock_addr(ct)); if (new_state == TCP_CONNTRACK_SYN_SENT) { /* SYN packet */ @@ -1092,7 +1088,7 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; - read_lock_bh(&tcp_lock); + spin_lock_bh(ct_lock_addr(ct)); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; @@ -1112,14 +1108,14 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, tmp.flags = ct->proto.tcp.seen[1].flags; NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp); - read_unlock_bh(&tcp_lock); + spin_unlock_bh(ct_lock_addr(ct)); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: - read_unlock_bh(&tcp_lock); + 
spin_unlock_bh(ct_lock_addr(ct)); return -1; } @@ -1150,7 +1146,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) return -EINVAL; - write_lock_bh(&tcp_lock); + spin_lock_bh(ct_lock_addr(ct)); if (tb[CTA_PROTOINFO_TCP_STATE]) ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); @@ -1177,7 +1173,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) ct->proto.tcp.seen[1].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } - write_unlock_bh(&tcp_lock); + spin_unlock_bh(ct_lock_addr(ct)); return 0; } ^ permalink raw reply related [flat|nested] 83+ messages in thread
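The heart of the repost is ct_lock_addr(): the conntrack's own address
is folded into an index, so the lock costs no per-object storage and a
given object always maps to the same lock. A self-contained userspace
sketch of exactly that hashing, with the array size fixed and the names
invented for the demo:

/* Build with: cc -O2 lockarray_sketch.c -o lockarray_sketch -lpthread */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

#define HASH_LOCK_SZ 256	/* power of two, as with CT_HASH_LOCK_SZ */

static pthread_spinlock_t hlock[HASH_LOCK_SZ];

/* Same folding as ct_lock_addr(): slab objects are aligned, so the
 * low address bits carry little entropy; XOR higher bits down. */
static pthread_spinlock_t *lock_addr(const void *obj)
{
	uintptr_t hash = (uintptr_t)obj;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	return &hlock[hash % HASH_LOCK_SZ];
}

int main(void)
{
	int objs[4];

	for (int i = 0; i < HASH_LOCK_SZ; i++)
		pthread_spin_init(&hlock[i], PTHREAD_PROCESS_PRIVATE);

	for (int i = 0; i < 4; i++) {
		pthread_spinlock_t *l = lock_addr(&objs[i]);

		pthread_spin_lock(l);
		/* ...update the object under its hashed lock... */
		pthread_spin_unlock(l);
		printf("obj %d -> lock slot %ld\n", i, (long)(l - hlock));
	}
	return 0;
}

Two different objects may hash to the same slot; that only means they
briefly share a lock, which stays safe here because each critical
section takes exactly one slot lock, so there is no nesting and no lock
ordering to get wrong.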
* Re: [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking 2009-02-18 14:17 ` Eric Dumazet @ 2009-02-19 22:03 ` Stephen Hemminger 2009-03-28 16:55 ` [PATCH] netfilter: finer grained nf_conn locking Eric Dumazet 1 sibling, 0 replies; 83+ messages in thread From: Stephen Hemminger @ 2009-02-19 22:03 UTC (permalink / raw) To: Eric Dumazet, Patrick McHardy, David Miller Cc: Rick Jones, netdev, netfilter-devel TCP connection tracking suffers of huge contention on a global rwlock, used to protect tcp conntracking state. As each tcp conntrack state have no relations between each others, we can switch to fine grained lock, using a spinlock per "struct ip_ct_tcp" tcp_print_conntrack() dont need to lock anything to read ct->proto.tcp.state, so speedup /proc/net/ip_conntrack as well. In this version the lock is placed in a 4 byte whole in the nf_conntrack structure. This means no size change, and same method can later be used for UDP, SCTP, DCCP conntrack. Signed-off-by: Stephen Hemminger <shemminger@vyatta.com> Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Reported-by: Rick Jones <rick.jones2@hp.com> --- include/linux/skbuff.h | 1 include/net/netfilter/nf_conntrack_helper.h | 2 - include/net/netfilter/nf_conntrack_l4proto.h | 3 -- net/netfilter/nf_conntrack_core.c | 1 net/netfilter/nf_conntrack_netlink.c | 6 ++-- net/netfilter/nf_conntrack_proto_dccp.c | 2 - net/netfilter/nf_conntrack_proto_sctp.c | 2 - net/netfilter/nf_conntrack_proto_tcp.c | 37 ++++++++++++--------------- 8 files changed, 26 insertions(+), 28 deletions(-) --- a/include/net/netfilter/nf_conntrack_helper.h 2009-02-19 13:45:26.103408544 -0800 +++ b/include/net/netfilter/nf_conntrack_helper.h 2009-02-19 13:45:56.136167400 -0800 @@ -34,7 +34,7 @@ struct nf_conntrack_helper void (*destroy)(struct nf_conn *ct); - int (*to_nlattr)(struct sk_buff *skb, const struct nf_conn *ct); + int (*to_nlattr)(struct sk_buff *skb, struct nf_conn *ct); unsigned int expect_class_max; }; --- a/include/net/netfilter/nf_conntrack_l4proto.h 2009-02-19 13:45:26.103408544 -0800 +++ b/include/net/netfilter/nf_conntrack_l4proto.h 2009-02-19 13:45:56.136167400 -0800 @@ -62,8 +62,7 @@ struct nf_conntrack_l4proto int (*print_conntrack)(struct seq_file *s, const struct nf_conn *); /* convert protoinfo to nfnetink attributes */ - int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct); + int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, struct nf_conn *ct); /* convert nfnetlink attributes to protoinfo */ int (*from_nlattr)(struct nlattr *tb[], struct nf_conn *ct); --- a/net/netfilter/nf_conntrack_core.c 2009-02-19 13:42:48.316883082 -0800 +++ b/net/netfilter/nf_conntrack_core.c 2009-02-19 13:58:59.952707711 -0800 @@ -499,6 +499,7 @@ struct nf_conn *nf_conntrack_alloc(struc return ERR_PTR(-ENOMEM); } + spin_lock_init(&ct->ct_general.lock); atomic_set(&ct->ct_general.use, 1); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; --- a/net/netfilter/nf_conntrack_netlink.c 2009-02-19 13:45:26.103408544 -0800 +++ b/net/netfilter/nf_conntrack_netlink.c 2009-02-19 13:45:56.136167400 -0800 @@ -143,7 +143,7 @@ nla_put_failure: } static inline int -ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; @@ -168,7 +168,7 @@ nla_put_failure: } static inline int -ctnetlink_dump_helpinfo(struct sk_buff *skb, const struct nf_conn *ct) 
+ctnetlink_dump_helpinfo(struct sk_buff *skb, struct nf_conn *ct) { struct nlattr *nest_helper; const struct nf_conn_help *help = nfct_help(ct); @@ -350,7 +350,7 @@ nla_put_failure: static int ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; --- a/net/netfilter/nf_conntrack_proto_dccp.c 2009-02-19 13:45:26.103408544 -0800 +++ b/net/netfilter/nf_conntrack_proto_dccp.c 2009-02-19 13:45:56.136167400 -0800 @@ -612,7 +612,7 @@ static int dccp_print_conntrack(struct s #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) static int dccp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlattr *nest_parms; --- a/net/netfilter/nf_conntrack_proto_sctp.c 2009-02-19 13:45:26.103408544 -0800 +++ b/net/netfilter/nf_conntrack_proto_sctp.c 2009-02-19 13:45:56.136167400 -0800 @@ -469,7 +469,7 @@ static bool sctp_new(struct nf_conn *ct, #include <linux/netfilter/nfnetlink_conntrack.h> static int sctp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlattr *nest_parms; --- a/net/netfilter/nf_conntrack_proto_tcp.c 2009-02-19 13:45:26.103408544 -0800 +++ b/net/netfilter/nf_conntrack_proto_tcp.c 2009-02-19 13:59:58.025139232 -0800 @@ -26,9 +26,6 @@ #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_log.h> -/* Protects ct->proto.tcp */ -static DEFINE_RWLOCK(tcp_lock); - /* "Be conservative in what you do, be liberal in what you accept from others." If it's non-zero, we mark only out of window RST segments as INVALID. */ @@ -297,9 +294,7 @@ static int tcp_print_conntrack(struct se { enum tcp_conntrack state; - read_lock_bh(&tcp_lock); state = ct->proto.tcp.state; - read_unlock_bh(&tcp_lock); return seq_printf(s, "%s ", tcp_conntrack_names[state]); } @@ -705,14 +700,15 @@ void nf_conntrack_tcp_update(const struc end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); - write_lock_bh(&tcp_lock); + spin_lock_bh(&ct->ct_general.lock); /* * We have to worry for the ack in the reply packet only... */ if (after(end, ct->proto.tcp.seen[dir].td_end)) ct->proto.tcp.seen[dir].td_end = end; ct->proto.tcp.last_end = end; - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->ct_general.lock); + pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, @@ -821,7 +817,7 @@ static int tcp_packet(struct nf_conn *ct th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); - write_lock_bh(&tcp_lock); + spin_lock_bh(&ct->ct_general.lock); old_state = ct->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); @@ -851,7 +847,7 @@ static int tcp_packet(struct nf_conn *ct && ct->proto.tcp.last_index == TCP_RST_SET)) { /* Attempt to reopen a closed/aborted connection. * Delete this connection and look up again. */ - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->ct_general.lock); /* Only repeat if we can actually remove the timer. * Destruction may already be in progress in process @@ -887,7 +883,7 @@ static int tcp_packet(struct nf_conn *ct * that the client cannot but retransmit its SYN and * thus initiate a clean new session. 
*/ - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->ct_general.lock); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: killing out of sync session "); @@ -900,7 +896,7 @@ static int tcp_packet(struct nf_conn *ct ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->ct_general.lock); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid packet ignored "); @@ -909,7 +905,7 @@ static int tcp_packet(struct nf_conn *ct /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->ct_general.lock); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); @@ -940,7 +936,7 @@ static int tcp_packet(struct nf_conn *ct if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, skb, dataoff, th, pf)) { - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->ct_general.lock); return -NF_ACCEPT; } in_window: @@ -969,7 +965,7 @@ static int tcp_packet(struct nf_conn *ct timeout = nf_ct_tcp_timeout_unacknowledged; else timeout = tcp_timeouts[new_state]; - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->ct_general.lock); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct); if (new_state != old_state) @@ -1022,6 +1018,7 @@ static bool tcp_new(struct nf_conn *ct, pr_debug("nf_ct_tcp: invalid new deleting.\n"); return false; } + spin_lock_init(&ct->ct_general.lock); if (new_state == TCP_CONNTRACK_SYN_SENT) { /* SYN packet */ @@ -1087,12 +1084,12 @@ static bool tcp_new(struct nf_conn *ct, #include <linux/netfilter/nfnetlink_conntrack.h> static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; - read_lock_bh(&tcp_lock); + spin_lock_bh(&ct->ct_general.lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; @@ -1112,14 +1109,14 @@ static int tcp_to_nlattr(struct sk_buff tmp.flags = ct->proto.tcp.seen[1].flags; NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp); - read_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->ct_general.lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: - read_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->ct_general.lock); return -1; } @@ -1150,7 +1147,7 @@ static int nlattr_to_tcp(struct nlattr * nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) return -EINVAL; - write_lock_bh(&tcp_lock); + spin_lock_bh(&ct->ct_general.lock); if (tb[CTA_PROTOINFO_TCP_STATE]) ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); @@ -1177,7 +1174,7 @@ static int nlattr_to_tcp(struct nlattr * ct->proto.tcp.seen[1].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->ct_general.lock); return 0; } --- a/include/linux/skbuff.h 2009-02-19 13:53:46.575411267 -0800 +++ b/include/linux/skbuff.h 2009-02-19 13:53:57.414478437 -0800 @@ -97,6 +97,7 @@ struct pipe_inode_info; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { atomic_t use; + spinlock_t lock; }; #endif ^ permalink raw reply [flat|nested] 83+ messages in thread
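The "4 byte hole" this version exploits is easy to reproduce in
userspace: a struct that starts with a 32-bit refcount and is followed
by a pointer-aligned field carries 4 bytes of padding on 64-bit, so a
4-byte lock slides in without growing the object at all. A sketch
(glibc's pthread_spinlock_t happens to be 4 bytes, like a non-debug
kernel spinlock_t; with spinlock debugging enabled the trick no longer
fits, which is the catch discussed later in the thread):

#include <pthread.h>
#include <stddef.h>
#include <stdio.h>

/* Shape of struct nf_conntrack before: refcount, then aligned data. */
struct head_plain {
	int use;		/* stands in for atomic_t use */
	void *first_field;	/* rest of the object, pointer-aligned */
};

/* After: the 4-byte lock occupies what was padding on 64-bit. */
struct head_locked {
	int use;
	pthread_spinlock_t lock;
	void *first_field;
};

int main(void)
{
	printf("sizeof(head_plain)  = %zu\n", sizeof(struct head_plain));
	printf("sizeof(head_locked) = %zu\n", sizeof(struct head_locked));
	printf("lock sits at offset %zu\n",
	       offsetof(struct head_locked, lock));
	/* On LP64 both sizes print 16: the lock really was free. */
	return 0;
}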
* [PATCH] netfilter: finer grained nf_conn locking 2009-02-18 14:17 ` Eric Dumazet 2009-02-19 22:03 ` Stephen Hemminger @ 2009-03-28 16:55 ` Eric Dumazet 2009-03-29 0:48 ` Stephen Hemminger 2009-03-30 18:57 ` Rick Jones 1 sibling, 2 replies; 83+ messages in thread From: Eric Dumazet @ 2009-03-28 16:55 UTC (permalink / raw) To: Patrick McHardy Cc: Stephen Hemminger, David Miller, Rick Jones, netdev, netfilter-devel Eric Dumazet a écrit : > Patrick McHardy a écrit : >> Stephen Hemminger wrote: >> >>> @@ -50,6 +50,7 @@ struct ip_ct_tcp_state { >>> >>> struct ip_ct_tcp >>> { >>> + spinlock_t lock; >>> struct ip_ct_tcp_state seen[2]; /* connection parameters per >>> direction */ >>> u_int8_t state; /* state of the connection (enum >>> tcp_conntrack) */ >>> /* For detecting stale connections */ >> Eric already posted a patch to use an array of locks, which is >> a better approach IMO since it keeps the size of the conntrack >> entries down. > > Yes, we probably can use an array for short lived lock sections. > > The extra cost to compute the lock address is quite small, if > the array size is known at compile time. > > Stephen we might also use this same array to protect the nf_conn_acct_find(ct) > thing as well (I am referring to your RFT 3/4 patch : > Use mod_timer_noact to remove nf_conntrack_lock) > > Here is a repost of patch Patrick is referring to : > > > [PATCH] netfilter: Get rid of central rwlock in tcp conntracking > > TCP connection tracking suffers of huge contention on a global rwlock, > used to protect tcp conntracking state. > > As each tcp conntrack state have no relations between each others, we > can switch to fine grained lock. Using an array of spinlocks avoids > enlarging size of connection tracking structures, yet giving reasonable > fanout. > > tcp_print_conntrack() doesnt need to lock anything to read > ct->proto.tcp.state, so speedup /proc/net/ip_conntrack as well. > > nf_conntrack_hash_rnd_initted & nf_conntrack_hash_rnd declared read_mostly > Hi Patrick Apparently we could not finish the removal of tcp_lock for 2.6.30 :( Stephen suggested using a 4 bytes hole in struct nf_conntrack, but this is ok only if sizeof(spinlock_t) <= 4 and 64 bit arches. We could do an hybrid thing : use nf_conn.ct_general.lock if 64 bit arches and sizeof(spinlock_t) <= 4. Other cases would use a carefuly sized array of spinlocks... Thank you [PATCH] netfilter: finer grained nf_conn locking Introduction of fine grained lock infrastructure for nf_conn. If possible, we use a 32bit hole on 64bit arches. Else we use a global array of hashed spinlocks, so we dont change size of "struct nf_conn" Get rid of central tcp_lock rwlock used in TCP conntracking using this infrastructure for better performance on SMP. 
"tbench 8" results on my 8 core machine (32bit kernel, with conntracking on) : 2319 MB/s instead of 2284 MB/s Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> --- include/linux/skbuff.h | 9 ++- include/net/netfilter/nf_conntrack_l4proto.h | 2 net/netfilter/nf_conntrack_core.c | 47 +++++++++++++++++ net/netfilter/nf_conntrack_netlink.c | 4 - net/netfilter/nf_conntrack_proto_tcp.c | 35 +++++------- 5 files changed, 74 insertions(+), 23 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index bb1981f..c737f47 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -96,7 +96,14 @@ struct pipe_inode_info; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { - atomic_t use; + atomic_t use; +#if BITS_PER_LONG == 64 + /* + * On 64bit arches, we might use this 32bit hole for spinlock + * (if a spinlock_t fits) + */ + int pad; +#endif }; #endif diff --git a/include/net/netfilter/nf_conntrack_l4proto.h b/include/net/netfilter/nf_conntrack_l4proto.h index ba32ed7..d66bea9 100644 --- a/include/net/netfilter/nf_conntrack_l4proto.h +++ b/include/net/netfilter/nf_conntrack_l4proto.h @@ -63,7 +63,7 @@ struct nf_conntrack_l4proto /* convert protoinfo to nfnetink attributes */ int (*to_nlattr)(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct); + struct nf_conn *ct); /* Calculate protoinfo nlattr size */ int (*nlattr_size)(void); diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 8020db6..408d287 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -32,6 +32,7 @@ #include <linux/rculist_nulls.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_lock.h> #include <net/netfilter/nf_conntrack_l3proto.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_expect.h> @@ -523,6 +524,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, return ERR_PTR(-ENOMEM); } + nf_conn_lock_init(ct); atomic_set(&ct->ct_general.use, 1); ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; @@ -1033,11 +1035,50 @@ void nf_conntrack_flush(struct net *net, u32 pid, int report) } EXPORT_SYMBOL_GPL(nf_conntrack_flush); +spinlock_t *nf_conn_hlocks __read_mostly; +unsigned int nf_conn_hlocks_mask __read_mostly; + +static int nf_conn_hlocks_init(void) +{ + if (nf_conn_lock_type() == NF_CONN_LOCK_EXTERNAL) { + size_t sz; + int i; +#if defined(CONFIG_PROVE_LOCKING) + unsigned int nr_slots = 256; +#else + /* 4 nodes per cpu on VSMP, or 256 slots per cpu */ + unsigned int nr_slots = max(256UL, ((4UL << INTERNODE_CACHE_SHIFT) / + sizeof(spinlock_t))); + nr_slots = roundup_pow_of_two(num_possible_cpus() * nr_slots); +#endif + sz = nr_slots * sizeof(spinlock_t); + if (sz > PAGE_SIZE) + nf_conn_hlocks = vmalloc(sz); + else + nf_conn_hlocks = kmalloc(sz, GFP_KERNEL); + if (!nf_conn_hlocks) + return -ENOMEM; + nf_conn_hlocks_mask = nr_slots - 1; + for (i = 0; i < nr_slots; i++) + spin_lock_init(nf_conn_hlocks + i); + } + return 0; +} + +static void nf_conn_hlocks_fini(void) +{ + if (is_vmalloc_addr(nf_conn_hlocks)) + vfree(nf_conn_hlocks); + else + kfree(nf_conn_hlocks); +} + static void nf_conntrack_cleanup_init_net(void) { nf_conntrack_helper_fini(); nf_conntrack_proto_fini(); kmem_cache_destroy(nf_conntrack_cachep); + nf_conn_hlocks_fini(); } static void nf_conntrack_cleanup_net(struct net *net) @@ -1170,6 +1211,10 @@ static int nf_conntrack_init_init_net(void) int max_factor 
= 8; int ret; + ret = nf_conn_hlocks_init(); + if (ret) + goto err_hlocks; + /* Idea from tcp.c: use 1/16384 of memory. On i386: 32MB * machine has 512 buckets. >= 1GB machines have 16384 buckets. */ if (!nf_conntrack_htable_size) { @@ -1217,6 +1262,8 @@ err_helper: err_proto: kmem_cache_destroy(nf_conntrack_cachep); err_cache: + nf_conn_hlocks_fini(); +err_hlocks: return ret; } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index c6439c7..89ea035 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -144,7 +144,7 @@ nla_put_failure: } static inline int -ctnetlink_dump_protoinfo(struct sk_buff *skb, const struct nf_conn *ct) +ctnetlink_dump_protoinfo(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conntrack_l4proto *l4proto; struct nlattr *nest_proto; @@ -351,7 +351,7 @@ nla_put_failure: static int ctnetlink_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index b5ccf2b..bb5fc24 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -23,14 +23,13 @@ #include <linux/netfilter_ipv4.h> #include <linux/netfilter_ipv6.h> #include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_lock.h> #include <net/netfilter/nf_conntrack_l4proto.h> #include <net/netfilter/nf_conntrack_ecache.h> #include <net/netfilter/nf_log.h> #include <net/netfilter/ipv4/nf_conntrack_ipv4.h> #include <net/netfilter/ipv6/nf_conntrack_ipv6.h> -/* Protects ct->proto.tcp */ -static DEFINE_RWLOCK(tcp_lock); /* "Be conservative in what you do, be liberal in what you accept from others." @@ -300,9 +299,7 @@ static int tcp_print_conntrack(struct seq_file *s, const struct nf_conn *ct) { enum tcp_conntrack state; - read_lock_bh(&tcp_lock); state = ct->proto.tcp.state; - read_unlock_bh(&tcp_lock); return seq_printf(s, "%s ", tcp_conntrack_names[state]); } @@ -708,14 +705,14 @@ void nf_conntrack_tcp_update(const struct sk_buff *skb, end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); - write_lock_bh(&tcp_lock); + spin_lock_bh(nf_conn_lock_addr(ct)); /* * We have to worry for the ack in the reply packet only... */ if (after(end, ct->proto.tcp.seen[dir].td_end)) ct->proto.tcp.seen[dir].td_end = end; ct->proto.tcp.last_end = end; - write_unlock_bh(&tcp_lock); + spin_unlock_bh(nf_conn_lock_addr(ct)); pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, @@ -824,7 +821,7 @@ static int tcp_packet(struct nf_conn *ct, th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); - write_lock_bh(&tcp_lock); + spin_lock_bh(nf_conn_lock_addr(ct)); old_state = ct->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); @@ -854,7 +851,7 @@ static int tcp_packet(struct nf_conn *ct, && ct->proto.tcp.last_index == TCP_RST_SET)) { /* Attempt to reopen a closed/aborted connection. * Delete this connection and look up again. */ - write_unlock_bh(&tcp_lock); + spin_unlock_bh(nf_conn_lock_addr(ct)); /* Only repeat if we can actually remove the timer. 
* Destruction may already be in progress in process @@ -890,7 +887,7 @@ static int tcp_packet(struct nf_conn *ct, * that the client cannot but retransmit its SYN and * thus initiate a clean new session. */ - write_unlock_bh(&tcp_lock); + spin_unlock_bh(nf_conn_lock_addr(ct)); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: killing out of sync session "); @@ -903,7 +900,7 @@ static int tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - write_unlock_bh(&tcp_lock); + spin_unlock_bh(nf_conn_lock_addr(ct)); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid packet ignored "); @@ -912,7 +909,7 @@ static int tcp_packet(struct nf_conn *ct, /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); - write_unlock_bh(&tcp_lock); + spin_unlock_bh(nf_conn_lock_addr(ct)); if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); @@ -943,7 +940,7 @@ static int tcp_packet(struct nf_conn *ct, if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, skb, dataoff, th, pf)) { - write_unlock_bh(&tcp_lock); + spin_unlock_bh(nf_conn_lock_addr(ct)); return -NF_ACCEPT; } in_window: @@ -972,7 +969,7 @@ static int tcp_packet(struct nf_conn *ct, timeout = nf_ct_tcp_timeout_unacknowledged; else timeout = tcp_timeouts[new_state]; - write_unlock_bh(&tcp_lock); + spin_unlock_bh(nf_conn_lock_addr(ct)); nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct); if (new_state != old_state) @@ -1090,12 +1087,12 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, #include <linux/netfilter/nfnetlink_conntrack.h> static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; - read_lock_bh(&tcp_lock); + spin_lock_bh(nf_conn_lock_addr(ct)); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; @@ -1115,14 +1112,14 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, tmp.flags = ct->proto.tcp.seen[1].flags; NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp); - read_unlock_bh(&tcp_lock); + spin_unlock_bh(nf_conn_lock_addr(ct)); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: - read_unlock_bh(&tcp_lock); + spin_unlock_bh(nf_conn_lock_addr(ct)); return -1; } @@ -1153,7 +1150,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) return -EINVAL; - write_lock_bh(&tcp_lock); + spin_lock_bh(nf_conn_lock_addr(ct)); if (tb[CTA_PROTOINFO_TCP_STATE]) ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); @@ -1180,7 +1177,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) ct->proto.tcp.seen[1].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } - write_unlock_bh(&tcp_lock); + spin_unlock_bh(nf_conn_lock_addr(ct)); return 0; } ^ permalink raw reply related [flat|nested] 83+ messages in thread
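One caveat for readers of the archive: this patch includes
<net/netfilter/nf_conntrack_lock.h> and relies on nf_conn_lock_init(),
nf_conn_lock_addr() and nf_conn_lock_type(), but that header is not part
of the posted diff. What follows is only a guess at its shape,
reconstructed from the call sites and from Eric's description (embedded
lock in the 64-bit pad hole when spinlock_t fits, hashed global array
otherwise); the real header may well have differed:

/* HYPOTHETICAL reconstruction -- not from the original posting. */
#ifndef _NF_CONNTRACK_LOCK_H
#define _NF_CONNTRACK_LOCK_H

#include <linux/spinlock.h>
#include <net/netfilter/nf_conntrack.h>

enum nf_conn_lock_class {
	NF_CONN_LOCK_EMBEDDED,	/* lock lives in nf_conntrack's pad hole */
	NF_CONN_LOCK_EXTERNAL,	/* lock lives in the hashed global array */
};

extern spinlock_t *nf_conn_hlocks;
extern unsigned int nf_conn_hlocks_mask;

#if BITS_PER_LONG == 64 && !defined(CONFIG_DEBUG_SPINLOCK) && \
    !defined(CONFIG_PROVE_LOCKING)

static inline int nf_conn_lock_type(void)
{
	return NF_CONN_LOCK_EMBEDDED;
}

static inline spinlock_t *nf_conn_lock_addr(const struct nf_conn *ct)
{
	/* only valid because sizeof(spinlock_t) <= sizeof(int) here */
	BUILD_BUG_ON(sizeof(spinlock_t) > sizeof(int));
	return (spinlock_t *)&ct->ct_general.pad;
}

#else

static inline int nf_conn_lock_type(void)
{
	return NF_CONN_LOCK_EXTERNAL;
}

static inline spinlock_t *nf_conn_lock_addr(const struct nf_conn *ct)
{
	unsigned long hash = (unsigned long)ct;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	return &nf_conn_hlocks[hash & nf_conn_hlocks_mask];
}

#endif

static inline void nf_conn_lock_init(struct nf_conn *ct)
{
	/* array slots are initialized once in nf_conn_hlocks_init() */
	if (nf_conn_lock_type() == NF_CONN_LOCK_EMBEDDED)
		spin_lock_init(nf_conn_lock_addr(ct));
}

#endif /* _NF_CONNTRACK_LOCK_H */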
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-28 16:55 ` [PATCH] netfilter: finer grained nf_conn locking Eric Dumazet
@ 2009-03-29  0:48   ` Stephen Hemminger
  2009-03-30 19:57     ` Eric Dumazet
  0 siblings, 1 reply; 83+ messages in thread
From: Stephen Hemminger @ 2009-03-29 0:48 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, David Miller, Rick Jones, netdev, netfilter-devel

On Sat, 28 Mar 2009 17:55:38 +0100
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Eric Dumazet wrote:
> > Patrick McHardy wrote:
> >> Stephen Hemminger wrote:
> >>
> >>> @@ -50,6 +50,7 @@ struct ip_ct_tcp_state {
> >>>
> >>> struct ip_ct_tcp
> >>> {
> >>> +	spinlock_t	lock;
> >>> 	struct ip_ct_tcp_state seen[2];	/* connection parameters per direction */
> >>> 	u_int8_t	state;		/* state of the connection (enum tcp_conntrack) */
> >>> 	/* For detecting stale connections */
> >> Eric already posted a patch to use an array of locks, which is
> >> a better approach IMO since it keeps the size of the conntrack
> >> entries down.
> >
> > Yes, we probably can use an array for short-lived lock sections.

I am not a fan of the array of locks. Sizing it is awkward and it is
vulnerable to hash collisions. Let's see if there is a better way.

^ permalink raw reply	[flat|nested] 83+ messages in thread
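Stephen's two objections can be examined separately. A collision here
never deadlocks: two conntracks hashing to the same slot merely share a
lock for their short critical sections. The practical question is
whether the pointer-fold hash spreads slab-style addresses evenly. A
quick userspace experiment (the base address and object stride are
arbitrary made-up values, not real kernel addresses):

#include <stdint.h>
#include <stdio.h>

#define SLOTS 2048	/* lock array size, power of two */

int main(void)
{
	static unsigned int bucket[SLOTS];
	uint64_t base = 0xffff880012340000ULL;	/* invented slab base */
	unsigned int i, used = 0, max = 0;

	for (i = 0; i < 100000; i++) {
		uint64_t hash = base + (uint64_t)i * 256; /* object stride */

		hash ^= hash >> 16;
		hash ^= hash >> 8;
		bucket[hash % SLOTS]++;
	}
	for (i = 0; i < SLOTS; i++) {
		if (bucket[i]) {
			used++;
			if (bucket[i] > max)
				max = bucket[i];
		}
	}
	printf("100000 objects over %d slots: %u slots used, "
	       "worst slot holds %u\n", SLOTS, used, max);
	return 0;
}

With roughly 49 objects per slot on average, even a mildly skewed spread
still beats one global lock by a wide margin; the sizing question Eric
concedes below is the real open issue.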
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-29  0:48   ` Stephen Hemminger
@ 2009-03-30 19:57     ` Eric Dumazet
  2009-03-30 20:05       ` Stephen Hemminger
  0 siblings, 1 reply; 83+ messages in thread
From: Eric Dumazet @ 2009-03-30 19:57 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Patrick McHardy, David Miller, Rick Jones, netdev, netfilter-devel

Stephen Hemminger wrote:
> On Sat, 28 Mar 2009 17:55:38 +0100
> Eric Dumazet <dada1@cosmosbay.com> wrote:
>
>> Eric Dumazet wrote:
>>> Patrick McHardy wrote:
>>>> Stephen Hemminger wrote:
>>>>
>>>>> @@ -50,6 +50,7 @@ struct ip_ct_tcp_state {
>>>>>
>>>>> struct ip_ct_tcp
>>>>> {
>>>>> +	spinlock_t	lock;
>>>>> 	struct ip_ct_tcp_state seen[2];	/* connection parameters per direction */
>>>>> 	u_int8_t	state;		/* state of the connection (enum tcp_conntrack) */
>>>>> 	/* For detecting stale connections */
>>>> Eric already posted a patch to use an array of locks, which is
>>>> a better approach IMO since it keeps the size of the conntrack
>>>> entries down.
>>> Yes, we probably can use an array for short-lived lock sections.
>
> I am not a fan of the array of locks. Sizing it is awkward and
> it is vulnerable to hash collisions. Let's see if there is a better way.

On normal machines (no debugging spinlocks), the patch uses an embedded
spinlock. We probably can use this even on 32bit kernels, considering
the previous patch removed the rcu_head (8 bytes on 32bit arches) from
nf_conn :)

If LOCKDEP is on, the size of a spinlock is 64 bytes on x86_64.
Adding a spinlock to each nf_conn would be too expensive. In this
case, an array of spinlocks is a good compromise, as done in the
IP route cache, tcp ehash, ...

I agree the sizing of this hash table is not pretty, and it should be
a generic kernel service (I wanted such a service for futexes, for example).

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-30 19:57     ` Eric Dumazet
@ 2009-03-30 20:05       ` Stephen Hemminger
  2009-04-06 12:07         ` Patrick McHardy
  0 siblings, 1 reply; 83+ messages in thread
From: Stephen Hemminger @ 2009-03-30 20:05 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, David Miller, Rick Jones, netdev, netfilter-devel

On Mon, 30 Mar 2009 21:57:15 +0200
Eric Dumazet <dada1@cosmosbay.com> wrote:

> Stephen Hemminger wrote:
> > On Sat, 28 Mar 2009 17:55:38 +0100
> > Eric Dumazet <dada1@cosmosbay.com> wrote:
> >
> >> Eric Dumazet wrote:
> >>> Patrick McHardy wrote:
> >>>> Stephen Hemminger wrote:
> >>>>
> >>>>> @@ -50,6 +50,7 @@ struct ip_ct_tcp_state {
> >>>>>
> >>>>> struct ip_ct_tcp
> >>>>> {
> >>>>> +	spinlock_t	lock;
> >>>>> 	struct ip_ct_tcp_state seen[2];	/* connection parameters per direction */
> >>>>> 	u_int8_t	state;		/* state of the connection (enum tcp_conntrack) */
> >>>>> 	/* For detecting stale connections */
> >>>> Eric already posted a patch to use an array of locks, which is
> >>>> a better approach IMO since it keeps the size of the conntrack
> >>>> entries down.
> >>> Yes, we probably can use an array for short-lived lock sections.
> >
> > I am not a fan of the array of locks. Sizing it is awkward and
> > it is vulnerable to hash collisions. Let's see if there is a better way.
>
> On normal machines (no debugging spinlocks), the patch uses an embedded
> spinlock. We probably can use this even on 32bit kernels, considering
> the previous patch removed the rcu_head (8 bytes on 32bit arches) from
> nf_conn :)
>
> If LOCKDEP is on, the size of a spinlock is 64 bytes on x86_64.
> Adding a spinlock to each nf_conn would be too expensive. In this
> case, an array of spinlocks is a good compromise, as done in the
> IP route cache, tcp ehash, ...
>
> I agree the sizing of this hash table is not pretty, and it should be
> a generic kernel service (I wanted such a service for futexes, for example).

IMO having different locking based on lockdep and architecture is an
invitation to future obscure problems. Perhaps some other locking
method or shrinking the ct entry would be better.

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-30 20:05       ` Stephen Hemminger
@ 2009-04-06 12:07         ` Patrick McHardy
  2009-04-06 12:32           ` Jan Engelhardt
  0 siblings, 1 reply; 83+ messages in thread
From: Patrick McHardy @ 2009-04-06 12:07 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: Eric Dumazet, David Miller, Rick Jones, netdev, netfilter-devel

Stephen Hemminger wrote:
> On Mon, 30 Mar 2009 21:57:15 +0200
> Eric Dumazet <dada1@cosmosbay.com> wrote:
>
>> On normal machines (no debugging spinlocks), the patch uses an embedded
>> spinlock. We probably can use this even on 32bit kernels, considering
>> the previous patch removed the rcu_head (8 bytes on 32bit arches) from
>> nf_conn :)
>>
>> If LOCKDEP is on, the size of a spinlock is 64 bytes on x86_64.
>> Adding a spinlock to each nf_conn would be too expensive. In this
>> case, an array of spinlocks is a good compromise, as done in the
>> IP route cache, tcp ehash, ...
>>
>> I agree the sizing of this hash table is not pretty, and it should be
>> a generic kernel service (I wanted such a service for futexes, for example).
>
> IMO having different locking based on lockdep and architecture is an
> invitation to future obscure problems. Perhaps some other locking
> method or shrinking the ct entry would be better.

I agree. Do people enable lockdep on production machines?
Otherwise I'd say the size increase doesn't really matter.

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-04-06 12:07         ` Patrick McHardy
@ 2009-04-06 12:32           ` Jan Engelhardt
  2009-04-06 17:25             ` Stephen Hemminger
  0 siblings, 1 reply; 83+ messages in thread
From: Jan Engelhardt @ 2009-04-06 12:32 UTC (permalink / raw)
  To: Patrick McHardy
  Cc: Stephen Hemminger, Eric Dumazet, David Miller, Rick Jones,
	netdev, netfilter-devel

On Monday 2009-04-06 14:07, Patrick McHardy wrote:
>>>
>>> If LOCKDEP is on, the size of a spinlock is 64 bytes on x86_64.
>>> Adding a spinlock to each nf_conn would be too expensive. In this
>>> case, an array of spinlocks is a good compromise, as done in the
>>> IP route cache, tcp ehash, ...
>>
>> IMO having different locking based on lockdep and architecture is an
>> invitation to future obscure problems. Perhaps some other locking
>> method or shrinking the ct entry would be better.
>
> I agree. Do people enable lockdep on production machines?

They do not.[1]

[1] http://git.opensuse.org/?p=people/jblunck/kernel-source.git;a=blob;f=config/x86_64/default;hb=SL111_BRANCH

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-04-06 12:32           ` Jan Engelhardt
@ 2009-04-06 17:25             ` Stephen Hemminger
  0 siblings, 0 replies; 83+ messages in thread
From: Stephen Hemminger @ 2009-04-06 17:25 UTC (permalink / raw)
  To: Jan Engelhardt
  Cc: Patrick McHardy, Eric Dumazet, David Miller, Rick Jones,
	netdev, netfilter-devel

On Mon, 6 Apr 2009 14:32:54 +0200 (CEST)
Jan Engelhardt <jengelh@medozas.de> wrote:

> On Monday 2009-04-06 14:07, Patrick McHardy wrote:
> >>>
> >>> If LOCKDEP is on, the size of a spinlock is 64 bytes on x86_64.
> >>> Adding a spinlock to each nf_conn would be too expensive. In this
> >>> case, an array of spinlocks is a good compromise, as done in the
> >>> IP route cache, tcp ehash, ...
> >>
> >> IMO having different locking based on lockdep and architecture is an
> >> invitation to future obscure problems. Perhaps some other locking
> >> method or shrinking the ct entry would be better.
> >
> > I agree. Do people enable lockdep on production machines?
>
> They do not.[1]
>
> [1] http://git.opensuse.org/?p=people/jblunck/kernel-source.git;a=blob;f=config/x86_64/default;hb=SL111_BRANCH

IMHO, if they enable lockdep, they can expect the cost to be non-zero.

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-28 16:55 ` [PATCH] netfilter: finer grained nf_conn locking Eric Dumazet
  2009-03-29  0:48   ` Stephen Hemminger
@ 2009-03-30 18:57   ` Rick Jones
  2009-03-30 19:20     ` Eric Dumazet
  2009-03-30 19:38     ` Jesper Dangaard Brouer
  1 sibling, 2 replies; 83+ messages in thread
From: Rick Jones @ 2009-03-30 18:57 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: Patrick McHardy, Stephen Hemminger, David Miller, netdev,
	netfilter-devel

Eric Dumazet wrote:
> Hi Patrick
>
> Apparently we could not finish the removal of tcp_lock for 2.6.30 :(
>
> Stephen suggested using a 4-byte hole in struct nf_conntrack,
> but this is ok only on 64 bit arches, and only if sizeof(spinlock_t) <= 4.
>
> We could do a hybrid thing: use nf_conn.ct_general.lock on 64 bit arches
> when sizeof(spinlock_t) <= 4.
>
> Other cases would use a carefully sized array of spinlocks...
>
> Thank you
>
> [PATCH] netfilter: finer grained nf_conn locking
>
> Introduction of a fine-grained lock infrastructure for nf_conn.
> If possible, we use a 32bit hole on 64bit arches.
> Else we use a global array of hashed spinlocks, so we don't
> change the size of "struct nf_conn".
>
> Get rid of the central tcp_lock rwlock used in TCP conntracking
> using this infrastructure, for better performance on SMP.
>
> "tbench 8" results on my 8 core machine (32bit kernel, with
> conntracking on) : 2319 MB/s instead of 2284 MB/s

Is this an implicit request for me to try to resurrect the 32-core setup?

rick jones

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-30 18:57   ` Rick Jones
@ 2009-03-30 19:20     ` Eric Dumazet
  0 siblings, 0 replies; 83+ messages in thread
From: Eric Dumazet @ 2009-03-30 19:20 UTC (permalink / raw)
  To: Rick Jones
  Cc: Patrick McHardy, Stephen Hemminger, David Miller, netdev,
	netfilter-devel

Rick Jones wrote:
> Eric Dumazet wrote:
>> Hi Patrick
>>
>> Apparently we could not finish the removal of tcp_lock for 2.6.30 :(
>>
>> Stephen suggested using a 4-byte hole in struct nf_conntrack,
>> but this is ok only on 64 bit arches, and only if sizeof(spinlock_t) <= 4.
>>
>> We could do a hybrid thing: use nf_conn.ct_general.lock on 64 bit arches
>> when sizeof(spinlock_t) <= 4.
>>
>> Other cases would use a carefully sized array of spinlocks...
>>
>> Thank you
>>
>> [PATCH] netfilter: finer grained nf_conn locking
>>
>> Introduction of a fine-grained lock infrastructure for nf_conn.
>> If possible, we use a 32bit hole on 64bit arches.
>> Else we use a global array of hashed spinlocks, so we don't
>> change the size of "struct nf_conn".
>>
>> Get rid of the central tcp_lock rwlock used in TCP conntracking
>> using this infrastructure, for better performance on SMP.
>>
>> "tbench 8" results on my 8 core machine (32bit kernel, with
>> conntracking on) : 2319 MB/s instead of 2284 MB/s
>
> Is this an implicit request for me to try to resurrect the 32-core setup?

Not at all, just to keep you informed of work in progress :)

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-30 18:57 ` Rick Jones
  2009-03-30 19:20 ` Eric Dumazet
@ 2009-03-30 19:38 ` Jesper Dangaard Brouer
  2009-03-30 19:54 ` Eric Dumazet
  1 sibling, 1 reply; 83+ messages in thread
From: Jesper Dangaard Brouer @ 2009-03-30 19:38 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Netfilter Developers

> Eric Dumazet wrote:
>> "tbench 8" results on my 8 core machine (32bit kernel, with
>> conntracking on): 2319 MB/s instead of 2284 MB/s

How do you achieve these impressive numbers?
Is it against localhost? (10 Gbit/s is at most 1250 MB/s)

Regards,
  Jesper Brouer

--
-------------------------------------------------------------------
MSc. Master of Computer Science
Dept. of Computer Science, University of Copenhagen
Author of http://www.adsl-optimizer.dk
-------------------------------------------------------------------

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-30 19:38 ` Jesper Dangaard Brouer
@ 2009-03-30 19:54 ` Eric Dumazet
  2009-03-30 20:34 ` Jesper Dangaard Brouer
  0 siblings, 1 reply; 83+ messages in thread
From: Eric Dumazet @ 2009-03-30 19:54 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: netdev, Netfilter Developers

Jesper Dangaard Brouer wrote:
>
>> Eric Dumazet wrote:
>>> "tbench 8" results on my 8 core machine (32bit kernel, with
>>> conntracking on): 2319 MB/s instead of 2284 MB/s
>
> How do you achieve these impressive numbers?
> Is it against localhost? (10 Gbit/s is at most 1250 MB/s)

tbench is a TCP test over localhost, yes :)

Good for testing the TCP stack without going through NIC hardware.

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-30 19:54 ` Eric Dumazet
@ 2009-03-30 20:34 ` Jesper Dangaard Brouer
  2009-03-30 20:41 ` Eric Dumazet
  0 siblings, 1 reply; 83+ messages in thread
From: Jesper Dangaard Brouer @ 2009-03-30 20:34 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Netfilter Developers

[-- Attachment #1: Type: TEXT/PLAIN, Size: 1281 bytes --]

On Mon, 30 Mar 2009, Eric Dumazet wrote:

> Jesper Dangaard Brouer wrote:
>>
>>> Eric Dumazet wrote:
>>>> "tbench 8" results on my 8 core machine (32bit kernel, with
>>>> conntracking on): 2319 MB/s instead of 2284 MB/s
>>
>> How do you achieve these impressive numbers?
>> Is it against localhost? (10 Gbit/s is at most 1250 MB/s)
>
> tbench is a TCP test over localhost, yes :)

I see!

Using a Sun 10GbE NIC I was only getting a throughput of 556.86 MB/sec
with 64 procs (between an AMD Phenom X4 and a Core i7). (Haven't tuned
the multiqueue setup yet ...)

Against localhost I'm getting (without the patch applied):

 1336.42 MB/sec on my AMD Phenom X4 9950 Quad-Core Processor

 1552.81 MB/sec on my Core i7 920 (4 physical cores, plus 4 hyperthreads)

 2274.53 MB/sec on my dual-CPU Xeon E5420 (8 cores)

> Good for testing the TCP stack without going through NIC hardware.

Yes, true, but this also stresses the process scheduler; I'm seeing
around 800,000 context switches per second on the dual-CPU Xeon system.

Cheers,
  Jesper Brouer

--
-------------------------------------------------------------------
MSc. Master of Computer Science
Dept. of Computer Science, University of Copenhagen
Author of http://www.adsl-optimizer.dk
-------------------------------------------------------------------

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-30 20:34 ` Jesper Dangaard Brouer
@ 2009-03-30 20:41 ` Eric Dumazet
  2009-03-30 21:25 ` Jesper Dangaard Brouer
  2009-03-30 22:44 ` Rick Jones
  0 siblings, 2 replies; 83+ messages in thread
From: Eric Dumazet @ 2009-03-30 20:41 UTC (permalink / raw)
  To: Jesper Dangaard Brouer; +Cc: netdev, Netfilter Developers

Jesper Dangaard Brouer wrote:
> On Mon, 30 Mar 2009, Eric Dumazet wrote:
>
>> Jesper Dangaard Brouer wrote:
>>>
>>>> Eric Dumazet wrote:
>>>>> "tbench 8" results on my 8 core machine (32bit kernel, with
>>>>> conntracking on): 2319 MB/s instead of 2284 MB/s
>>>
>>> How do you achieve these impressive numbers?
>>> Is it against localhost? (10 Gbit/s is at most 1250 MB/s)
>>
>> tbench is a TCP test over localhost, yes :)
>
> I see!
>
> Using a Sun 10GbE NIC I was only getting a throughput of 556.86 MB/sec
> with 64 procs (between an AMD Phenom X4 and a Core i7). (Haven't tuned
> the multiqueue setup yet ...)
>
> Against localhost I'm getting (without the patch applied):
>
> 1336.42 MB/sec on my AMD Phenom X4 9950 Quad-Core Processor
>
> 1552.81 MB/sec on my Core i7 920 (4 physical cores, plus 4 hyperthreads)

Strange results compared to my E5420 (I thought the i7 was faster??)

> 2274.53 MB/sec on my dual-CPU Xeon E5420 (8 cores)

Yes, my dev machine is a dual E5420 (8 cores) at 3.00 GHz.
The gcc version here is 4.3.3.

>> Good for testing the TCP stack without going through NIC hardware.
>
> Yes, true, but this also stresses the process scheduler; I'm seeing
> around 800,000 context switches per second on the dual-CPU Xeon system.

Indeed, tbench is a mix of TCP and process-scheduler test/bench.

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-30 20:41 ` Eric Dumazet
@ 2009-03-30 21:25 ` Jesper Dangaard Brouer
  2009-03-30 22:44 ` Rick Jones
  1 sibling, 0 replies; 83+ messages in thread
From: Jesper Dangaard Brouer @ 2009-03-30 21:25 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: netdev, Netfilter Developers

[-- Attachment #1: Type: TEXT/PLAIN, Size: 853 bytes --]

On Mon, 30 Mar 2009, Eric Dumazet wrote:

> Jesper Dangaard Brouer wrote:
>>
>> Against localhost I'm getting (without the patch applied):
>>
>> 1336.42 MB/sec on my AMD Phenom X4 9950 Quad-Core Processor
>>
>> 1552.81 MB/sec on my Core i7 920 (4 physical cores, plus 4 hyperthreads)
>
> Strange results compared to my E5420 (I thought the i7 was faster??)

I also thought that the i7 would be faster, but I think it can be
explained by the i7 only having 4 real cores, even though it shows
8 CPUs (due to hyperthreading).

>> 2274.53 MB/sec on my dual-CPU Xeon E5420 (8 cores)

Regards,
  Jesper Brouer

--
-------------------------------------------------------------------
MSc. Master of Computer Science
Dept. of Computer Science, University of Copenhagen
Author of http://www.adsl-optimizer.dk
-------------------------------------------------------------------

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [PATCH] netfilter: finer grained nf_conn locking
  2009-03-30 20:41 ` Eric Dumazet
  2009-03-30 21:25 ` Jesper Dangaard Brouer
@ 2009-03-30 22:44 ` Rick Jones
  1 sibling, 0 replies; 83+ messages in thread
From: Rick Jones @ 2009-03-30 22:44 UTC (permalink / raw)
  To: Eric Dumazet; +Cc: Jesper Dangaard Brouer, netdev, Netfilter Developers

> Indeed, tbench is a mix of TCP and process-scheduler test/bench.

If I were inclined to run networking tests (e.g. netperf) over loopback
and wanted to maximize the trips up and down the protocol stack while
minimizing scheduler overheads, I might be inclined to configure
--enable-burst with netperf and then run N/2 concurrent instances of
something like:

netperf -T M,N -t TCP_RR -l 30 -- -b 128 -D &

where M and N were chosen to have each netperf and netserver pair bound
to a pair of suitable cores, and the value in the -b option was picked
to maximize the CPU utilization on those cores. Then, in theory, there
would be little to no process-to-process context switching and
presumably little in the way of scheduler effects.

What I don't know is whether such a setup would have both netperf and
netserver each consuming 100% of a CPU, or if one of them might "peg"
before the other. If one did peg before the other, I might be inclined
to switch to running N concurrent instances, with -T M to bind each
netperf/netserver pair to the same core. There would then be
process-to-process context switching, though it would be limited to
"related" processes.

rick jones

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking
  2009-02-18  9:56 ` Patrick McHardy
  2009-02-18 14:17 ` Eric Dumazet
@ 2009-02-18 21:55 ` David Miller
  2009-02-18 23:23 ` Patrick McHardy
  1 sibling, 1 reply; 83+ messages in thread
From: David Miller @ 2009-02-18 21:55 UTC (permalink / raw)
  To: kaber; +Cc: shemminger, rick.jones2, dada1, netdev, netfilter-devel

From: Patrick McHardy <kaber@trash.net>
Date: Wed, 18 Feb 2009 10:56:45 +0100

> Stephen Hemminger wrote:
>
>> @@ -50,6 +50,7 @@ struct ip_ct_tcp_state {
>>  struct ip_ct_tcp
>>  {
>> +	spinlock_t	lock;
>>  	struct ip_ct_tcp_state seen[2];	/* connection parameters per direction */
>>  	u_int8_t	state;		/* state of the connection (enum tcp_conntrack) */
>>  	/* For detecting stale connections */
>
> Eric already posted a patch to use an array of locks, which is
> a better approach IMO since it keeps the size of the conntrack
> entries down.

Just as a side note, we generally frown upon the
hash-array-of-spinlocks approach to scalability.

If you need proof that in the long term it's suboptimal, note that:

1) this is Solaris's approach to locking scalability :-)

2) every such case in the kernel eventually gets transformed into
   RCU, a tree/trie based scheme, or some combination of the two

So maybe for now it's ok, but keep in mind that eventually
this is certain to change. :)

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking
  2009-02-18 21:55 ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking David Miller
@ 2009-02-18 23:23 ` Patrick McHardy
  2009-02-18 23:35 ` Stephen Hemminger
  0 siblings, 1 reply; 83+ messages in thread
From: Patrick McHardy @ 2009-02-18 23:23 UTC (permalink / raw)
  To: David Miller; +Cc: shemminger, rick.jones2, dada1, netdev, netfilter-devel

David Miller wrote:
> From: Patrick McHardy <kaber@trash.net>
> Date: Wed, 18 Feb 2009 10:56:45 +0100
>
>> Eric already posted a patch to use an array of locks, which is
>> a better approach IMO since it keeps the size of the conntrack
>> entries down.
>
> Just as a side note, we generally frown upon the
> hash-array-of-spinlocks approach to scalability.
>
> If you need proof that in the long term it's suboptimal, note that:
>
> 1) this is Solaris's approach to locking scalability :-)

:)

> 2) every such case in the kernel eventually gets transformed into
>    RCU, a tree/trie based scheme, or some combination of the two
>
> So maybe for now it's ok, but keep in mind that eventually
> this is certain to change. :)

This case might be different in that a normal firewall use case
probably doesn't have more than 16 CPUs; even that would be quite
a lot. So for bigger machines this is probably more about keeping
the "non-use" costs low.

I'll keep it in mind though, and I'm interested in seeing how it
turns out in the long term :)

^ permalink raw reply	[flat|nested] 83+ messages in thread
* Re: [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking
  2009-02-18 23:23 ` Patrick McHardy
@ 2009-02-18 23:35 ` Stephen Hemminger
  0 siblings, 0 replies; 83+ messages in thread
From: Stephen Hemminger @ 2009-02-18 23:35 UTC (permalink / raw)
  To: Patrick McHardy; +Cc: David Miller, rick.jones2, dada1, netdev, netfilter-devel

On Thu, 19 Feb 2009 00:23:45 +0100
Patrick McHardy <kaber@trash.net> wrote:

> David Miller wrote:
>> From: Patrick McHardy <kaber@trash.net>
>> Date: Wed, 18 Feb 2009 10:56:45 +0100
>>
>>> Eric already posted a patch to use an array of locks, which is
>>> a better approach IMO since it keeps the size of the conntrack
>>> entries down.
>>
>> Just as a side note, we generally frown upon the
>> hash-array-of-spinlocks approach to scalability.
>>
>> If you need proof that in the long term it's suboptimal, note that:
>>
>> 1) this is Solaris's approach to locking scalability :-)
>
> :)
>
>> 2) every such case in the kernel eventually gets transformed into
>>    RCU, a tree/trie based scheme, or some combination of the two
>>
>> So maybe for now it's ok, but keep in mind that eventually
>> this is certain to change. :)
>
> This case might be different in that a normal firewall use case
> probably doesn't have more than 16 CPUs; even that would be quite
> a lot. So for bigger machines this is probably more about keeping
> the "non-use" costs low.
>
> I'll keep it in mind though, and I'm interested in seeing how it
> turns out in the long term :)

It doesn't help that spinlock_t keeps growing! In the good old days,
a spinlock could fit in one byte.

^ permalink raw reply	[flat|nested] 83+ messages in thread
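To illustrate the one-byte point: a minimal byte-wide test-and-set
spinlock can be sketched as below. This is an illustration using GCC
atomic builtins, not kernel code; it has none of the fairness,
debugging, or lockdep state that modern spinlock_t carries, which is
exactly why it can be this small:

	/* One-byte test-and-set spinlock, sketch only. */
	typedef struct { volatile unsigned char locked; } byte_lock_t;

	static inline void byte_lock(byte_lock_t *l)
	{
		/* __sync_lock_test_and_set() returns the previous value;
		 * loop until we are the ones who flipped 0 -> 1 */
		while (__sync_lock_test_and_set(&l->locked, 1))
			while (l->locked)
				;	/* spin read-only to limit bus traffic */
	}

	static inline void byte_unlock(byte_lock_t *l)
	{
		__sync_lock_release(&l->locked);	/* store 0, release semantics */
	}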
* Re: [RFT 0/4] Netfilter/iptables performance improvements
  2009-02-18  5:19 [RFT 0/4] Netfilter/iptables performance improvements Stephen Hemminger
                   ` (3 preceding siblings ...)
  2009-02-18  5:19 ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking Stephen Hemminger
@ 2009-02-18  8:30 ` Eric Dumazet
  4 siblings, 0 replies; 83+ messages in thread
From: Eric Dumazet @ 2009-02-18 8:30 UTC (permalink / raw)
  To: Stephen Hemminger
  Cc: David Miller, Patrick McHardy, Rick Jones, netdev, netfilter-devel

Stephen Hemminger wrote:
> Bring together the three performance improvements suggested.
> 1) RCU for ip_tables entries
> 2) mod_timer_noact for conntrack timer
> 3) eliminate tcp_lock
>
> I took the patches for 2 & 3 and made them build and basically work.
>
> This patch set is against Patrick's netfilter next tree since
> it is where it should end up.
> git.kernel.org:/pub/scm/linux/kernel/git/kaber/nf-next-2.6.git

Excellent Stephen, I'll give some time today to review this and test it
on my lab machines.

^ permalink raw reply	[flat|nested] 83+ messages in thread
end of thread, other threads: [~2009-04-06 17:25 UTC | newest]

Thread overview: 83+ messages:

2009-02-18  5:19 [RFT 0/4] Netfilter/iptables performance improvements Stephen Hemminger
2009-02-18  5:19 ` [RFT 1/4] iptables: lock free counters Stephen Hemminger
2009-02-18 10:02 ` Patrick McHardy
2009-02-19 19:47 ` [PATCH] " Stephen Hemminger
2009-02-19 23:46 ` Eric Dumazet
2009-02-19 23:56 ` Rick Jones
2009-02-20  1:03 ` Stephen Hemminger
2009-02-20  1:18 ` Rick Jones
2009-02-20  9:42 ` Patrick McHardy
2009-02-20 22:57 ` Rick Jones
2009-02-21  0:35 ` Rick Jones
2009-02-20  9:37 ` Patrick McHardy
2009-02-20 18:10 ` [PATCH] iptables: xt_hashlimit fix Eric Dumazet
2009-02-20 18:33 ` Jan Engelhardt
2009-02-28  1:54 ` Jan Engelhardt
2009-02-28  6:56 ` Eric Dumazet
2009-02-28  8:22 ` Jan Engelhardt
2009-02-24 14:31 ` Patrick McHardy
2009-02-27 14:02 ` [PATCH] iptables: lock free counters Eric Dumazet
2009-02-27 16:08 ` [PATCH] rcu: increment quiescent state counter in ksoftirqd() Eric Dumazet
2009-02-27 16:34 ` Paul E. McKenney
2009-03-02 10:55 ` [PATCH] iptables: lock free counters Patrick McHardy
2009-03-02 17:47 ` Eric Dumazet
2009-03-02 21:56 ` Patrick McHardy
2009-03-02 22:02 ` Stephen Hemminger
2009-03-02 22:07 ` Patrick McHardy
2009-03-02 22:17 ` Paul E. McKenney
2009-03-02 22:27 ` Eric Dumazet
2009-02-18  5:19 ` [RFT 2/4] Add mod_timer_noact Stephen Hemminger
2009-02-18  9:20 ` Ingo Molnar
2009-02-18  9:30 ` David Miller
2009-02-18 11:01 ` Ingo Molnar
2009-02-18 11:39 ` Jarek Poplawski
2009-02-18 12:37 ` Ingo Molnar
2009-02-18 12:33 ` Patrick McHardy
2009-02-18 21:39 ` David Miller
2009-02-18 21:51 ` Ingo Molnar
2009-02-18 22:04 ` David Miller
2009-02-18 22:42 ` Peter Zijlstra
2009-02-18 22:47 ` David Miller
2009-02-18 22:56 ` Stephen Hemminger
2009-02-18 10:07 ` Patrick McHardy
2009-02-18 12:05 ` [patch] timers: add mod_timer_pending() Ingo Molnar
2009-02-18 12:33 ` Patrick McHardy
2009-02-18 12:50 ` Ingo Molnar
2009-02-18 12:54 ` Patrick McHardy
2009-02-18 13:47 ` Ingo Molnar
2009-02-18 17:00 ` Oleg Nesterov
2009-02-18 18:23 ` Ingo Molnar
2009-02-18 18:58 ` Oleg Nesterov
2009-02-18 19:24 ` Ingo Molnar
2009-02-18 10:29 ` [RFT 2/4] Add mod_timer_noact Patrick McHardy
2009-02-18  5:19 ` [RFT 3/4] Use mod_timer_noact to remove nf_conntrack_lock Stephen Hemminger
2009-02-18  9:54 ` Patrick McHardy
2009-02-18 11:05 ` Jarek Poplawski
2009-02-18 11:08 ` Patrick McHardy
2009-02-18 14:01 ` Eric Dumazet
2009-02-18 14:04 ` Patrick McHardy
2009-02-18 14:22 ` Eric Dumazet
2009-02-18 14:27 ` Patrick McHardy
2009-02-18  5:19 ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking Stephen Hemminger
2009-02-18  9:56 ` Patrick McHardy
2009-02-18 14:17 ` Eric Dumazet
2009-02-19 22:03 ` Stephen Hemminger
2009-03-28 16:55 ` [PATCH] netfilter: finer grained nf_conn locking Eric Dumazet
2009-03-29  0:48 ` Stephen Hemminger
2009-03-30 19:57 ` Eric Dumazet
2009-03-30 20:05 ` Stephen Hemminger
2009-04-06 12:07 ` Patrick McHardy
2009-04-06 12:32 ` Jan Engelhardt
2009-04-06 17:25 ` Stephen Hemminger
2009-03-30 18:57 ` Rick Jones
2009-03-30 19:20 ` Eric Dumazet
2009-03-30 19:38 ` Jesper Dangaard Brouer
2009-03-30 19:54 ` Eric Dumazet
2009-03-30 20:34 ` Jesper Dangaard Brouer
2009-03-30 20:41 ` Eric Dumazet
2009-03-30 21:25 ` Jesper Dangaard Brouer
2009-03-30 22:44 ` Rick Jones
2009-02-18 21:55 ` [RFT 4/4] netfilter: Get rid of central rwlock in tcp conntracking David Miller
2009-02-18 23:23 ` Patrick McHardy
2009-02-18 23:35 ` Stephen Hemminger
2009-02-18  8:30 ` [RFT 0/4] Netfilter/iptables performance improvements Eric Dumazet