netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH, netfilter] NUMA aware ipv4/netfilter/ip_tables.c
@ 2005-09-19 17:09 Eric dumazet
  2005-09-19 17:20 ` Eric Dumazet
  2005-09-19 17:48 ` Andi Kleen
  0 siblings, 2 replies; 58+ messages in thread
From: Eric dumazet @ 2005-09-19 17:09 UTC (permalink / raw)
  To: netfilter-devel; +Cc: netdev

[-- Attachment #1: Type: text/plain, Size: 455 bytes --]

Hi

Part of the performance problem we have with netfilter  is memory 
allocation is not NUMA aware, but 'only' SMP aware.

What do you think of this patch ?

Note : The allocation function is quite complex (copied from 
drivers/pci/pci-driver.c pci_call_probe())
because current kernel doesnt have a NUMA aware vmalloc() wrapper, maybe 
a future kernel will export
such a common function ?

Thank you

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>



[-- Attachment #2: patch_ip_tables_numa --]
[-- Type: text/plain, Size: 9518 bytes --]

--- linux-2.6.14-rc1.p1/net/ipv4/netfilter/ip_tables.c	2005-09-19 19:56:12.000000000 +0200
+++ linux-2.6.14-rc1/net/ipv4/netfilter/ip_tables.c	2005-09-19 21:03:44.000000000 +0200
@@ -17,6 +17,7 @@
 #include <linux/skbuff.h>
 #include <linux/kmod.h>
 #include <linux/vmalloc.h>
+#include <linux/mempolicy.h>
 #include <linux/netdevice.h>
 #include <linux/module.h>
 #include <linux/tcp.h>
@@ -82,11 +83,6 @@
    context stops packets coming through and allows user context to read
    the counters or update the rules.
 
-   To be cache friendly on SMP, we arrange them like so:
-   [ n-entries ]
-   ... cache-align padding ...
-   [ n-entries ]
-
    Hence the start of any table is given by get_table() below.  */
 
 /* The table itself */
@@ -104,7 +100,8 @@
 	unsigned int underflow[NF_IP_NUMHOOKS];
 
 	/* ipt_entry tables: one per CPU */
-	char entries[0] ____cacheline_aligned;
+	void *entry0 ; /* entry for the first possible cpu */
+	void *entries[NR_CPUS];
 };
 
 static LIST_HEAD(ipt_target);
@@ -112,11 +109,6 @@
 static LIST_HEAD(ipt_tables);
 #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
 
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
 
 #if 0
 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
@@ -289,8 +281,7 @@
 
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	table_base = (void *)table->private->entries
-		+ TABLE_OFFSET(table->private, smp_processor_id());
+	table_base = (void *)table->private->entries[smp_processor_id()];
 	e = get_entry(table_base, table->private->hook_entry[hook]);
 
 #ifdef CONFIG_NETFILTER_DEBUG
@@ -571,7 +562,7 @@
 	for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct ipt_entry *e
-			= (struct ipt_entry *)(newinfo->entries + pos);
+			= (struct ipt_entry *)(newinfo->entry0 + pos);
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
@@ -621,13 +612,13 @@
 						goto next;
 
 					e = (struct ipt_entry *)
-						(newinfo->entries + pos);
+						(newinfo->entry0 + pos);
 				} while (oldpos == pos + e->next_offset);
 
 				/* Move along one */
 				size = e->next_offset;
 				e = (struct ipt_entry *)
-					(newinfo->entries + pos + size);
+					(newinfo->entry0 + pos + size);
 				e->counters.pcnt = pos;
 				pos += size;
 			} else {
@@ -644,7 +635,7 @@
 					newpos = pos + e->next_offset;
 				}
 				e = (struct ipt_entry *)
-					(newinfo->entries + newpos);
+					(newinfo->entry0 + newpos);
 				e->counters.pcnt = pos;
 				pos = newpos;
 			}
@@ -874,11 +865,11 @@
 	duprintf("translate_table: size %u\n", newinfo->size);
 	i = 0;
 	/* Walk through entries, checking offsets. */
-	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = IPT_ENTRY_ITERATE(newinfo->entry0, newinfo->size,
 				check_entry_size_and_hooks,
 				newinfo,
-				newinfo->entries,
-				newinfo->entries + size,
+				newinfo->entry0,
+				newinfo->entry0 + size,
 				hook_entries, underflows, &i);
 	if (ret != 0)
 		return ret;
@@ -911,20 +902,21 @@
 
 	/* Finally, each sanity check must pass */
 	i = 0;
-	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = IPT_ENTRY_ITERATE(newinfo->entry0, newinfo->size,
 				check_entry, name, size, &i);
 
 	if (ret != 0) {
-		IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+		IPT_ENTRY_ITERATE(newinfo->entry0, newinfo->size,
 				  cleanup_entry, &i);
 		return ret;
 	}
 
 	/* And one copy for every other CPU */
-	for (i = 1; i < num_possible_cpus(); i++) {
-		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
-		       newinfo->entries,
-		       SMP_ALIGN(newinfo->size));
+	for_each_cpu(i) {
+		if (newinfo->entries[i] != newinfo->entry0)
+			memcpy(newinfo->entries[i],
+				newinfo->entry0,
+				newinfo->size);
 	}
 
 	return ret;
@@ -940,15 +932,12 @@
 
 #ifdef CONFIG_NETFILTER_DEBUG
 	{
-		struct ipt_entry *table_base;
-		unsigned int i;
-
-		for (i = 0; i < num_possible_cpus(); i++) {
-			table_base =
-				(void *)newinfo->entries
-				+ TABLE_OFFSET(newinfo, i);
+		int cpu;
 
-			table_base->comefrom = 0xdead57ac;
+		for_each_cpu(cpu) {
+			struct ipt_entry *table_base = newinfo->entries[cpu];
+			if (table_base)
+				table_base->comefrom = 0xdead57ac;
 		}
 	}
 #endif
@@ -990,9 +979,9 @@
 	unsigned int cpu;
 	unsigned int i;
 
-	for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+	for_each_cpu(cpu) {
 		i = 0;
-		IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
@@ -1026,7 +1015,7 @@
 	write_unlock_bh(&table->lock);
 
 	/* ... then copy entire thing from CPU 0... */
-	if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+	if (copy_to_user(userptr, table->private->entry0, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
 	}
@@ -1038,7 +1027,7 @@
 		struct ipt_entry_match *m;
 		struct ipt_entry_target *t;
 
-		e = (struct ipt_entry *)(table->private->entries + off);
+		e = (struct ipt_entry *)(table->private->entry0 + off);
 		if (copy_to_user(userptr + off
 				 + offsetof(struct ipt_entry, counters),
 				 &counters[num],
@@ -1107,6 +1096,51 @@
 	return ret;
 }
 
+static void free_table_info(struct ipt_table_info *info)
+{
+	int cpu;
+	for_each_cpu(cpu)
+		vfree(info->entries[cpu]);
+	kfree(info);
+}
+
+static struct ipt_table_info *alloc_table_info(unsigned int size)
+{
+struct ipt_table_info *newinfo;
+int cpu;
+	newinfo = kmalloc(sizeof(struct ipt_table_info), GFP_KERNEL);
+	if (!newinfo)
+		return NULL;
+	memset(newinfo, 0, sizeof(*newinfo));
+	for_each_cpu(cpu) {
+#ifdef CONFIG_NUMA
+		struct mempolicy *oldpol;
+		cpumask_t oldmask = current->cpus_allowed;
+		int node = cpu_to_node(cpu);
+		if (node >= 0 && node_online(node))
+		    set_cpus_allowed(current, node_to_cpumask(node));
+		/* And set default memory allocation policy */
+		oldpol = current->mempolicy;
+		current->mempolicy = &default_policy;
+		mpol_get(current->mempolicy);
+#endif
+		newinfo->entries[cpu] = vmalloc(size);
+#ifdef CONFIG_NUMA
+		set_cpus_allowed(current, oldmask);
+		mpol_free(current->mempolicy);
+		current->mempolicy = oldpol;
+#endif
+		if (newinfo->entries[cpu] == 0) {
+			free_table_info(newinfo);
+			return NULL;
+		if (!newinfo->entry0)
+			newinfo->entry0 = newinfo->entries[cpu];
+		}
+	}
+	return newinfo;
+}
+
+
 static int
 do_replace(void __user *user, unsigned int len)
 {
@@ -1127,12 +1161,10 @@
 	if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
 		return -ENOMEM;
 
-	newinfo = vmalloc(sizeof(struct ipt_table_info)
-			  + SMP_ALIGN(tmp.size) * num_possible_cpus());
+	newinfo = alloc_table_info(tmp.size);
 	if (!newinfo)
 		return -ENOMEM;
-
-	if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+	if (copy_from_user(newinfo->entry0, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
 		goto free_newinfo;
@@ -1185,8 +1217,8 @@
 	/* Get the old counters. */
 	get_counters(oldinfo, counters);
 	/* Decrease module usage counts and free resource */
-	IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
-	vfree(oldinfo);
+	IPT_ENTRY_ITERATE(oldinfo->entry0, oldinfo->size, cleanup_entry,NULL);
+	free_table_info(oldinfo);
 	if (copy_to_user(tmp.counters, counters,
 			 sizeof(struct ipt_counters) * tmp.num_counters) != 0)
 		ret = -EFAULT;
@@ -1198,11 +1230,11 @@
 	module_put(t->me);
 	up(&ipt_mutex);
  free_newinfo_counters_untrans:
-	IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+	IPT_ENTRY_ITERATE(newinfo->entry0, newinfo->size, cleanup_entry,NULL);
  free_newinfo_counters:
 	vfree(counters);
  free_newinfo:
-	vfree(newinfo);
+	free_table_info(newinfo);
 	return ret;
 }
 
@@ -1264,7 +1296,7 @@
 	}
 
 	i = 0;
-	IPT_ENTRY_ITERATE(t->private->entries,
+	IPT_ENTRY_ITERATE(t->private->entry0,
 			  t->private->size,
 			  add_counter_to_entry,
 			  paddc->counters,
@@ -1454,15 +1486,20 @@
 {
 	int ret;
 	struct ipt_table_info *newinfo;
-	static struct ipt_table_info bootstrap
-		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	static struct ipt_table_info bootstrap = {
+		.size = 0,
+		.number = 0,
+		.initial_entries = 0,
+		.hook_entry = { 0 },
+		.underflow = { 0 },
+		.entry0 = NULL,
+		.entries = {NULL }
+		};
 
-	newinfo = vmalloc(sizeof(struct ipt_table_info)
-			  + SMP_ALIGN(repl->size) * num_possible_cpus());
+	newinfo = alloc_table_info(repl->size);
 	if (!newinfo)
 		return -ENOMEM;
-
-	memcpy(newinfo->entries, repl->entries, repl->size);
+	memcpy(newinfo->entry0, repl->entries, repl->size);
 
 	ret = translate_table(table->name, table->valid_hooks,
 			      newinfo, repl->size,
@@ -1470,13 +1507,13 @@
 			      repl->hook_entry,
 			      repl->underflow);
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
 
 	ret = down_interruptible(&ipt_mutex);
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
 
@@ -1505,7 +1542,7 @@
 	return ret;
 
  free_unlock:
-	vfree(newinfo);
+	free_table_info(newinfo);
 	goto unlock;
 }
 
@@ -1516,9 +1553,9 @@
 	up(&ipt_mutex);
 
 	/* Decrease module usage counts and free resources */
-	IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+	IPT_ENTRY_ITERATE(table->private->entry0, table->private->size,
 			  cleanup_entry, NULL);
-	vfree(table->private);
+	free_table_info(table->private);
 }
 
 /* Returns 1 if the port is matched by the range, 0 otherwise */

^ permalink raw reply	[flat|nested] 58+ messages in thread

end of thread, other threads:[~2005-11-25 18:20 UTC | newest]

Thread overview: 58+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2005-09-19 17:09 [PATCH, netfilter] NUMA aware ipv4/netfilter/ip_tables.c Eric dumazet
2005-09-19 17:20 ` Eric Dumazet
2005-09-19 17:48 ` Andi Kleen
2005-09-19 19:09   ` Eric Dumazet
2005-09-20  9:47   ` Eric Dumazet
2005-09-20 16:30     ` Andi Kleen
2005-09-20 17:02       ` Eric Dumazet
2005-09-20 21:45       ` [PATCH] Adds sys_set_mempolicy() in include/linux/syscalls.h , " Eric Dumazet
2005-09-20 21:46         ` [PATCH] Adds sys_set_mempolicy() in include/linux/syscalls.h Eric Dumazet
2005-09-21 21:24           ` [PATCH 0/3] netfilter : 3 patches to boost ip_tables performance Eric Dumazet
2005-09-21 22:43             ` Christoph Lameter
2005-09-22  0:34               ` David S. Miller
2005-09-22  1:44                 ` Christoph Lameter
2005-09-22 12:11                   ` Eric Dumazet
2005-09-22 12:49                     ` Christoph Hellwig
2005-09-22 12:54                       ` Andi Kleen
2005-09-22 12:58                         ` Christoph Hellwig
2005-09-22 13:05                           ` Andi Kleen
2005-09-22 15:37                             ` Christoph Lameter
2005-09-22 15:50                               ` Eric Dumazet
2005-09-22 15:55                                 ` Christoph Lameter
2005-09-23 17:11                                 ` Harald Welte
2005-09-23 17:44                                   ` Christoph Lameter
2005-09-23 18:04                                     ` Dave Hansen
2005-09-23 17:47                                   ` Eric Dumazet
2005-09-23 18:00                                     ` Kyle Moffett
2005-09-22  4:18             ` James Morris
2005-09-22  5:07               ` Eric Dumazet
2005-09-22 13:03             ` Andi Kleen
2005-09-22 13:30               ` Eric Dumazet
2005-09-23 17:09               ` Harald Welte
2005-09-27 16:23                 ` Andi Kleen
2005-09-28  0:25                   ` Henrik Nordstrom
2005-09-28  8:32                     ` Harald Welte
2005-09-28  8:37                       ` Andi Kleen
2005-10-04 17:01                         ` Patrick McHardy
2005-10-05 16:53                           ` Andi Kleen
2005-10-07  2:38                             ` Harald Welte
2005-10-06 17:59                               ` Andi Kleen
2005-10-07 17:08                                 ` Patrick McHardy
2005-10-07 17:21                                   ` Andi Kleen
2005-10-07 17:50                                     ` Patrick McHardy
2005-09-28 10:34                       ` Henrik Nordstrom
2005-11-25 11:23             ` [PATCH] netfilter : zap get_cpu()/put_cpu() calls from ip_tables Eric Dumazet
2005-11-25 11:28               ` [PATCH (resent with the attachment !)] " Eric Dumazet
2005-11-25 18:20                 ` Patrick McHardy
2005-09-21 21:29           ` [PATCH 1/3] netfilter : 3 patches to boost ip_tables performance Eric Dumazet
2005-09-22 12:57             ` Harald Welte
2005-09-22 13:17               ` Eric Dumazet
2005-09-21 21:32           ` [PATCH 2/3] " Eric Dumazet
2005-09-22 12:48             ` Harald Welte
2005-09-22 13:05               ` Eric Dumazet
2005-09-23  4:02                 ` Willy Tarreau
2005-09-23  5:14                   ` Eric Dumazet
2005-09-23 11:33                     ` Willy Tarreau
2005-09-23 14:00                   ` Tim Mattox
2005-09-21 21:37           ` [PATCH 3/3] " Eric Dumazet
2005-09-22 12:50             ` Harald Welte

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).