[PATCH 3/3] netfilter : 3 patches to boost ip_tables performance

netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed

From: Eric Dumazet <dada1@cosmosbay.com>
To: linux-kernel@vger.kernel.org,
	netfilter-devel@lists.netfilter.org, netdev@vger.kernel.org
Cc: Andi Kleen <ak@suse.de>
Subject: [PATCH 3/3] netfilter : 3 patches to boost ip_tables performance
Date: Wed, 21 Sep 2005 23:37:20 +0200	[thread overview]
Message-ID: <4331D290.6070104@cosmosbay.com> (raw)
In-Reply-To: <43308324.70403@cosmosbay.com>

[-- Attachment #1: Type: text/plain, Size: 1068 bytes --]

Patch 3/3 (please apply after Patch 2/3)

3) NUMA allocation.

Part of the performance problem we have with netfilter is memory allocation
is not NUMA aware, but 'only' SMP aware (ie each CPU normally touch
separate cache lines)

Even with small iptables rules, the cost of this misplacement can be high
  on common workloads.

Instead of using one vmalloc() area (located in the node of the iptables
process), we now allocate an area for each possible CPU, using NUMA policy
(MPOL_PREFERRED) so that memory should be allocated in the CPU's node
if possible.

If the size of ipt_table is small enough (less than one page), we use
kmalloc_node() instead of vmalloc(), to use less memory and less TLB entries)
in small setups.

Please note that this patch doesnt change the number of allocated bytes, only 
the location of allocated zones.

Note2 : This patch depends on another patch that declares sys_set_mempolicy()
in include/linux/syscalls.h
( http://marc.theaimsgroup.com/?l=linux-kernel&m=112725288622984&w=2 )

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>


[-- Attachment #2: patch_ip_tables_numa.3 --]
[-- Type: text/plain, Size: 12981 bytes --]

--- linux-2.6/net/ipv4/netfilter/ip_tables.c	2005-09-22 00:44:34.000000000 +0200
+++ linux-2.6-ed/net/ipv4/netfilter/ip_tables.c	2005-09-22 00:57:15.000000000 +0200
@@ -17,6 +17,8 @@
 #include <linux/skbuff.h>
 #include <linux/kmod.h>
 #include <linux/vmalloc.h>
+#include <linux/mempolicy.h>
+#include <linux/syscalls.h>
 #include <linux/netdevice.h>
 #include <linux/module.h>
 #include <linux/tcp.h>
@@ -82,11 +84,6 @@
    context stops packets coming through and allows user context to read
    the counters or update the rules.
 
-   To be cache friendly on SMP, we arrange them like so:
-   [ n-entries ]
-   ... cache-align padding ...
-   [ n-entries ]
-
    Hence the start of any table is given by get_table() below.  */
 
 /* The table itself */
@@ -104,7 +101,7 @@
 	unsigned int underflow[NF_IP_NUMHOOKS];
 
 	/* ipt_entry tables: one per CPU */
-	char entries[0] ____cacheline_aligned;
+	void *entries[NR_CPUS];
 };
 
 static LIST_HEAD(ipt_target);
@@ -113,12 +110,6 @@
 #define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
 #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
 
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
-
 #if 0
 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
 #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; })
@@ -331,8 +322,7 @@
 	read_lock_bh(&table->lock);
 #endif
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	table_base = (void *)table->private->entries
-		+ TABLE_OFFSET(table->private, smp_processor_id());
+	table_base = (void *)table->private->entries[smp_processor_id()];
 	e = get_entry(table_base, table->private->hook_entry[hook]);
 
 #ifdef CONFIG_NETFILTER_DEBUG
@@ -608,7 +598,7 @@
 /* Figures out from what hook each rule can be called: returns 0 if
    there are loops.  Puts hook bitmask in comefrom. */
 static int
-mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
+mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks, void *entry0)
 {
 	unsigned int hook;
 
@@ -617,7 +607,7 @@
 	for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct ipt_entry *e
-			= (struct ipt_entry *)(newinfo->entries + pos);
+			= (struct ipt_entry *)(entry0 + pos);
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
@@ -667,13 +657,13 @@
 						goto next;
 
 					e = (struct ipt_entry *)
-						(newinfo->entries + pos);
+						(entry0 + pos);
 				} while (oldpos == pos + e->next_offset);
 
 				/* Move along one */
 				size = e->next_offset;
 				e = (struct ipt_entry *)
-					(newinfo->entries + pos + size);
+					(entry0 + pos + size);
 				e->counters.pcnt = pos;
 				pos += size;
 			} else {
@@ -690,7 +680,7 @@
 					newpos = pos + e->next_offset;
 				}
 				e = (struct ipt_entry *)
-					(newinfo->entries + newpos);
+					(entry0 + newpos);
 				e->counters.pcnt = pos;
 				pos = newpos;
 			}
@@ -900,6 +890,7 @@
 translate_table(const char *name,
 		unsigned int valid_hooks,
 		struct ipt_table_info *newinfo,
+		void *entry0,
 		unsigned int size,
 		unsigned int number,
 		const unsigned int *hook_entries,
@@ -920,11 +911,11 @@
 	duprintf("translate_table: size %u\n", newinfo->size);
 	i = 0;
 	/* Walk through entries, checking offsets. */
-	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
 				check_entry_size_and_hooks,
 				newinfo,
-				newinfo->entries,
-				newinfo->entries + size,
+				entry0,
+				entry0 + size,
 				hook_entries, underflows, &i);
 	if (ret != 0)
 		return ret;
@@ -952,25 +943,24 @@
 		}
 	}
 
-	if (!mark_source_chains(newinfo, valid_hooks))
+	if (!mark_source_chains(newinfo, valid_hooks, entry0))
 		return -ELOOP;
 
 	/* Finally, each sanity check must pass */
 	i = 0;
-	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
 				check_entry, name, size, &i);
 
 	if (ret != 0) {
-		IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+		IPT_ENTRY_ITERATE(entry0, newinfo->size,
 				  cleanup_entry, &i);
 		return ret;
 	}
 
 	/* And one copy for every other CPU */
-	for (i = 1; i < num_possible_cpus(); i++) {
-		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
-		       newinfo->entries,
-		       SMP_ALIGN(newinfo->size));
+	for_each_cpu(i) {
+		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+			memcpy(newinfo->entries[i], entry0, newinfo->size);
 	}
 
 	return ret;
@@ -1010,15 +1000,12 @@
 
 #ifdef CONFIG_NETFILTER_DEBUG
 	{
-		struct ipt_entry *table_base;
-		unsigned int i;
+		int cpu;
 
-		for (i = 0; i < num_possible_cpus(); i++) {
-			table_base =
-				(void *)newinfo->entries
-				+ TABLE_OFFSET(newinfo, i);
-
-			table_base->comefrom = 0xdead57ac;
+		for_each_cpu(cpu) {
+			struct ipt_entry *table_base = newinfo->entries[cpu];
+			if (table_base)
+				table_base->comefrom = 0xdead57ac;
 		}
 	}
 #endif
@@ -1083,7 +1070,7 @@
 	if (table)
 		write_lock_bh(per_cpu_ptr(table->lock_p, curcpu));
 	i = 0;
-	IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, curcpu),
+	IPT_ENTRY_ITERATE(t->entries[curcpu],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
@@ -1097,7 +1084,7 @@
 		if (table)
 			write_lock_bh(per_cpu_ptr(table->lock_p, cpu));
 		i = 0;
-		IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
@@ -1110,7 +1097,7 @@
 	if (table)
 		write_lock_bh(&table->lock);
 	i = 0;
-	IPT_ENTRY_ITERATE(t->entries,
+	IPT_ENTRY_ITERATE(t->entries[0],
 			  t->size,
 			  set_entry_to_counter,
 			  counters,
@@ -1129,6 +1116,7 @@
 	struct ipt_entry *e;
 	struct ipt_counters *counters;
 	int ret = 0;
+	void *loc_cpu_entry;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1142,8 +1130,12 @@
 	/* First, sum counters... */
 	get_counters(table, table->private, counters);
 
-	/* ... then copy entire thing from CPU 0... */
-	if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+	/*
+	 * choose the copy that is on our node/cpu,
+	 */
+	loc_cpu_entry = table->private->entries[get_cpu()];
+	/* ... then copy entire thing ... */
+	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
 	}
@@ -1155,7 +1147,7 @@
 		struct ipt_entry_match *m;
 		struct ipt_entry_target *t;
 
-		e = (struct ipt_entry *)(table->private->entries + off);
+		e = (struct ipt_entry *)(loc_cpu_entry + off);
 		if (copy_to_user(userptr + off
 				 + offsetof(struct ipt_entry, counters),
 				 &counters[num],
@@ -1192,6 +1184,7 @@
 	}
 
  free_counters:
+	put_cpu();
 	vfree(counters);
 	return ret;
 }
@@ -1224,6 +1217,60 @@
 	return ret;
 }
 
+static void free_table_info(struct ipt_table_info *info)
+{
+	int cpu;
+	for_each_cpu(cpu) {
+		if (info->size <= PAGE_SIZE)
+			kfree(info->entries[cpu]);
+		else
+			vfree(info->entries[cpu]);
+	}
+	kfree(info);
+}
+
+static struct ipt_table_info *alloc_table_info(unsigned int size)
+{
+struct ipt_table_info *newinfo;
+int cpu;
+	newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL);
+	if (!newinfo)
+		return NULL;
+	newinfo->size = size;
+	for_each_cpu(cpu) {
+		if (size <= PAGE_SIZE) {
+			newinfo->entries[cpu] = kmalloc_node(size,
+				GFP_KERNEL,
+				cpu_to_node(cpu));
+		} else {
+#ifdef CONFIG_NUMA
+			struct mempolicy *oldpol;
+			mm_segment_t oldfs = get_fs();
+			DECLARE_BITMAP(mynode, MAX_NUMNODES);
+
+			oldpol = current->mempolicy;
+			mpol_get(oldpol);
+			bitmap_zero(mynode, MAX_NUMNODES);
+			set_bit(cpu_to_node(cpu), mynode);
+			set_fs(KERNEL_DS);
+			sys_set_mempolicy(MPOL_PREFERRED, mynode, MAX_NUMNODES);
+			set_fs(oldfs);
+#endif
+			newinfo->entries[cpu] = vmalloc(size);
+#ifdef CONFIG_NUMA
+			mpol_free(current->mempolicy);
+			current->mempolicy = oldpol;
+#endif
+		}
+		if (newinfo->entries[cpu] == 0) {
+			free_table_info(newinfo);
+			return NULL;
+		}
+	}
+	return newinfo;
+}
+
+
 static int
 do_replace(void __user *user, unsigned int len)
 {
@@ -1232,6 +1279,7 @@
 	struct ipt_table *t;
 	struct ipt_table_info *newinfo, *oldinfo;
 	struct ipt_counters *counters;
+	void *loc_cpu_entry, *loc_cpu_old_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
@@ -1244,12 +1292,14 @@
 	if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
 		return -ENOMEM;
 
-	newinfo = vmalloc(sizeof(struct ipt_table_info)
-			  + SMP_ALIGN(tmp.size) * num_possible_cpus());
+	newinfo = alloc_table_info(tmp.size);
 	if (!newinfo)
 		return -ENOMEM;
-
-	if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+	/*
+	 * choose the copy that is on our node/cpu
+	 */
+	loc_cpu_entry = newinfo->entries[get_cpu()];
+	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
 		goto free_newinfo;
@@ -1260,10 +1310,9 @@
 		ret = -ENOMEM;
 		goto free_newinfo;
 	}
-	memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
 
 	ret = translate_table(tmp.name, tmp.valid_hooks,
-			      newinfo, tmp.size, tmp.num_entries,
+			      newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
 			      tmp.hook_entry, tmp.underflow);
 	if (ret != 0)
 		goto free_newinfo_counters;
@@ -1302,8 +1351,10 @@
 	/* Get the old counters. */
 	get_counters(NULL, oldinfo, counters);
 	/* Decrease module usage counts and free resource */
-	IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
-	vfree(oldinfo);
+	loc_cpu_old_entry = oldinfo->entries[smp_processor_id()];
+	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL);
+	put_cpu();
+	free_table_info(oldinfo);
 	if (copy_to_user(tmp.counters, counters,
 			 sizeof(struct ipt_counters) * tmp.num_counters) != 0)
 		ret = -EFAULT;
@@ -1315,11 +1366,12 @@
 	module_put(t->me);
 	up(&ipt_mutex);
  free_newinfo_counters_untrans:
-	IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+	IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL);
  free_newinfo_counters:
 	vfree(counters);
  free_newinfo:
-	vfree(newinfo);
+	put_cpu();
+	free_table_info(newinfo);
 	return ret;
 }
 
@@ -1352,6 +1404,7 @@
 	struct ipt_counters_info tmp, *paddc;
 	struct ipt_table *t;
 	int ret = 0;
+	void *loc_cpu_entry;
 #ifdef CONFIG_SMP
 	rwlock_t *lockp;
 #endif
@@ -1389,7 +1442,11 @@
 	}
 
 	i = 0;
-	IPT_ENTRY_ITERATE(t->private->entries,
+	/*
+	 * choose the copy that is on our node,
+	 */
+	loc_cpu_entry = t->private->entries[smp_processor_id()];
+	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  t->private->size,
 			  add_counter_to_entry,
 			  paddc->counters,
@@ -1584,8 +1641,15 @@
 {
 	int ret;
 	struct ipt_table_info *newinfo;
-	static struct ipt_table_info bootstrap
-		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	static struct ipt_table_info bootstrap = {
+		.size = 0,
+		.number = 0,
+		.initial_entries = 0,
+		.hook_entry = { 0 },
+		.underflow = { 0 },
+		.entries = {NULL }
+		};
+	void *loc_cpu_entry;
 #ifdef CONFIG_SMP
 	int cpu;
 	if (!table->lock_p) {
@@ -1597,26 +1661,30 @@
 #else
 	rwlock_init(&table->lock);
 #endif
-	newinfo = vmalloc(sizeof(struct ipt_table_info)
-			  + SMP_ALIGN(repl->size) * num_possible_cpus());
+
+	newinfo = alloc_table_info(repl->size);
 	if (!newinfo)
 		return -ENOMEM;
-
-	memcpy(newinfo->entries, repl->entries, repl->size);
+	/*
+	 * choose the copy that is on our node/cpu
+	 */
+	loc_cpu_entry = newinfo->entries[get_cpu()];
+	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(table->name, table->valid_hooks,
-			      newinfo, repl->size,
+			      newinfo, loc_cpu_entry, repl->size,
 			      repl->num_entries,
 			      repl->hook_entry,
 			      repl->underflow);
+	put_cpu();
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
 
 	ret = down_interruptible(&ipt_mutex);
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
 
@@ -1644,20 +1712,25 @@
 	return ret;
 
  free_unlock:
-	vfree(newinfo);
+	free_table_info(newinfo);
 	goto unlock;
 }
 
 void ipt_unregister_table(struct ipt_table *table)
 {
+	void *loc_cpu_entry;
 	down(&ipt_mutex);
 	LIST_DELETE(&ipt_tables, table);
 	up(&ipt_mutex);
 
-	/* Decrease module usage counts and free resources */
-	IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+	/* Decrease module usage counts and free resources
+	 * choose the copy that is on our node/cpu
+	 */
+	loc_cpu_entry = table->private->entries[get_cpu()];
+	IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
 			  cleanup_entry, NULL);
-	vfree(table->private);
+	put_cpu();
+	free_table_info(table->private);
 #ifdef CONFIG_SMP
 	free_percpu(table->lock_p);
 	table->lock_p = NULL;

next prev parent reply	other threads:[~2005-09-21 21:37 UTC|newest]

Thread overview: 58+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2005-09-19 17:09 [PATCH, netfilter] NUMA aware ipv4/netfilter/ip_tables.c Eric dumazet
2005-09-19 17:20 ` Eric Dumazet
2005-09-19 17:48 ` Andi Kleen
2005-09-19 19:09   ` Eric Dumazet
2005-09-20  9:47   ` Eric Dumazet
2005-09-20 16:30     ` Andi Kleen
2005-09-20 17:02       ` Eric Dumazet
2005-09-20 21:45       ` [PATCH] Adds sys_set_mempolicy() in include/linux/syscalls.h , " Eric Dumazet
2005-09-20 21:46         ` [PATCH] Adds sys_set_mempolicy() in include/linux/syscalls.h Eric Dumazet
2005-09-21 21:24           ` [PATCH 0/3] netfilter : 3 patches to boost ip_tables performance Eric Dumazet
2005-09-21 22:43             ` Christoph Lameter
2005-09-22  0:34               ` David S. Miller
2005-09-22  1:44                 ` Christoph Lameter
2005-09-22 12:11                   ` Eric Dumazet
2005-09-22 12:49                     ` Christoph Hellwig
2005-09-22 12:54                       ` Andi Kleen
2005-09-22 12:58                         ` Christoph Hellwig
2005-09-22 13:05                           ` Andi Kleen
2005-09-22 15:37                             ` Christoph Lameter
2005-09-22 15:50                               ` Eric Dumazet
2005-09-22 15:55                                 ` Christoph Lameter
2005-09-23 17:11                                 ` Harald Welte
2005-09-23 17:44                                   ` Christoph Lameter
2005-09-23 18:04                                     ` Dave Hansen
2005-09-23 17:47                                   ` Eric Dumazet
2005-09-23 18:00                                     ` Kyle Moffett
2005-09-22  4:18             ` James Morris
2005-09-22  5:07               ` Eric Dumazet
2005-09-22 13:03             ` Andi Kleen
2005-09-22 13:30               ` Eric Dumazet
2005-09-23 17:09               ` Harald Welte
2005-09-27 16:23                 ` Andi Kleen
2005-09-28  0:25                   ` Henrik Nordstrom
2005-09-28  8:32                     ` Harald Welte
2005-09-28  8:37                       ` Andi Kleen
2005-10-04 17:01                         ` Patrick McHardy
2005-10-05 16:53                           ` Andi Kleen
2005-10-07  2:38                             ` Harald Welte
2005-10-06 17:59                               ` Andi Kleen
2005-10-07 17:08                                 ` Patrick McHardy
2005-10-07 17:21                                   ` Andi Kleen
2005-10-07 17:50                                     ` Patrick McHardy
2005-09-28 10:34                       ` Henrik Nordstrom
2005-11-25 11:23             ` [PATCH] netfilter : zap get_cpu()/put_cpu() calls from ip_tables Eric Dumazet
2005-11-25 11:28               ` [PATCH (resent with the attachment !)] " Eric Dumazet
2005-11-25 18:20                 ` Patrick McHardy
2005-09-21 21:29           ` [PATCH 1/3] netfilter : 3 patches to boost ip_tables performance Eric Dumazet
2005-09-22 12:57             ` Harald Welte
2005-09-22 13:17               ` Eric Dumazet
2005-09-21 21:32           ` [PATCH 2/3] " Eric Dumazet
2005-09-22 12:48             ` Harald Welte
2005-09-22 13:05               ` Eric Dumazet
2005-09-23  4:02                 ` Willy Tarreau
2005-09-23  5:14                   ` Eric Dumazet
2005-09-23 11:33                     ` Willy Tarreau
2005-09-23 14:00                   ` Tim Mattox
2005-09-21 21:37           ` Eric Dumazet [this message]
2005-09-22 12:50             ` [PATCH 3/3] " Harald Welte

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4331D290.6070104@cosmosbay.com \
    --to=dada1@cosmosbay.com \
    --cc=ak@suse.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=netfilter-devel@lists.netfilter.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).