From: Eric dumazet <abuse@cosmosbay.com>
To: netfilter-devel@lists.netfilter.org
Cc: netdev@vger.kernel.org
Subject: [PATCH, netfilter] NUMA aware ipv4/netfilter/ip_tables.c
Date: Mon, 19 Sep 2005 19:09:25 +0200 [thread overview]
Message-ID: <432EF0C5.5090908@cosmosbay.com> (raw)
[-- Attachment #1: Type: text/plain, Size: 455 bytes --]
Hi
Part of the performance problem we have with netfilter is that memory
allocation is not NUMA aware, but 'only' SMP aware.
What do you think of this patch ?
Note: The allocation function is quite complex (copied from
drivers/pci/pci-driver.c pci_call_probe())
because the current kernel doesn't have a NUMA aware vmalloc() wrapper. Maybe
a future kernel will export
such a common function?
Thank you
Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
[-- Attachment #2: patch_ip_tables_numa --]
[-- Type: text/plain, Size: 9518 bytes --]
--- linux-2.6.14-rc1.p1/net/ipv4/netfilter/ip_tables.c 2005-09-19 19:56:12.000000000 +0200
+++ linux-2.6.14-rc1/net/ipv4/netfilter/ip_tables.c 2005-09-19 21:03:44.000000000 +0200
@@ -17,6 +17,7 @@
#include <linux/skbuff.h>
#include <linux/kmod.h>
#include <linux/vmalloc.h>
+#include <linux/mempolicy.h>
#include <linux/netdevice.h>
#include <linux/module.h>
#include <linux/tcp.h>
@@ -82,11 +83,6 @@
context stops packets coming through and allows user context to read
the counters or update the rules.
- To be cache friendly on SMP, we arrange them like so:
- [ n-entries ]
- ... cache-align padding ...
- [ n-entries ]
-
Hence the start of any table is given by get_table() below. */
/* The table itself */
@@ -104,7 +100,8 @@
unsigned int underflow[NF_IP_NUMHOOKS];
/* ipt_entry tables: one per CPU */
- char entries[0] ____cacheline_aligned;
+ void *entry0 ; /* entry for the first possible cpu */
+ void *entries[NR_CPUS];
};
static LIST_HEAD(ipt_target);
@@ -112,11 +109,6 @@
static LIST_HEAD(ipt_tables);
#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p) 0
-#endif
#if 0
#define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
@@ -289,8 +281,7 @@
read_lock_bh(&table->lock);
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
- table_base = (void *)table->private->entries
- + TABLE_OFFSET(table->private, smp_processor_id());
+ table_base = (void *)table->private->entries[smp_processor_id()];
e = get_entry(table_base, table->private->hook_entry[hook]);
#ifdef CONFIG_NETFILTER_DEBUG
@@ -571,7 +562,7 @@
for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
unsigned int pos = newinfo->hook_entry[hook];
struct ipt_entry *e
- = (struct ipt_entry *)(newinfo->entries + pos);
+ = (struct ipt_entry *)(newinfo->entry0 + pos);
if (!(valid_hooks & (1 << hook)))
continue;
@@ -621,13 +612,13 @@
goto next;
e = (struct ipt_entry *)
- (newinfo->entries + pos);
+ (newinfo->entry0 + pos);
} while (oldpos == pos + e->next_offset);
/* Move along one */
size = e->next_offset;
e = (struct ipt_entry *)
- (newinfo->entries + pos + size);
+ (newinfo->entry0 + pos + size);
e->counters.pcnt = pos;
pos += size;
} else {
@@ -644,7 +635,7 @@
newpos = pos + e->next_offset;
}
e = (struct ipt_entry *)
- (newinfo->entries + newpos);
+ (newinfo->entry0 + newpos);
e->counters.pcnt = pos;
pos = newpos;
}
@@ -874,11 +865,11 @@
duprintf("translate_table: size %u\n", newinfo->size);
i = 0;
/* Walk through entries, checking offsets. */
- ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IPT_ENTRY_ITERATE(newinfo->entry0, newinfo->size,
check_entry_size_and_hooks,
newinfo,
- newinfo->entries,
- newinfo->entries + size,
+ newinfo->entry0,
+ newinfo->entry0 + size,
hook_entries, underflows, &i);
if (ret != 0)
return ret;
@@ -911,20 +902,21 @@
/* Finally, each sanity check must pass */
i = 0;
- ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ ret = IPT_ENTRY_ITERATE(newinfo->entry0, newinfo->size,
check_entry, name, size, &i);
if (ret != 0) {
- IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+ IPT_ENTRY_ITERATE(newinfo->entry0, newinfo->size,
cleanup_entry, &i);
return ret;
}
/* And one copy for every other CPU */
- for (i = 1; i < num_possible_cpus(); i++) {
- memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
- newinfo->entries,
- SMP_ALIGN(newinfo->size));
+ for_each_cpu(i) {
+ if (newinfo->entries[i] != newinfo->entry0)
+ memcpy(newinfo->entries[i],
+ newinfo->entry0,
+ newinfo->size);
}
return ret;
@@ -940,15 +932,12 @@
#ifdef CONFIG_NETFILTER_DEBUG
{
- struct ipt_entry *table_base;
- unsigned int i;
-
- for (i = 0; i < num_possible_cpus(); i++) {
- table_base =
- (void *)newinfo->entries
- + TABLE_OFFSET(newinfo, i);
+ int cpu;
- table_base->comefrom = 0xdead57ac;
+ for_each_cpu(cpu) {
+ struct ipt_entry *table_base = newinfo->entries[cpu];
+ if (table_base)
+ table_base->comefrom = 0xdead57ac;
}
}
#endif
@@ -990,9 +979,9 @@
unsigned int cpu;
unsigned int i;
- for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+ for_each_cpu(cpu) {
i = 0;
- IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+ IPT_ENTRY_ITERATE(t->entries[cpu],
t->size,
add_entry_to_counter,
counters,
@@ -1026,7 +1015,7 @@
write_unlock_bh(&table->lock);
/* ... then copy entire thing from CPU 0... */
- if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+ if (copy_to_user(userptr, table->private->entry0, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
}
@@ -1038,7 +1027,7 @@
struct ipt_entry_match *m;
struct ipt_entry_target *t;
- e = (struct ipt_entry *)(table->private->entries + off);
+ e = (struct ipt_entry *)(table->private->entry0 + off);
if (copy_to_user(userptr + off
+ offsetof(struct ipt_entry, counters),
&counters[num],
@@ -1107,6 +1096,51 @@
return ret;
}
+/*
+ * Release a table descriptor: vfree() the rule copy held in
+ * info->entries[] for each CPU visited by for_each_cpu(), then
+ * kfree() the descriptor itself.
+ */
+static void free_table_info(struct ipt_table_info *info)
+{
+	int i;
+
+	for_each_cpu(i) {
+		vfree(info->entries[i]);
+	}
+	kfree(info);
+}
+
+/*
+ * Allocate a table descriptor plus one vmalloc()ed rule area of 'size'
+ * bytes for every CPU visited by for_each_cpu().
+ *
+ * With CONFIG_NUMA, each area is allocated while 'current' is
+ * temporarily restricted to the CPUs of the target CPU's node and is
+ * using the default memory policy; both settings are restored right
+ * after the vmalloc().  The intent is for each CPU's copy to live in
+ * node-local memory.
+ *
+ * entry0 is set to the first area allocated.
+ *
+ * Returns the new descriptor, or NULL if any allocation fails (in which
+ * case everything allocated so far has been released).
+ */
+static struct ipt_table_info *alloc_table_info(unsigned int size)
+{
+	struct ipt_table_info *newinfo;
+	int cpu;
+
+	newinfo = kmalloc(sizeof(*newinfo), GFP_KERNEL);
+	if (!newinfo)
+		return NULL;
+	/* entry0 and every entries[] slot start out NULL, so the error
+	 * path below can free a partially filled descriptor. */
+	memset(newinfo, 0, sizeof(*newinfo));
+
+	for_each_cpu(cpu) {
+#ifdef CONFIG_NUMA
+		struct mempolicy *oldpol;
+		cpumask_t oldmask = current->cpus_allowed;
+		int node = cpu_to_node(cpu);
+
+		if (node >= 0 && node_online(node))
+			set_cpus_allowed(current, node_to_cpumask(node));
+		/* And set default memory allocation policy */
+		oldpol = current->mempolicy;
+		current->mempolicy = &default_policy;
+		mpol_get(current->mempolicy);
+#endif
+		newinfo->entries[cpu] = vmalloc(size);
+#ifdef CONFIG_NUMA
+		set_cpus_allowed(current, oldmask);
+		mpol_free(current->mempolicy);
+		current->mempolicy = oldpol;
+#endif
+		if (!newinfo->entries[cpu]) {
+			free_table_info(newinfo);
+			return NULL;
+		}
+		/* Remember the first area allocated; this assignment must sit
+		 * outside the failure branch above or entry0 stays NULL. */
+		if (!newinfo->entry0)
+			newinfo->entry0 = newinfo->entries[cpu];
+	}
+	return newinfo;
+}
+
+
static int
do_replace(void __user *user, unsigned int len)
{
@@ -1127,12 +1161,10 @@
if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
return -ENOMEM;
- newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(tmp.size) * num_possible_cpus());
+ newinfo = alloc_table_info(tmp.size);
if (!newinfo)
return -ENOMEM;
-
- if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+ if (copy_from_user(newinfo->entry0, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -1185,8 +1217,8 @@
/* Get the old counters. */
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
- vfree(oldinfo);
+ IPT_ENTRY_ITERATE(oldinfo->entry0, oldinfo->size, cleanup_entry,NULL);
+ free_table_info(oldinfo);
if (copy_to_user(tmp.counters, counters,
sizeof(struct ipt_counters) * tmp.num_counters) != 0)
ret = -EFAULT;
@@ -1198,11 +1230,11 @@
module_put(t->me);
up(&ipt_mutex);
free_newinfo_counters_untrans:
- IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+ IPT_ENTRY_ITERATE(newinfo->entry0, newinfo->size, cleanup_entry,NULL);
free_newinfo_counters:
vfree(counters);
free_newinfo:
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1264,7 +1296,7 @@
}
i = 0;
- IPT_ENTRY_ITERATE(t->private->entries,
+ IPT_ENTRY_ITERATE(t->private->entry0,
t->private->size,
add_counter_to_entry,
paddc->counters,
@@ -1454,15 +1486,20 @@
{
int ret;
struct ipt_table_info *newinfo;
- static struct ipt_table_info bootstrap
- = { 0, 0, 0, { 0 }, { 0 }, { } };
+ static struct ipt_table_info bootstrap = {
+ .size = 0,
+ .number = 0,
+ .initial_entries = 0,
+ .hook_entry = { 0 },
+ .underflow = { 0 },
+ .entry0 = NULL,
+ .entries = {NULL }
+ };
- newinfo = vmalloc(sizeof(struct ipt_table_info)
- + SMP_ALIGN(repl->size) * num_possible_cpus());
+ newinfo = alloc_table_info(repl->size);
if (!newinfo)
return -ENOMEM;
-
- memcpy(newinfo->entries, repl->entries, repl->size);
+ memcpy(newinfo->entry0, repl->entries, repl->size);
ret = translate_table(table->name, table->valid_hooks,
newinfo, repl->size,
@@ -1470,13 +1507,13 @@
repl->hook_entry,
repl->underflow);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
ret = down_interruptible(&ipt_mutex);
if (ret != 0) {
- vfree(newinfo);
+ free_table_info(newinfo);
return ret;
}
@@ -1505,7 +1542,7 @@
return ret;
free_unlock:
- vfree(newinfo);
+ free_table_info(newinfo);
goto unlock;
}
@@ -1516,9 +1553,9 @@
up(&ipt_mutex);
/* Decrease module usage counts and free resources */
- IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+ IPT_ENTRY_ITERATE(table->private->entry0, table->private->size,
cleanup_entry, NULL);
- vfree(table->private);
+ free_table_info(table->private);
}
/* Returns 1 if the port is matched by the range, 0 otherwise */
next reply other threads:[~2005-09-19 17:09 UTC|newest]
Thread overview: 58+ messages / expand[flat|nested] mbox.gz Atom feed top
2005-09-19 17:09 Eric dumazet [this message]
2005-09-19 17:20 ` [PATCH, netfilter] NUMA aware ipv4/netfilter/ip_tables.c Eric Dumazet
2005-09-19 17:48 ` Andi Kleen
2005-09-19 19:09 ` Eric Dumazet
2005-09-20 9:47 ` Eric Dumazet
2005-09-20 16:30 ` Andi Kleen
2005-09-20 17:02 ` Eric Dumazet
2005-09-20 21:45 ` [PATCH] Adds sys_set_mempolicy() in include/linux/syscalls.h , " Eric Dumazet
2005-09-20 21:46 ` [PATCH] Adds sys_set_mempolicy() in include/linux/syscalls.h Eric Dumazet
2005-09-21 21:24 ` [PATCH 0/3] netfilter : 3 patches to boost ip_tables performance Eric Dumazet
2005-09-21 22:43 ` Christoph Lameter
2005-09-22 0:34 ` David S. Miller
2005-09-22 1:44 ` Christoph Lameter
2005-09-22 12:11 ` Eric Dumazet
2005-09-22 12:49 ` Christoph Hellwig
2005-09-22 12:54 ` Andi Kleen
2005-09-22 12:58 ` Christoph Hellwig
2005-09-22 13:05 ` Andi Kleen
2005-09-22 15:37 ` Christoph Lameter
2005-09-22 15:50 ` Eric Dumazet
2005-09-22 15:55 ` Christoph Lameter
2005-09-23 17:11 ` Harald Welte
2005-09-23 17:44 ` Christoph Lameter
2005-09-23 18:04 ` Dave Hansen
2005-09-23 17:47 ` Eric Dumazet
2005-09-23 18:00 ` Kyle Moffett
2005-09-22 4:18 ` James Morris
2005-09-22 5:07 ` Eric Dumazet
2005-09-22 13:03 ` Andi Kleen
2005-09-22 13:30 ` Eric Dumazet
2005-09-23 17:09 ` Harald Welte
2005-09-27 16:23 ` Andi Kleen
2005-09-28 0:25 ` Henrik Nordstrom
2005-09-28 8:32 ` Harald Welte
2005-09-28 8:37 ` Andi Kleen
2005-10-04 17:01 ` Patrick McHardy
2005-10-05 16:53 ` Andi Kleen
2005-10-07 2:38 ` Harald Welte
2005-10-06 17:59 ` Andi Kleen
2005-10-07 17:08 ` Patrick McHardy
2005-10-07 17:21 ` Andi Kleen
2005-10-07 17:50 ` Patrick McHardy
2005-09-28 10:34 ` Henrik Nordstrom
2005-11-25 11:23 ` [PATCH] netfilter : zap get_cpu()/put_cpu() calls from ip_tables Eric Dumazet
2005-11-25 11:28 ` [PATCH (resent with the attachment !)] " Eric Dumazet
2005-11-25 18:20 ` Patrick McHardy
2005-09-21 21:29 ` [PATCH 1/3] netfilter : 3 patches to boost ip_tables performance Eric Dumazet
2005-09-22 12:57 ` Harald Welte
2005-09-22 13:17 ` Eric Dumazet
2005-09-21 21:32 ` [PATCH 2/3] " Eric Dumazet
2005-09-22 12:48 ` Harald Welte
2005-09-22 13:05 ` Eric Dumazet
2005-09-23 4:02 ` Willy Tarreau
2005-09-23 5:14 ` Eric Dumazet
2005-09-23 11:33 ` Willy Tarreau
2005-09-23 14:00 ` Tim Mattox
2005-09-21 21:37 ` [PATCH 3/3] " Eric Dumazet
2005-09-22 12:50 ` Harald Welte
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=432EF0C5.5090908@cosmosbay.com \
--to=abuse@cosmosbay.com \
--cc=netdev@vger.kernel.org \
--cc=netfilter-devel@lists.netfilter.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).