--- linux-2.6.14-rc1.p1/net/ipv4/netfilter/ip_tables.c	2005-09-19 19:56:12.000000000 +0200
+++ linux-2.6.14-rc1/net/ipv4/netfilter/ip_tables.c	2005-09-20 13:41:37.000000000 +0200
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include <linux/mempolicy.h>
 #include
 #include
 #include
@@ -82,11 +83,6 @@
    context stops packets coming through and allows user context to read
    the counters or update the rules.
 
-   To be cache friendly on SMP, we arrange them like so:
-   [ n-entries ]
-   ... cache-align padding ...
-   [ n-entries ]
-
    Hence the start of any table is given by get_table() below.  */
 
 /* The table itself */
@@ -104,19 +100,14 @@
 	unsigned int underflow[NF_IP_NUMHOOKS];
 
 	/* ipt_entry tables: one per CPU */
-	char entries[0] ____cacheline_aligned;
+	void *entries[NR_CPUS];
 };
 
 static LIST_HEAD(ipt_target);
 static LIST_HEAD(ipt_match);
 static LIST_HEAD(ipt_tables);
 #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
-
-#ifdef CONFIG_SMP
-#define TABLE_OFFSET(t,p)	(SMP_ALIGN((t)->size)*(p))
-#else
-#define TABLE_OFFSET(t,p)	0
-#endif
+#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0)
 
 #if 0
 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0)
@@ -289,8 +280,7 @@
 
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
-	table_base = (void *)table->private->entries
-		+ TABLE_OFFSET(table->private, smp_processor_id());
+	table_base = (void *)table->private->entries[smp_processor_id()];
 	e = get_entry(table_base, table->private->hook_entry[hook]);
 
 #ifdef CONFIG_NETFILTER_DEBUG
@@ -562,7 +552,7 @@
 /* Figures out from what hook each rule can be called: returns 0 if
    there are loops.  Puts hook bitmask in comefrom. */
 static int
-mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks)
+mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks, void *entry0)
 {
 	unsigned int hook;
 
@@ -571,7 +561,7 @@
 	for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) {
 		unsigned int pos = newinfo->hook_entry[hook];
 		struct ipt_entry *e
-			= (struct ipt_entry *)(newinfo->entries + pos);
+			= (struct ipt_entry *)(entry0 + pos);
 
 		if (!(valid_hooks & (1 << hook)))
 			continue;
@@ -621,13 +611,13 @@
 					goto next;
 
 				e = (struct ipt_entry *)
-					(newinfo->entries + pos);
+					(entry0 + pos);
 			} while (oldpos == pos + e->next_offset);
 
 			/* Move along one */
 			size = e->next_offset;
 			e = (struct ipt_entry *)
-				(newinfo->entries + pos + size);
+				(entry0 + pos + size);
 			e->counters.pcnt = pos;
 			pos += size;
 		} else {
@@ -644,7 +634,7 @@
 				newpos = pos + e->next_offset;
 			}
 			e = (struct ipt_entry *)
-				(newinfo->entries + newpos);
+				(entry0 + newpos);
 			e->counters.pcnt = pos;
 			pos = newpos;
 		}
@@ -854,6 +844,7 @@
 translate_table(const char *name,
 		unsigned int valid_hooks,
 		struct ipt_table_info *newinfo,
+		void *entry0,
 		unsigned int size,
 		unsigned int number,
 		const unsigned int *hook_entries,
@@ -874,11 +865,11 @@
 	duprintf("translate_table: size %u\n", newinfo->size);
 	i = 0;
 	/* Walk through entries, checking offsets. */
-	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
 				check_entry_size_and_hooks,
 				newinfo,
-				newinfo->entries,
-				newinfo->entries + size,
+				entry0,
+				entry0 + size,
 				hook_entries, underflows, &i);
 	if (ret != 0)
 		return ret;
@@ -906,25 +897,24 @@
 		}
 	}
 
-	if (!mark_source_chains(newinfo, valid_hooks))
+	if (!mark_source_chains(newinfo, valid_hooks, entry0))
 		return -ELOOP;
 
 	/* Finally, each sanity check must pass */
 	i = 0;
-	ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+	ret = IPT_ENTRY_ITERATE(entry0, newinfo->size,
 				check_entry, name, size, &i);
 
 	if (ret != 0) {
-		IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size,
+		IPT_ENTRY_ITERATE(entry0, newinfo->size,
 				  cleanup_entry, &i);
 		return ret;
 	}
 
 	/* And one copy for every other CPU */
-	for (i = 1; i < num_possible_cpus(); i++) {
-		memcpy(newinfo->entries + SMP_ALIGN(newinfo->size)*i,
-		       newinfo->entries,
-		       SMP_ALIGN(newinfo->size));
+	for_each_cpu(i) {
+		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
+			memcpy(newinfo->entries[i], entry0, newinfo->size);
 	}
 
 	return ret;
@@ -940,15 +930,12 @@
 
 #ifdef CONFIG_NETFILTER_DEBUG
 	{
-		struct ipt_entry *table_base;
-		unsigned int i;
+		int cpu;
 
-		for (i = 0; i < num_possible_cpus(); i++) {
-			table_base =
-				(void *)newinfo->entries
-				+ TABLE_OFFSET(newinfo, i);
-
-			table_base->comefrom = 0xdead57ac;
+		for_each_cpu(cpu) {
+			struct ipt_entry *table_base = newinfo->entries[cpu];
+			if (table_base)
+				table_base->comefrom = 0xdead57ac;
 		}
 	}
 #endif
@@ -972,6 +959,7 @@
 }
 
 /* Gets counters. */
+#ifdef CONFIG_SMP
 static inline int
 add_entry_to_counter(const struct ipt_entry *e,
 		     struct ipt_counters total[],
@@ -982,22 +970,44 @@
 	(*i)++;
 	return 0;
 }
+#endif
+
+static inline int
+set_entry_to_counter(const struct ipt_entry *e,
+		     struct ipt_counters total[],
+		     unsigned int *i)
+{
+	SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt);
+
+	(*i)++;
+	return 0;
+}
 
 static void
 get_counters(const struct ipt_table_info *t,
 	     struct ipt_counters counters[])
 {
 	unsigned int cpu;
+	unsigned int curcpu = raw_smp_processor_id();
 	unsigned int i;
 
-	for (cpu = 0; cpu < num_possible_cpus(); cpu++) {
+	i = 0;
+	IPT_ENTRY_ITERATE(t->entries[curcpu],
+			  t->size,
+			  set_entry_to_counter,
+			  counters,
+			  &i);
+#ifdef CONFIG_SMP
+	for_each_cpu(cpu) {
+		if (cpu == curcpu)
+			continue;
 		i = 0;
-		IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu),
+		IPT_ENTRY_ITERATE(t->entries[cpu],
 				  t->size,
 				  add_entry_to_counter,
 				  counters,
 				  &i);
 	}
+#endif
 }
 
 static int
@@ -1009,6 +1019,7 @@
 	struct ipt_entry *e;
 	struct ipt_counters *counters;
 	int ret = 0;
+	void *loc_cpu_entry;
 
 	/* We need atomic snapshot of counters: rest doesn't change
 	   (other than comefrom, which userspace doesn't care
@@ -1020,13 +1031,17 @@
 		return -ENOMEM;
 
 	/* First, sum counters... */
-	memset(counters, 0, countersize);
 	write_lock_bh(&table->lock);
 	get_counters(table->private, counters);
 	write_unlock_bh(&table->lock);
 
-	/* ... then copy entire thing from CPU 0... */
-	if (copy_to_user(userptr, table->private->entries, total_size) != 0) {
+	/*
+	 * choose the copy that is on our node,
+	 * but don't worry if the scheduler changes the CPU
+	 */
+	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+	/* ... then copy entire thing ... */
+	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
 	}
@@ -1038,7 +1053,7 @@
 		struct ipt_entry_match *m;
 		struct ipt_entry_target *t;
 
-		e = (struct ipt_entry *)(table->private->entries + off);
+		e = (struct ipt_entry *)(loc_cpu_entry + off);
 		if (copy_to_user(userptr + off
 				 + offsetof(struct ipt_entry, counters),
 				 &counters[num],
@@ -1107,6 +1122,56 @@
 	return ret;
 }
 
+static void free_table_info(struct ipt_table_info *info)
+{
+	int cpu;
+	for_each_cpu(cpu) {
+		if (info->size <= PAGE_SIZE)
+			kfree(info->entries[cpu]);
+		else
+			vfree(info->entries[cpu]);
+	}
+	kfree(info);
+}
+
+static struct ipt_table_info *alloc_table_info(unsigned int size)
+{
+	struct ipt_table_info *newinfo;
+	int cpu;
+	newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL);
+	if (!newinfo)
+		return NULL;
+	newinfo->size = size;
+	for_each_cpu(cpu) {
+		if (size <= PAGE_SIZE) {
+			newinfo->entries[cpu] = kmalloc_node(size,
+							     GFP_KERNEL,
+							     cpu_to_node(cpu));
+		} else {
+#ifdef CONFIG_NUMA
+			struct mempolicy *oldpol;
+			struct mempolicy prefnodepolicy = {
+				.refcnt           = ATOMIC_INIT(1),
+				.policy           = MPOL_PREFERRED,
+				.v.preferred_node = cpu_to_node(cpu),
+			};
+			oldpol = current->mempolicy;
+			current->mempolicy = &prefnodepolicy;
+#endif
+			newinfo->entries[cpu] = vmalloc(size);
#ifdef CONFIG_NUMA
+			current->mempolicy = oldpol;
+#endif
+		}
+		if (newinfo->entries[cpu] == NULL) {
+			free_table_info(newinfo);
+			return NULL;
+		}
+	}
+	return newinfo;
+}
+
+
 static int
 do_replace(void __user *user, unsigned int len)
 {
@@ -1115,6 +1180,7 @@
 	struct ipt_table *t;
 	struct ipt_table_info *newinfo, *oldinfo;
 	struct ipt_counters *counters;
+	void *loc_cpu_entry, *loc_cpu_old_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
@@ -1127,12 +1193,15 @@
 	if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages)
 		return -ENOMEM;
 
-	newinfo = vmalloc(sizeof(struct ipt_table_info)
-			  + SMP_ALIGN(tmp.size) * num_possible_cpus());
+	newinfo = alloc_table_info(tmp.size);
 	if (!newinfo)
 		return -ENOMEM;
-
-	if (copy_from_user(newinfo->entries, user + sizeof(tmp),
+	/*
+	 * choose the copy that is on our node,
+	 * but don't worry if the scheduler changes the CPU
+	 */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
 		goto free_newinfo;
 	}
@@ -1143,10 +1212,9 @@
 		ret = -ENOMEM;
 		goto free_newinfo;
 	}
-	memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters));
 
 	ret = translate_table(tmp.name, tmp.valid_hooks,
-			      newinfo, tmp.size, tmp.num_entries,
+			      newinfo, loc_cpu_entry, tmp.size, tmp.num_entries,
 			      tmp.hook_entry, tmp.underflow);
 	if (ret != 0)
 		goto free_newinfo_counters;
@@ -1185,8 +1253,9 @@
 	/* Get the old counters. */
 	get_counters(oldinfo, counters);
 	/* Decrease module usage counts and free resource */
-	IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL);
-	vfree(oldinfo);
+	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+	IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry, NULL);
+	free_table_info(oldinfo);
 	if (copy_to_user(tmp.counters, counters,
 			 sizeof(struct ipt_counters) * tmp.num_counters) != 0)
 		ret = -EFAULT;
@@ -1198,11 +1267,11 @@
 	module_put(t->me);
 	up(&ipt_mutex);
 
 free_newinfo_counters_untrans:
-	IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL);
+	IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL);
 free_newinfo_counters:
 	vfree(counters);
 free_newinfo:
-	vfree(newinfo);
+	free_table_info(newinfo);
 	return ret;
 }
 
@@ -1235,6 +1304,7 @@
 	struct ipt_counters_info tmp, *paddc;
 	struct ipt_table *t;
 	int ret = 0;
+	void *loc_cpu_entry;
 
 	if (copy_from_user(&tmp, user, sizeof(tmp)) != 0)
 		return -EFAULT;
@@ -1264,7 +1334,12 @@
 	}
 
 	i = 0;
-	IPT_ENTRY_ITERATE(t->private->entries,
+	/*
+	 * choose the copy that is on our node,
+	 * but don't worry if the scheduler changes the CPU
+	 */
+	loc_cpu_entry = t->private->entries[raw_smp_processor_id()];
+	IPT_ENTRY_ITERATE(loc_cpu_entry,
 			  t->private->size,
 			  add_counter_to_entry,
 			  paddc->counters,
@@ -1454,29 +1529,39 @@
 {
 	int ret;
 	struct ipt_table_info *newinfo;
-	static struct ipt_table_info bootstrap
-		= { 0, 0, 0, { 0 }, { 0 }, { } };
+	static struct ipt_table_info bootstrap = {
+		.size            = 0,
+		.number          = 0,
+		.initial_entries = 0,
+		.hook_entry      = { 0 },
+		.underflow       = { 0 },
+		.entries         = { NULL }
+	};
+	void *loc_cpu_entry;
 
-	newinfo = vmalloc(sizeof(struct ipt_table_info)
-			  + SMP_ALIGN(repl->size) * num_possible_cpus());
+	newinfo = alloc_table_info(repl->size);
 	if (!newinfo)
 		return -ENOMEM;
-
-	memcpy(newinfo->entries, repl->entries, repl->size);
+	/*
+	 * choose the copy that is on our node,
+	 * but don't worry if the scheduler changes the CPU
	 */
+	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(table->name, table->valid_hooks,
-			      newinfo, repl->size,
+			      newinfo, loc_cpu_entry, repl->size,
 			      repl->num_entries,
 			      repl->hook_entry,
 			      repl->underflow);
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
 
 	ret = down_interruptible(&ipt_mutex);
 	if (ret != 0) {
-		vfree(newinfo);
+		free_table_info(newinfo);
 		return ret;
 	}
@@ -1505,20 +1590,25 @@
 	return ret;
 
  free_unlock:
-	vfree(newinfo);
+	free_table_info(newinfo);
 	goto unlock;
 }
 
 void ipt_unregister_table(struct ipt_table *table)
 {
+	void *loc_cpu_entry;
 	down(&ipt_mutex);
 	LIST_DELETE(&ipt_tables, table);
 	up(&ipt_mutex);
 
-	/* Decrease module usage counts and free resources */
-	IPT_ENTRY_ITERATE(table->private->entries, table->private->size,
+	/* Decrease module usage counts and free resources.
+	 * Choose the copy that is on our node,
+	 * but don't worry if the scheduler changes the CPU.
+	 */
+	loc_cpu_entry = table->private->entries[raw_smp_processor_id()];
+	IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size,
 			  cleanup_entry, NULL);
-	vfree(table->private);
+	free_table_info(table->private);
 }
 
 /* Returns 1 if the port is matched by the range, 0 otherwise */
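
For readers following the data-structure change: the patch drops the single contiguous entries block, in which each CPU's copy of the ruleset was located by offset arithmetic (SMP_ALIGN(size) * cpu), in favour of one pointer per CPU (void *entries[NR_CPUS]), so that each copy can be allocated separately and placed on the owning CPU's NUMA node. Below is a standalone sketch contrasting the two lookups; the struct names, NR_CPUS value and cache-line size are simplified stand-ins, not the kernel's.

/* Standalone sketch: old offset-arithmetic lookup vs. new per-CPU
 * pointer lookup.  All names and sizes here are illustrative only. */
#include <stdio.h>
#include <stdlib.h>

#define NR_CPUS		4
#define SMP_CACHE_BYTES	64
#define SMP_ALIGN(x)	(((x) + SMP_CACHE_BYTES - 1) & ~(SMP_CACHE_BYTES - 1))

/* Old layout: one block, per-CPU slices found by arithmetic */
struct old_info {
	unsigned int size;
	char entries[];		/* SMP_ALIGN(size) * NR_CPUS bytes */
};

static void *old_cpu_base(struct old_info *t, unsigned int cpu)
{
	return t->entries + (size_t)SMP_ALIGN(t->size) * cpu;
}

/* New layout: one separately allocated block per CPU */
struct new_info {
	unsigned int size;
	void *entries[NR_CPUS];
};

static void *new_cpu_base(struct new_info *t, unsigned int cpu)
{
	return t->entries[cpu];	/* no alignment arithmetic needed */
}

int main(void)
{
	unsigned int size = 100, cpu;
	struct old_info *o = malloc(sizeof(*o) + SMP_ALIGN(size) * NR_CPUS);
	struct new_info n = { .size = size };

	if (!o)
		return 1;
	o->size = size;
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		n.entries[cpu] = malloc(size);
		printf("cpu %u: old base %p, new base %p\n",
		       cpu, old_cpu_base(o, cpu), new_cpu_base(&n, cpu));
	}
	for (cpu = 0; cpu < NR_CPUS; cpu++)
		free(n.entries[cpu]);
	free(o);
	return 0;
}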
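The new SET_COUNTER macro and set_entry_to_counter() let get_counters() seed the result array from the current CPU's copy by plain assignment, which is why the memset() that used to precede the summation could be dropped; every other CPU's copy is then accumulated on top with ADD_COUNTER. A minimal userspace model of that set-then-add ordering follows (types and sizes are simplified, not the kernel's struct ipt_counters):

/* Userspace model of the set-then-add counter aggregation.  The first
 * pass *assigns* into total[] (so no prior memset is needed); every
 * other CPU's copy is then *accumulated*. */
#include <stdio.h>

#define NR_CPUS		4
#define NR_ENTRIES	3

struct counters { unsigned long long bcnt, pcnt; };

#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while (0)
#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while (0)

/* one counter array per CPU, like the per-CPU entries[] copies */
static struct counters percpu[NR_CPUS][NR_ENTRIES];

static void get_counters(struct counters total[NR_ENTRIES], int curcpu)
{
	int cpu, i;

	/* snapshot the local CPU first, by assignment */
	for (i = 0; i < NR_ENTRIES; i++)
		SET_COUNTER(total[i], percpu[curcpu][i].bcnt,
			    percpu[curcpu][i].pcnt);

	/* then sum every other CPU on top */
	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		if (cpu == curcpu)
			continue;
		for (i = 0; i < NR_ENTRIES; i++)
			ADD_COUNTER(total[i], percpu[cpu][i].bcnt,
				    percpu[cpu][i].pcnt);
	}
}

int main(void)
{
	struct counters total[NR_ENTRIES];
	int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		for (i = 0; i < NR_ENTRIES; i++)
			SET_COUNTER(percpu[cpu][i], 100 * cpu + i, cpu + 1);

	get_counters(total, 0);
	for (i = 0; i < NR_ENTRIES; i++)
		printf("entry %d: bytes=%llu packets=%llu\n",
		       i, total[i].bcnt, total[i].pcnt);
	return 0;
}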
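alloc_table_info() tries to place each CPU's copy on that CPU's home node: kmalloc_node() handles blobs up to PAGE_SIZE directly, while larger blobs go through vmalloc() with current->mempolicy temporarily pointed at an MPOL_PREFERRED policy (which is what the mempolicy include at the top of the patch is for). The same placement goal can be sketched in userspace with libnuma; this assumes the v2 libnuma API (numa_alloc_onnode, numa_node_of_cpu; link with -lnuma) and mirrors the intent of the kernel code, not its mechanism.

/* Userspace analogue of alloc_table_info()/free_table_info(): one
 * buffer per CPU, each allocated on that CPU's NUMA node via libnuma. */
#include <stdio.h>
#include <stdlib.h>
#include <numa.h>

struct table_info {
	size_t size;
	int ncpus;
	void **entries;		/* one copy per configured CPU */
};

static void free_table_info(struct table_info *info)
{
	int cpu;

	if (!info)
		return;
	for (cpu = 0; cpu < info->ncpus; cpu++)
		if (info->entries[cpu])
			numa_free(info->entries[cpu], info->size);
	free(info->entries);
	free(info);
}

static struct table_info *alloc_table_info(size_t size)
{
	struct table_info *info = calloc(1, sizeof(*info));
	int cpu;

	if (!info)
		return NULL;
	info->size = size;
	info->ncpus = numa_num_configured_cpus();
	info->entries = calloc(info->ncpus, sizeof(void *));
	if (!info->entries) {
		free(info);
		return NULL;
	}
	for (cpu = 0; cpu < info->ncpus; cpu++) {
		/* place this CPU's copy on its home node */
		info->entries[cpu] = numa_alloc_onnode(size,
						       numa_node_of_cpu(cpu));
		if (!info->entries[cpu]) {
			free_table_info(info);
			return NULL;
		}
	}
	return info;
}

int main(void)
{
	struct table_info *t;

	if (numa_available() < 0) {
		fprintf(stderr, "NUMA not available\n");
		return 1;
	}
	t = alloc_table_info(1 << 16);
	printf("allocated %d per-CPU copies\n", t ? t->ncpus : 0);
	free_table_info(t);
	return 0;
}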