* [PATCH nf-next 1/2] netfilter: xtables: use percpu rule counters
@ 2015-06-08 17:27 Florian Westphal
From: Florian Westphal @ 2015-06-08 17:27 UTC
  To: netfilter-devel; +Cc: eric.dumazet, Florian Westphal

The binary arp/ip/ip6tables ruleset is stored per cpu.

The only remaining reason for the percpu duplication is the rule
counters embedded into ipt_entry et al. -- since each cpu has its own
copy of the rules, all counters can be lockless.

The downside is that the more cpus are supported, the more memory is
required.  Rules are not just duplicated per online cpu but for each
possible cpu, i.e. if maxcpu is 144, the ruleset is duplicated 144
times, not just for the e.g. 64 cores actually present.

To save some memory and also improve utilization of shared caches it
would be preferable to only store the rule blob once.

So we first need to separate counters and the rule blob.

Instead of using entry->counters, allocate this percpu and store the
percpu address in entry->counters.pcnt on CONFIG_SMP.

This change makes no sense as-is; it is merely an intermediate step to
remove the percpu duplication of the rule set in a followup patch.
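
In short, callers follow this pattern (condensed sketch, error
unwinding omitted):

  /* entry setup (find_check_entry and friends): */
  e->counters.pcnt = xt_percpu_counter_alloc();
  if (IS_ERR_VALUE(e->counters.pcnt))
          return -ENOMEM;

  /* packet path, bump this cpu's counter: */
  counter = xt_get_this_cpu_counter(&e->counters);
  ADD_COUNTER(*counter, skb->len, 1);

  /* entry teardown (cleanup_entry): */
  xt_percpu_counter_free(e->counters.pcnt);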

Suggested-by: Eric Dumazet <edumazet@google.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter/x_tables.h | 51 ++++++++++++++++++++++++++++++++++++++
 net/ipv4/netfilter/arp_tables.c    | 32 ++++++++++++++++++++----
 net/ipv4/netfilter/ip_tables.c     | 31 ++++++++++++++++++++---
 net/ipv6/netfilter/ip6_tables.c    | 31 +++++++++++++++++++----
 4 files changed, 131 insertions(+), 14 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 09f3820..e438c43 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -353,6 +353,57 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
 	return ret;
 }
 
+
+/* On SMP, ip(6)t_entry->counters.pcnt holds address of the
+ * real (percpu) counter.  On !SMP, its just the packet count,
+ * so nothing needs to be done there.
+ *
+ * xt_percpu_counter_alloc returns the address of the percpu
+ * counter, or 0 on !SMP.
+ *
+ * Hence caller must use IS_ERR_VALUE to check for error, this
+ * allows us to contain CONFIG_SMP kludgery.
+ */
+static inline u64 xt_percpu_counter_alloc(void)
+{
+#ifdef CONFIG_SMP
+	void *res = alloc_percpu(struct xt_counters);
+
+	if (res == NULL)
+		return (u64) -ENOMEM;
+
+	return (__force u64) res;
+#else
+	return 0;
+#endif
+}
+static inline void xt_percpu_counter_free(u64 pcnt)
+{
+#ifdef CONFIG_SMP
+	free_percpu((__force void *) pcnt);
+#endif
+}
+
+static inline struct xt_counters *
+xt_get_this_cpu_counter(struct xt_counters *cnt)
+{
+#ifdef CONFIG_SMP
+	return this_cpu_ptr((void *) cnt->pcnt);
+#else
+	return cnt;
+#endif
+}
+
+static inline struct xt_counters *
+xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
+{
+#ifdef CONFIG_SMP
+	return per_cpu_ptr((void *) cnt->pcnt, cpu);
+#else
+	return cnt;
+#endif
+}
+
 struct nf_hook_ops *xt_hook_link(const struct xt_table *, nf_hookfn *);
 void xt_hook_unlink(const struct xt_table *, struct nf_hook_ops *);
 
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index a612007..0ada09a 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -289,13 +289,15 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	arp = arp_hdr(skb);
 	do {
 		const struct xt_entry_target *t;
+		struct xt_counters *counter;
 
 		if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) {
 			e = arpt_next_entry(e);
 			continue;
 		}
 
-		ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
+		counter = xt_get_this_cpu_counter(&e->counters);
+		ADD_COUNTER(*counter, arp_hdr_len(skb->dev), 1);
 
 		t = arpt_get_target_c(e);
 
@@ -521,6 +523,10 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 	if (ret)
 		return ret;
 
+	e->counters.pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(e->counters.pcnt))
+		return -ENOMEM;
+
 	t = arpt_get_target(e);
 	target = xt_request_find_target(NFPROTO_ARP, t->u.user.name,
 					t->u.user.revision);
@@ -538,6 +544,8 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
 err:
 	module_put(t->u.kernel.target->me);
 out:
+	xt_percpu_counter_free(e->counters.pcnt);
+
 	return ret;
 }
 
@@ -614,6 +622,7 @@ static inline void cleanup_entry(struct arpt_entry *e)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
+	xt_percpu_counter_free(e->counters.pcnt);
 }
 
 /* Checks and translates the user-supplied table segment (held in
@@ -723,13 +732,15 @@ static void get_counters(const struct xt_table_info *t,
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
 
+			tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
 			do {
 				start = read_seqcount_begin(s);
-				bcnt = iter->counters.bcnt;
-				pcnt = iter->counters.pcnt;
+				bcnt = tmp->bcnt;
+				pcnt = tmp->pcnt;
 			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -1186,7 +1197,10 @@ static int do_add_counters(struct net *net, const void __user *user,
 	loc_cpu_entry = private->entries[curcpu];
 	addend = xt_write_recseq_begin();
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
-		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+		struct xt_counters *tmp;
+
+		tmp = xt_get_this_cpu_counter(&iter->counters);
+		ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
 	xt_write_recseq_end(addend);
@@ -1416,9 +1430,17 @@ static int translate_compat_table(const char *name,
 
 	i = 0;
 	xt_entry_foreach(iter1, entry1, newinfo->size) {
+		iter1->counters.pcnt = xt_percpu_counter_alloc();
+		if (IS_ERR_VALUE(iter1->counters.pcnt)) {
+			ret = -ENOMEM;
+			break;
+		}
+
 		ret = check_target(iter1, name);
-		if (ret != 0)
+		if (ret != 0) {
+			xt_percpu_counter_free(iter1->counters.pcnt);
 			break;
+		}
 		++i;
 		if (strcmp(arpt_get_target(iter1)->u.user.name,
 		    XT_ERROR_TARGET) == 0)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index e7abf51..d190b10 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -345,6 +345,7 @@ ipt_do_table(struct sk_buff *skb,
 	do {
 		const struct xt_entry_target *t;
 		const struct xt_entry_match *ematch;
+		struct xt_counters *counter;
 
 		IP_NF_ASSERT(e);
 		if (!ip_packet_match(ip, indev, outdev,
@@ -361,7 +362,8 @@ ipt_do_table(struct sk_buff *skb,
 				goto no_match;
 		}
 
-		ADD_COUNTER(e->counters, skb->len, 1);
+		counter = xt_get_this_cpu_counter(&e->counters);
+		ADD_COUNTER(*counter, skb->len, 1);
 
 		t = ipt_get_target(e);
 		IP_NF_ASSERT(t->u.kernel.target);
@@ -665,6 +667,10 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 	if (ret)
 		return ret;
 
+	e->counters.pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(e->counters.pcnt))
+		return -ENOMEM;
+
 	j = 0;
 	mtpar.net	= net;
 	mtpar.table     = name;
@@ -691,6 +697,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 	ret = check_target(e, net, name);
 	if (ret)
 		goto err;
+
 	return 0;
  err:
 	module_put(t->u.kernel.target->me);
@@ -700,6 +707,9 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
 			break;
 		cleanup_match(ematch, net);
 	}
+
+	xt_percpu_counter_free(e->counters.pcnt);
+
 	return ret;
 }
 
@@ -784,6 +794,7 @@ cleanup_entry(struct ipt_entry *e, struct net *net)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
+	xt_percpu_counter_free(e->counters.pcnt);
 }
 
 /* Checks and translates the user-supplied table segment (held in
@@ -888,13 +899,15 @@ get_counters(const struct xt_table_info *t,
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
 
+			tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
 			do {
 				start = read_seqcount_begin(s);
-				bcnt = iter->counters.bcnt;
-				pcnt = iter->counters.pcnt;
+				bcnt = tmp->bcnt;
+				pcnt = tmp->pcnt;
 			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -1374,7 +1387,10 @@ do_add_counters(struct net *net, const void __user *user,
 	loc_cpu_entry = private->entries[curcpu];
 	addend = xt_write_recseq_begin();
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
-		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+		struct xt_counters *tmp;
+
+		tmp = xt_get_this_cpu_counter(&iter->counters);
+		ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
 	xt_write_recseq_end(addend);
@@ -1608,6 +1624,10 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
 	unsigned int j;
 	int ret = 0;
 
+	e->counters.pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(e->counters.pcnt))
+		return -ENOMEM;
+
 	j = 0;
 	mtpar.net	= net;
 	mtpar.table     = name;
@@ -1632,6 +1652,9 @@ compat_check_entry(struct ipt_entry *e, struct net *net, const char *name)
 			break;
 		cleanup_match(ematch, net);
 	}
+
+	xt_percpu_counter_free(e->counters.pcnt);
+
 	return ret;
 }
 
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index cdd085f..a1190ee 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -367,6 +367,7 @@ ip6t_do_table(struct sk_buff *skb,
 	do {
 		const struct xt_entry_target *t;
 		const struct xt_entry_match *ematch;
+		struct xt_counters *counter;
 
 		IP_NF_ASSERT(e);
 		acpar.thoff = 0;
@@ -384,7 +385,8 @@ ip6t_do_table(struct sk_buff *skb,
 				goto no_match;
 		}
 
-		ADD_COUNTER(e->counters, skb->len, 1);
+		counter = xt_get_this_cpu_counter(&e->counters);
+		ADD_COUNTER(*counter, skb->len, 1);
 
 		t = ip6t_get_target_c(e);
 		IP_NF_ASSERT(t->u.kernel.target);
@@ -679,6 +681,10 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 	if (ret)
 		return ret;
 
+	e->counters.pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(e->counters.pcnt))
+		return -ENOMEM;
+
 	j = 0;
 	mtpar.net	= net;
 	mtpar.table     = name;
@@ -714,6 +720,9 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
 			break;
 		cleanup_match(ematch, net);
 	}
+
+	xt_percpu_counter_free(e->counters.pcnt);
+
 	return ret;
 }
 
@@ -797,6 +806,8 @@ static void cleanup_entry(struct ip6t_entry *e, struct net *net)
 	if (par.target->destroy != NULL)
 		par.target->destroy(&par);
 	module_put(par.target->me);
+
+	xt_percpu_counter_free(e->counters.pcnt);
 }
 
 /* Checks and translates the user-supplied table segment (held in
@@ -901,13 +912,15 @@ get_counters(const struct xt_table_info *t,
 
 		i = 0;
 		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
 
+			tmp = xt_get_per_cpu_counter(&iter->counters, cpu);
 			do {
 				start = read_seqcount_begin(s);
-				bcnt = iter->counters.bcnt;
-				pcnt = iter->counters.pcnt;
+				bcnt = tmp->bcnt;
+				pcnt = tmp->pcnt;
 			} while (read_seqcount_retry(s, start));
 
 			ADD_COUNTER(counters[i], bcnt, pcnt);
@@ -1374,7 +1387,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 		goto free;
 	}
 
-
 	local_bh_disable();
 	private = t->private;
 	if (private->number != num_counters) {
@@ -1388,7 +1400,10 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	addend = xt_write_recseq_begin();
 	loc_cpu_entry = private->entries[curcpu];
 	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
-		ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
+		struct xt_counters *tmp;
+
+		tmp = xt_get_this_cpu_counter(&iter->counters);
+		ADD_COUNTER(*tmp, paddc[i].bcnt, paddc[i].pcnt);
 		++i;
 	}
 	xt_write_recseq_end(addend);
@@ -1621,6 +1636,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net,
 	struct xt_mtchk_param mtpar;
 	struct xt_entry_match *ematch;
 
+	e->counters.pcnt = xt_percpu_counter_alloc();
+	if (IS_ERR_VALUE(e->counters.pcnt))
+		return -ENOMEM;
 	j = 0;
 	mtpar.net	= net;
 	mtpar.table     = name;
@@ -1645,6 +1663,9 @@ static int compat_check_entry(struct ip6t_entry *e, struct net *net,
 			break;
 		cleanup_match(ematch, net);
 	}
+
+	xt_percpu_counter_free(e->counters.pcnt);
+
 	return ret;
 }
 
-- 
2.0.5



* [PATCH nf-next 2/2] netfilter: xtables: avoid percpu ruleset duplication
From: Florian Westphal @ 2015-06-08 17:27 UTC
  To: netfilter-devel; +Cc: eric.dumazet, Florian Westphal

We store the rule blob per (possible) cpu.  Unfortunately this means we
can waste a lot of memory on big smp machines.  The ipt_entry structure
('rule head') is 112 bytes, so e.g. with maxcpu=64 a single rule eats
close to 8k of RAM.

Since the previous patch made the counters percpu, it appears there is
nothing left in the rule blob that needs to be percpu.

On my test system (144 possible cpus, 400k dummy rules) this
change saves close to 9 GB of RAM.

An earlier version of this patch allocated the rule blob once per numa
node, but as pointed out by Eric Dumazet a busy system would keep the
(read-mostly) rule blob in caches anyway.
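
The core of the change, condensed:

  struct xt_table_info {
          ...
          /* was: void *entries[1];  -- one rule blob per possible cpu */
          void *entries;             /* now: a single shared blob */
  };

  /* xt_alloc_table_info(), simplified: */
  if (size <= PAGE_SIZE)
          newinfo->entries = kmalloc(size, GFP_KERNEL);
  else
          newinfo->entries = vmalloc(size);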

Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
 include/linux/netfilter/x_tables.h |  4 +--
 net/ipv4/netfilter/arp_tables.c    | 50 +++++++++--------------------
 net/ipv4/netfilter/ip_tables.c     | 64 ++++++++++---------------------------
 net/ipv6/netfilter/ip6_tables.c    | 65 ++++++++++----------------------------
 net/netfilter/x_tables.c           | 23 +++++---------
 5 files changed, 57 insertions(+), 149 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index e438c43..6081075 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -224,9 +224,9 @@ struct xt_table_info {
 	unsigned int stacksize;
 	unsigned int __percpu *stackptr;
 	void ***jumpstack;
-	/* ipt_entry tables: one per CPU */
+
 	/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
-	void *entries[1];
+	void *entries;
 };
 
 #define XT_TABLE_INFO_SZ (offsetof(struct xt_table_info, entries) \
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 0ada09a..d75c139 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -275,7 +275,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
 	 * pointer.
 	 */
 	smp_read_barrier_depends();
-	table_base = private->entries[smp_processor_id()];
+	table_base = private->entries;
 
 	e = get_entry(table_base, private->hook_entry[hook]);
 	back = get_entry(table_base, private->underflow[hook]);
@@ -711,12 +711,6 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i) {
-		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
-			memcpy(newinfo->entries[i], entry0, newinfo->size);
-	}
-
 	return ret;
 }
 
@@ -731,7 +725,7 @@ static void get_counters(const struct xt_table_info *t,
 		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
-		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+		xt_entry_foreach(iter, t->entries, t->size) {
 			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
@@ -785,7 +779,7 @@ static int copy_entries_to_user(unsigned int total_size,
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	/* ... then copy entire thing ... */
 	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
@@ -880,10 +874,10 @@ static int compat_table_info(const struct xt_table_info *info,
 	if (!newinfo || !info)
 		return -EINVAL;
 
-	/* we dont care about newinfo->entries[] */
+	/* we dont care about newinfo->entries */
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
-	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	loc_cpu_entry = info->entries;
 	xt_compat_init_offsets(NFPROTO_ARP, info->number);
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1048,7 +1042,7 @@ static int __do_replace(struct net *net, const char *name,
 	get_counters(oldinfo, counters);
 
 	/* Decrease module usage counts and free resource */
-	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+	loc_cpu_old_entry = oldinfo->entries;
 	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
 		cleanup_entry(iter);
 
@@ -1095,8 +1089,7 @@ static int do_replace(struct net *net, const void __user *user,
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1126,7 +1119,7 @@ static int do_replace(struct net *net, const void __user *user,
 static int do_add_counters(struct net *net, const void __user *user,
 			   unsigned int len, int compat)
 {
-	unsigned int i, curcpu;
+	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1136,7 +1129,6 @@ static int do_add_counters(struct net *net, const void __user *user,
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
-	void *loc_cpu_entry;
 	struct arpt_entry *iter;
 	unsigned int addend;
 #ifdef CONFIG_COMPAT
@@ -1192,11 +1184,9 @@ static int do_add_counters(struct net *net, const void __user *user,
 	}
 
 	i = 0;
-	/* Choose the copy that is on our node */
-	curcpu = smp_processor_id();
-	loc_cpu_entry = private->entries[curcpu];
+
 	addend = xt_write_recseq_begin();
-	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+	xt_entry_foreach(iter,  private->entries, private->size) {
 		struct xt_counters *tmp;
 
 		tmp = xt_get_this_cpu_counter(&iter->counters);
@@ -1410,7 +1400,7 @@ static int translate_compat_table(const char *name,
 		newinfo->hook_entry[i] = info->hook_entry[i];
 		newinfo->underflow[i] = info->underflow[i];
 	}
-	entry1 = newinfo->entries[raw_smp_processor_id()];
+	entry1 = newinfo->entries;
 	pos = entry1;
 	size = total_size;
 	xt_entry_foreach(iter0, entry0, total_size) {
@@ -1470,11 +1460,6 @@ static int translate_compat_table(const char *name,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i)
-		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
-			memcpy(newinfo->entries[i], entry1, newinfo->size);
-
 	*pinfo = newinfo;
 	*pentry0 = entry1;
 	xt_free_table_info(info);
@@ -1533,8 +1518,7 @@ static int compat_do_replace(struct net *net, void __user *user,
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) {
 		ret = -EFAULT;
 		goto free_newinfo;
@@ -1631,7 +1615,6 @@ static int compat_copy_entries_to_user(unsigned int total_size,
 	void __user *pos;
 	unsigned int size;
 	int ret = 0;
-	void *loc_cpu_entry;
 	unsigned int i = 0;
 	struct arpt_entry *iter;
 
@@ -1639,11 +1622,9 @@ static int compat_copy_entries_to_user(unsigned int total_size,
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	/* choose the copy on our node/cpu */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
 	pos = userptr;
 	size = total_size;
-	xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+	xt_entry_foreach(iter, private->entries, total_size) {
 		ret = compat_copy_entry_to_user(iter, &pos,
 						&size, counters, i++);
 		if (ret != 0)
@@ -1812,8 +1793,7 @@ struct xt_table *arpt_register_table(struct net *net,
 		goto out;
 	}
 
-	/* choose the copy on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(newinfo, loc_cpu_entry, repl);
@@ -1844,7 +1824,7 @@ void arpt_unregister_table(struct xt_table *table)
 	private = xt_unregister_table(table);
 
 	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	xt_entry_foreach(iter, loc_cpu_entry, private->size)
 		cleanup_entry(iter);
 	if (private->number > private->initial_entries)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index d190b10..6151500 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -254,15 +254,13 @@ static void trace_packet(const struct sk_buff *skb,
 			 const struct xt_table_info *private,
 			 const struct ipt_entry *e)
 {
-	const void *table_base;
 	const struct ipt_entry *root;
 	const char *hookname, *chainname, *comment;
 	const struct ipt_entry *iter;
 	unsigned int rulenum = 0;
 	struct net *net = dev_net(in ? in : out);
 
-	table_base = private->entries[smp_processor_id()];
-	root = get_entry(table_base, private->hook_entry[hook]);
+	root = get_entry(private->entries, private->hook_entry[hook]);
 
 	hookname = chainname = hooknames[hook];
 	comment = comments[NF_IP_TRACE_COMMENT_RULE];
@@ -331,7 +329,7 @@ ipt_do_table(struct sk_buff *skb,
 	 * pointer.
 	 */
 	smp_read_barrier_depends();
-	table_base = private->entries[cpu];
+	table_base = private->entries;
 	jumpstack  = (struct ipt_entry **)private->jumpstack[cpu];
 	stackptr   = per_cpu_ptr(private->stackptr, cpu);
 	origptr    = *stackptr;
@@ -877,12 +875,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i) {
-		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
-			memcpy(newinfo->entries[i], entry0, newinfo->size);
-	}
-
 	return ret;
 }
 
@@ -898,7 +890,7 @@ get_counters(const struct xt_table_info *t,
 		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
-		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+		xt_entry_foreach(iter, t->entries, t->size) {
 			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
@@ -946,17 +938,13 @@ copy_entries_to_user(unsigned int total_size,
 	struct xt_counters *counters;
 	const struct xt_table_info *private = table->private;
 	int ret = 0;
-	const void *loc_cpu_entry;
+	void *loc_cpu_entry;
 
 	counters = alloc_counters(table);
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	/* choose the copy that is on our node/cpu, ...
-	 * This choice is lazy (because current thread is
-	 * allowed to migrate to another cpu)
-	 */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
@@ -1070,10 +1058,10 @@ static int compat_table_info(const struct xt_table_info *info,
 	if (!newinfo || !info)
 		return -EINVAL;
 
-	/* we dont care about newinfo->entries[] */
+	/* we dont care about newinfo->entries */
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
-	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	loc_cpu_entry = info->entries;
 	xt_compat_init_offsets(AF_INET, info->number);
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1194,7 +1182,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct xt_table *t;
 	struct xt_table_info *oldinfo;
 	struct xt_counters *counters;
-	void *loc_cpu_old_entry;
 	struct ipt_entry *iter;
 
 	ret = 0;
@@ -1237,8 +1224,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	get_counters(oldinfo, counters);
 
 	/* Decrease module usage counts and free resource */
-	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
-	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+	xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
 		cleanup_entry(iter, net);
 
 	xt_free_table_info(oldinfo);
@@ -1284,8 +1270,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1316,7 +1301,7 @@ static int
 do_add_counters(struct net *net, const void __user *user,
                 unsigned int len, int compat)
 {
-	unsigned int i, curcpu;
+	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1326,7 +1311,6 @@ do_add_counters(struct net *net, const void __user *user,
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
-	void *loc_cpu_entry;
 	struct ipt_entry *iter;
 	unsigned int addend;
 #ifdef CONFIG_COMPAT
@@ -1382,11 +1366,8 @@ do_add_counters(struct net *net, const void __user *user,
 	}
 
 	i = 0;
-	/* Choose the copy that is on our node */
-	curcpu = smp_processor_id();
-	loc_cpu_entry = private->entries[curcpu];
 	addend = xt_write_recseq_begin();
-	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+	xt_entry_foreach(iter, private->entries, private->size) {
 		struct xt_counters *tmp;
 
 		tmp = xt_get_this_cpu_counter(&iter->counters);
@@ -1739,7 +1720,7 @@ translate_compat_table(struct net *net,
 		newinfo->hook_entry[i] = info->hook_entry[i];
 		newinfo->underflow[i] = info->underflow[i];
 	}
-	entry1 = newinfo->entries[raw_smp_processor_id()];
+	entry1 = newinfo->entries;
 	pos = entry1;
 	size = total_size;
 	xt_entry_foreach(iter0, entry0, total_size) {
@@ -1791,11 +1772,6 @@ translate_compat_table(struct net *net,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i)
-		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
-			memcpy(newinfo->entries[i], entry1, newinfo->size);
-
 	*pinfo = newinfo;
 	*pentry0 = entry1;
 	xt_free_table_info(info);
@@ -1842,8 +1818,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1914,7 +1889,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
 	void __user *pos;
 	unsigned int size;
 	int ret = 0;
-	const void *loc_cpu_entry;
 	unsigned int i = 0;
 	struct ipt_entry *iter;
 
@@ -1922,14 +1896,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	/* choose the copy that is on our node/cpu, ...
-	 * This choice is lazy (because current thread is
-	 * allowed to migrate to another cpu)
-	 */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
 	pos = userptr;
 	size = total_size;
-	xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+	xt_entry_foreach(iter, private->entries, total_size) {
 		ret = compat_copy_entry_to_user(iter, &pos,
 						&size, counters, i++);
 		if (ret != 0)
@@ -2104,8 +2073,7 @@ struct xt_table *ipt_register_table(struct net *net,
 		goto out;
 	}
 
-	/* choose the copy on our node/cpu, but dont care about preemption */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(net, newinfo, loc_cpu_entry, repl);
@@ -2136,7 +2104,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table)
 	private = xt_unregister_table(table);
 
 	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	xt_entry_foreach(iter, loc_cpu_entry, private->size)
 		cleanup_entry(iter, net);
 	if (private->number > private->initial_entries)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index a1190ee..80a7f0d 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -283,15 +283,13 @@ static void trace_packet(const struct sk_buff *skb,
 			 const struct xt_table_info *private,
 			 const struct ip6t_entry *e)
 {
-	const void *table_base;
 	const struct ip6t_entry *root;
 	const char *hookname, *chainname, *comment;
 	const struct ip6t_entry *iter;
 	unsigned int rulenum = 0;
 	struct net *net = dev_net(in ? in : out);
 
-	table_base = private->entries[smp_processor_id()];
-	root = get_entry(table_base, private->hook_entry[hook]);
+	root = get_entry(private->entries, private->hook_entry[hook]);
 
 	hookname = chainname = hooknames[hook];
 	comment = comments[NF_IP6_TRACE_COMMENT_RULE];
@@ -357,7 +355,7 @@ ip6t_do_table(struct sk_buff *skb,
 	 */
 	smp_read_barrier_depends();
 	cpu        = smp_processor_id();
-	table_base = private->entries[cpu];
+	table_base = private->entries;
 	jumpstack  = (struct ip6t_entry **)private->jumpstack[cpu];
 	stackptr   = per_cpu_ptr(private->stackptr, cpu);
 	origptr    = *stackptr;
@@ -890,12 +888,6 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i) {
-		if (newinfo->entries[i] && newinfo->entries[i] != entry0)
-			memcpy(newinfo->entries[i], entry0, newinfo->size);
-	}
-
 	return ret;
 }
 
@@ -911,7 +903,7 @@ get_counters(const struct xt_table_info *t,
 		seqcount_t *s = &per_cpu(xt_recseq, cpu);
 
 		i = 0;
-		xt_entry_foreach(iter, t->entries[cpu], t->size) {
+		xt_entry_foreach(iter, t->entries, t->size) {
 			struct xt_counters *tmp;
 			u64 bcnt, pcnt;
 			unsigned int start;
@@ -959,17 +951,13 @@ copy_entries_to_user(unsigned int total_size,
 	struct xt_counters *counters;
 	const struct xt_table_info *private = table->private;
 	int ret = 0;
-	const void *loc_cpu_entry;
+	void *loc_cpu_entry;
 
 	counters = alloc_counters(table);
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	/* choose the copy that is on our node/cpu, ...
-	 * This choice is lazy (because current thread is
-	 * allowed to migrate to another cpu)
-	 */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
 		ret = -EFAULT;
 		goto free_counters;
@@ -1083,10 +1071,10 @@ static int compat_table_info(const struct xt_table_info *info,
 	if (!newinfo || !info)
 		return -EINVAL;
 
-	/* we dont care about newinfo->entries[] */
+	/* we dont care about newinfo->entries */
 	memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
 	newinfo->initial_entries = 0;
-	loc_cpu_entry = info->entries[raw_smp_processor_id()];
+	loc_cpu_entry = info->entries;
 	xt_compat_init_offsets(AF_INET6, info->number);
 	xt_entry_foreach(iter, loc_cpu_entry, info->size) {
 		ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1207,7 +1195,6 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct xt_table *t;
 	struct xt_table_info *oldinfo;
 	struct xt_counters *counters;
-	const void *loc_cpu_old_entry;
 	struct ip6t_entry *iter;
 
 	ret = 0;
@@ -1250,8 +1237,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	get_counters(oldinfo, counters);
 
 	/* Decrease module usage counts and free resource */
-	loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
-	xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
+	xt_entry_foreach(iter, oldinfo->entries, oldinfo->size)
 		cleanup_entry(iter, net);
 
 	xt_free_table_info(oldinfo);
@@ -1297,8 +1283,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1329,7 +1314,7 @@ static int
 do_add_counters(struct net *net, const void __user *user, unsigned int len,
 		int compat)
 {
-	unsigned int i, curcpu;
+	unsigned int i;
 	struct xt_counters_info tmp;
 	struct xt_counters *paddc;
 	unsigned int num_counters;
@@ -1339,7 +1324,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	struct xt_table *t;
 	const struct xt_table_info *private;
 	int ret = 0;
-	const void *loc_cpu_entry;
 	struct ip6t_entry *iter;
 	unsigned int addend;
 #ifdef CONFIG_COMPAT
@@ -1395,11 +1379,8 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 	}
 
 	i = 0;
-	/* Choose the copy that is on our node */
-	curcpu = smp_processor_id();
 	addend = xt_write_recseq_begin();
-	loc_cpu_entry = private->entries[curcpu];
-	xt_entry_foreach(iter, loc_cpu_entry, private->size) {
+	xt_entry_foreach(iter, private->entries, private->size) {
 		struct xt_counters *tmp;
 
 		tmp = xt_get_this_cpu_counter(&iter->counters);
@@ -1407,7 +1388,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
 		++i;
 	}
 	xt_write_recseq_end(addend);
-
  unlock_up_free:
 	local_bh_enable();
 	xt_table_unlock(t);
@@ -1750,7 +1730,7 @@ translate_compat_table(struct net *net,
 		newinfo->hook_entry[i] = info->hook_entry[i];
 		newinfo->underflow[i] = info->underflow[i];
 	}
-	entry1 = newinfo->entries[raw_smp_processor_id()];
+	entry1 = newinfo->entries;
 	pos = entry1;
 	size = total_size;
 	xt_entry_foreach(iter0, entry0, total_size) {
@@ -1802,11 +1782,6 @@ translate_compat_table(struct net *net,
 		return ret;
 	}
 
-	/* And one copy for every other CPU */
-	for_each_possible_cpu(i)
-		if (newinfo->entries[i] && newinfo->entries[i] != entry1)
-			memcpy(newinfo->entries[i], entry1, newinfo->size);
-
 	*pinfo = newinfo;
 	*pentry0 = entry1;
 	xt_free_table_info(info);
@@ -1853,8 +1828,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
 	if (!newinfo)
 		return -ENOMEM;
 
-	/* choose the copy that is on our node/cpu */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
 			   tmp.size) != 0) {
 		ret = -EFAULT;
@@ -1925,7 +1899,6 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
 	void __user *pos;
 	unsigned int size;
 	int ret = 0;
-	const void *loc_cpu_entry;
 	unsigned int i = 0;
 	struct ip6t_entry *iter;
 
@@ -1933,14 +1906,9 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
 	if (IS_ERR(counters))
 		return PTR_ERR(counters);
 
-	/* choose the copy that is on our node/cpu, ...
-	 * This choice is lazy (because current thread is
-	 * allowed to migrate to another cpu)
-	 */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
 	pos = userptr;
 	size = total_size;
-	xt_entry_foreach(iter, loc_cpu_entry, total_size) {
+	xt_entry_foreach(iter, private->entries, total_size) {
 		ret = compat_copy_entry_to_user(iter, &pos,
 						&size, counters, i++);
 		if (ret != 0)
@@ -2115,8 +2083,7 @@ struct xt_table *ip6t_register_table(struct net *net,
 		goto out;
 	}
 
-	/* choose the copy on our node/cpu, but dont care about preemption */
-	loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+	loc_cpu_entry = newinfo->entries;
 	memcpy(loc_cpu_entry, repl->entries, repl->size);
 
 	ret = translate_table(net, newinfo, loc_cpu_entry, repl);
@@ -2146,7 +2113,7 @@ void ip6t_unregister_table(struct net *net, struct xt_table *table)
 	private = xt_unregister_table(table);
 
 	/* Decrease module usage counts and free resources */
-	loc_cpu_entry = private->entries[raw_smp_processor_id()];
+	loc_cpu_entry = private->entries;
 	xt_entry_foreach(iter, loc_cpu_entry, private->size)
 		cleanup_entry(iter, net);
 	if (private->number > private->initial_entries)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8303246..6062ce3 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -659,7 +659,6 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
 struct xt_table_info *xt_alloc_table_info(unsigned int size)
 {
 	struct xt_table_info *newinfo;
-	int cpu;
 
 	/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
 	if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
@@ -671,19 +670,14 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
 
 	newinfo->size = size;
 
-	for_each_possible_cpu(cpu) {
-		if (size <= PAGE_SIZE)
-			newinfo->entries[cpu] = kmalloc_node(size,
-							GFP_KERNEL,
-							cpu_to_node(cpu));
-		else
-			newinfo->entries[cpu] = vmalloc_node(size,
-							cpu_to_node(cpu));
+	if (size <= PAGE_SIZE)
+		newinfo->entries = kmalloc(size, GFP_KERNEL);
+	else
+		newinfo->entries = vmalloc(size);
 
-		if (newinfo->entries[cpu] == NULL) {
-			xt_free_table_info(newinfo);
-			return NULL;
-		}
+	if (newinfo->entries == NULL) {
+		xt_free_table_info(newinfo);
+		return NULL;
 	}
 
 	return newinfo;
@@ -694,8 +688,7 @@ void xt_free_table_info(struct xt_table_info *info)
 {
 	int cpu;
 
-	for_each_possible_cpu(cpu)
-		kvfree(info->entries[cpu]);
+	kvfree(info->entries);
 
 	if (info->jumpstack != NULL) {
 		for_each_possible_cpu(cpu)
-- 
2.0.5



* Re: [PATCH nf-next 1/2] netfilter: xtables: use percpu rule counters
From: Eric Dumazet @ 2015-06-08 18:59 UTC
  To: Florian Westphal; +Cc: netfilter-devel

On Mon, 2015-06-08 at 19:27 +0200, Florian Westphal wrote:
> The binary arp/ip/ip6tables ruleset is stored per cpu.
> 
> The only remaining reason for the percpu duplication is the rule
> counters embedded into ipt_entry et al. -- since each cpu has its own
> copy of the rules, all counters can be lockless.
> 
> The downside is that the more cpus are supported, the more memory is
> required.  Rules are not just duplicated per online cpu but for each
> possible cpu, i.e. if maxcpu is 144, the ruleset is duplicated 144
> times, not just for the e.g. 64 cores actually present.
> 
> To save some memory and also improve utilization of shared caches it
> would be preferable to only store the rule blob once.
> 
> So we first need to separate counters and the rule blob.
> 
> Instead of using entry->counters, allocate this percpu and store the
> percpu address in entry->counters.pcnt on CONFIG_SMP.
> 
> This change makes no sense as-is; it is merely an intermediate step to
> remove the percpu duplication of the rule set in a followup patch.
> 
> Suggested-by: Eric Dumazet <edumazet@google.com>
> Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
>  include/linux/netfilter/x_tables.h | 51 ++++++++++++++++++++++++++++++++++++++
>  net/ipv4/netfilter/arp_tables.c    | 32 ++++++++++++++++++++----
>  net/ipv4/netfilter/ip_tables.c     | 31 ++++++++++++++++++++---
>  net/ipv6/netfilter/ip6_tables.c    | 31 +++++++++++++++++++----
>  4 files changed, 131 insertions(+), 14 deletions(-)
> 
> diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
> index 09f3820..e438c43 100644
> --- a/include/linux/netfilter/x_tables.h
> +++ b/include/linux/netfilter/x_tables.h
> @@ -353,6 +353,57 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
>  	return ret;
>  }
>  
> +
> +/* On SMP, ip(6)t_entry->counters.pcnt holds address of the
> + * real (percpu) counter.  On !SMP, its just the packet count,
> + * so nothing needs to be done there.
> + *
> + * xt_percpu_counter_alloc returns the address of the percpu
> + * counter, or 0 on !SMP.
> + *
> + * Hence caller must use IS_ERR_VALUE to check for error, this
> + * allows us to contain CONFIG_SMP kludgery.
> + */
> +static inline u64 xt_percpu_counter_alloc(void)
> +{
> +#ifdef CONFIG_SMP
> +	void *res = alloc_percpu(struct xt_counters);
> +
> +	if (res == NULL)
> +		return (u64) -ENOMEM;
> +
> +	return (__force u64) res;
> +#else
> +	return 0;
> +#endif
> +}


The u64 stuff looks strange on a 32bit kernel ;)

> +static inline void xt_percpu_counter_free(u64 pcnt)
> +{
> +#ifdef CONFIG_SMP
> +	free_percpu((__force void *) pcnt);
> +#endif
> +}
> +
> +static inline struct xt_counters *
> +xt_get_this_cpu_counter(struct xt_counters *cnt)
> +{
> +#ifdef CONFIG_SMP
> +	return this_cpu_ptr((void *) cnt->pcnt);
> +#else
> +	return cnt;
> +#endif
> +}
> +
> +static inline struct xt_counters *
> +xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
> +{
> +#ifdef CONFIG_SMP
> +	return per_cpu_ptr((void *) cnt->pcnt, cpu);
> +#else
> +	return cnt;
> +#endif
> +}
> +

So I understand struct xt_counters is exported to user space
(include/uapi/linux/netfilter/x_tables.h) 


But maybe you could use in the kernel 

union xt_kcounters {
	struct xt_counters __percpu *pcpu;
	struct xt_counters counters;
};



This way, you could avoid a lot of casts ?

Please run sparse to make sure you did not miss an annotation.

include/linux/netfilter/x_tables.h:370:21: warning: incorrect type in initializer (different address spaces)
include/linux/netfilter/x_tables.h:370:21:    expected void *res
include/linux/netfilter/x_tables.h:370:21:    got struct xt_counters [noderef] <asn:3>*<noident>
include/linux/netfilter/x_tables.h:391:16: warning: incorrect type in initializer (different address spaces)
include/linux/netfilter/x_tables.h:391:16:    expected void const [noderef] <asn:3>*__vpp_verify
include/linux/netfilter/x_tables.h:391:16:    got void *<noident>
include/linux/netfilter/x_tables.h:383:22: warning: incorrect type in argument 1 (different address spaces)
include/linux/netfilter/x_tables.h:383:22:    expected void [noderef] <asn:3>*__pdata
include/linux/netfilter/x_tables.h:383:22:    got void *<noident>
include/linux/netfilter/x_tables.h:383:22: warning: incorrect type in argument 1 (different address spaces)
include/linux/netfilter/x_tables.h:383:22:    expected void [noderef] <asn:3>*__pdata
include/linux/netfilter/x_tables.h:383:22:    got void *<noident>
include/linux/netfilter/x_tables.h:401:16: warning: incorrect type in initializer (different address spaces)
include/linux/netfilter/x_tables.h:401:16:    expected void const [noderef] <asn:3>*__vpp_verify
include/linux/netfilter/x_tables.h:401:16:    got void *<noident>

Otherwise, patch looks absolutely great, thanks Florian !




* Re: [PATCH nf-next 1/2] netfilter: xtables: use percpu rule counters
From: Florian Westphal @ 2015-06-08 19:54 UTC
  To: Eric Dumazet; +Cc: Florian Westphal, netfilter-devel

Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > The only remaining reason for the percpu duplication is the rule
> > counters embedded into ipt_entry et al. -- since each cpu has its own
> > copy of the rules, all counters can be lockless.
> > 
> > The downside is that the more cpus are supported, the more memory is
> > required.  Rules are not just duplicated per online cpu but for each
> > possible cpu, i.e. if maxcpu is 144, the ruleset is duplicated 144
> > times, not just for the e.g. 64 cores actually present.
> > 
> > To save some memory and also improve utilization of shared caches it
> > would be preferable to only store the rule blob once.
> > 
> > So we first need to separate counters and the rule blob.
> > 
> > Instead of using entry->counters, allocate this percpu and store the
> > percpu address in entry->counters.pcnt on CONFIG_SMP.
> > 
> > This change makes no sense as-is; it is merely an intermediate step to
> > remove the percpu duplication of the rule set in a followup patch.
> > 
> > Suggested-by: Eric Dumazet <edumazet@google.com>
> > Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > Signed-off-by: Florian Westphal <fw@strlen.de>
> > ---
[..]

> > +++ b/include/linux/netfilter/x_tables.h
> > +/* On SMP, ip(6)t_entry->counters.pcnt holds address of the
> > + * real (percpu) counter.  On !SMP, its just the packet count,
> > + * so nothing needs to be done there.
> > + *
> > + * xt_percpu_counter_alloc returns the address of the percpu
> > + * counter, or 0 on !SMP.
> > + *
> > + * Hence caller must use IS_ERR_VALUE to check for error, this
> > + * allows us to contain CONFIG_SMP kludgery.
> > + */
> > +static inline u64 xt_percpu_counter_alloc(void)
> > +{
> > +#ifdef CONFIG_SMP
> > +	void *res = alloc_percpu(struct xt_counters);
> > +
> > +	if (res == NULL)
> > +		return (u64) -ENOMEM;
> > +
> > +	return (__force u64) res;
> > +#else
> > +	return 0;
> > +#endif
> > +}
> 
> 
> The u64 stuff looks strange on a 32bit kernel ;)

Hmmm, not sure how to avoid that.
The packet and byte counters are always u64, and I don't
think it will help to provide different helpers for 32 vs. 64bit systems.

> > +static inline void xt_percpu_counter_free(u64 pcnt)
> > +{
> > +#ifdef CONFIG_SMP
> > +	free_percpu((__force void *) pcnt);
> > +#endif
> > +}
> > +
> > +static inline struct xt_counters *
> > +xt_get_this_cpu_counter(struct xt_counters *cnt)
> > +{
> > +#ifdef CONFIG_SMP
> > +	return this_cpu_ptr((void *) cnt->pcnt);
> > +#else
> > +	return cnt;
> > +#endif
> > +}
> > +
> > +static inline struct xt_counters *
> > +xt_get_per_cpu_counter(struct xt_counters *cnt, unsigned int cpu)
> > +{
> > +#ifdef CONFIG_SMP
> > +	return per_cpu_ptr((void *) cnt->pcnt, cpu);
> > +#else
> > +	return cnt;
> > +#endif
> > +}
> > +
> 
> So I understand struct xt_counters is exported to user space
> (include/uapi/linux/netfilter/x_tables.h) 
>
> But maybe you could use in the kernel 
> 
> union xt_kcounters {
> 	struct xt_counters __percpu *pcpu;
> 	struct xt_counters counters;
> };
> 
> This way, you could avoid a lot of casts ?

I tried that, but I did not find a way to use it to improve
readability :-|

static inline bool xt_percpu_counter_alloc(union xt_kcounters *cnt)
{
#ifdef CONFIG_SMP
	cnt->pcpu = alloc_percpu(struct xt_counters);

	return cnt->pcpu != NULL;
#else
	return true;
#endif
}

... looks ok BUT it means call site looks like

if (xt_percpu_counter_alloc((union xt_kcounters *) &e->counters))
	/* err */

which I find worse :-/

I could add an anon union in ipt_entry to avoid the casts, but I remember
there were build failures with anon unions & older gcc releases.
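
For reference, the idea was roughly this (untested sketch; 'pcpu' is a
made-up name here, and __percpu could not appear in the uapi header
as-is):

struct ipt_entry {
	...
	union {
		struct xt_counters counters;		/* uapi view */
		struct xt_counters __percpu *pcpu;	/* kernel, SMP only */
	};
	unsigned char elems[0];
};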

Do you have an idea how to avoid leaking the percpu yes|no details
into ip(6)_tables.c while still keeping the casts confined to the
static inline helpers above?

> Please run sparse to make sure you did not miss an annotation.
> 
> include/linux/netfilter/x_tables.h:370:21: warning: incorrect type in initializer (different address spaces)
> include/linux/netfilter/x_tables.h:370:21:    expected void *res
> include/linux/netfilter/x_tables.h:370:21:    got struct xt_counters [noderef] <asn:3>*<noident>

Ugh.  I sprinkled a few more __percpu annotations on x_tables.h and
these are gone now.
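
Roughly (not the exact hunk, but the direction: keep the u64 in
struct xt_counters and confine the address-space casts to the helpers):

static inline u64 xt_percpu_counter_alloc(void)
{
#ifdef CONFIG_SMP
	struct xt_counters __percpu *res = alloc_percpu(struct xt_counters);

	if (res == NULL)
		return (u64) -ENOMEM;

	return (__force u64) res;
#else
	return 0;
#endif
}

static inline struct xt_counters *
xt_get_this_cpu_counter(struct xt_counters *cnt)
{
#ifdef CONFIG_SMP
	return this_cpu_ptr((void __percpu *) (unsigned long) cnt->pcnt);
#else
	return cnt;
#endif
}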

Thanks Eric.
