* [PATCH v2 -next 1/2] netfilter: iptables: separate counters from iptables rules
From: Florian Westphal @ 2015-05-28 20:51 UTC (permalink / raw)
To: netfilter-devel; +Cc: Florian Westphal
The binary arp/ip/ip6tables ruleset is stored per cpu.
The only reason left as to why we need the percpu duplication is the rule
counters embedded into ipt_entry et al. -- since each cpu has its own copy
of the rules, all counters can be lockless.
The downside is that the more cpus are supported, the more memory is
required. Rules are not just duplicated per online cpu but for each
possible cpu, i.e. if maxcpu is 144, then the ruleset is duplicated 144
times, not just for the e.g. 64 cores actually present.
To save some memory and also allow cpus with shared caches to make
better use of available cache size, it would be preferable to only
store a copy of the rule blob for each numa node.
So we first need to separate counters and the rule blob.
We create an array of struct xt_counters for each possible cpu and
index it from the main blob via the (unused after validation)
->comefrom member.
Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
Changes since v1:
- add ->comefrom comment in arptables, too
include/linux/netfilter/x_tables.h | 6 ++++++
net/ipv4/netfilter/arp_tables.c | 33 +++++++++++++++---------------
net/ipv4/netfilter/ip_tables.c | 31 ++++++++++++++--------------
net/ipv6/netfilter/ip6_tables.c | 32 ++++++++++++++---------------
net/netfilter/x_tables.c | 42 ++++++++++++++++++++++++++++++++++++++
5 files changed, 95 insertions(+), 49 deletions(-)
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 09f3820..e50ba76 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -224,6 +224,12 @@ struct xt_table_info {
unsigned int stacksize;
unsigned int __percpu *stackptr;
void ***jumpstack;
+
+ /* pointer to array of counters, one per CPU
+ * each rule maps 1:1 to an entry in the percpu counter array.
+ */
+ struct xt_counters **counters;
+
/* ipt_entry tables: one per CPU */
/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
void *entries[1];
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 13bfe84..d92f8a6 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -259,7 +259,8 @@ unsigned int arpt_do_table(struct sk_buff *skb,
void *table_base;
const struct xt_table_info *private;
struct xt_action_param acpar;
- unsigned int addend;
+ struct xt_counters *counters;
+ unsigned int addend, cpu;
if (!pskb_may_pull(skb, arp_hdr_len(skb->dev)))
return NF_DROP;
@@ -270,12 +271,14 @@ unsigned int arpt_do_table(struct sk_buff *skb,
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
+ cpu = smp_processor_id();
/*
* Ensure we load private-> members after we've fetched the base
* pointer.
*/
smp_read_barrier_depends();
table_base = private->entries[smp_processor_id()];
+ counters = private->counters[cpu];
e = get_entry(table_base, private->hook_entry[hook]);
back = get_entry(table_base, private->underflow[hook]);
@@ -295,7 +298,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
continue;
}
- ADD_COUNTER(e->counters, arp_hdr_len(skb->dev), 1);
+ ADD_COUNTER(counters[e->comefrom], skb->len, 1);
t = arpt_get_target_c(e);
@@ -690,6 +693,8 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
ret = find_check_entry(iter, repl->name, repl->size);
if (ret != 0)
break;
+ /* overload comefrom to index into percpu counters array */
+ iter->comefrom = i;
++i;
}
@@ -714,26 +719,24 @@ static int translate_table(struct xt_table_info *newinfo, void *entry0,
static void get_counters(const struct xt_table_info *t,
struct xt_counters counters[])
{
- struct arpt_entry *iter;
unsigned int cpu;
unsigned int i;
for_each_possible_cpu(cpu) {
+ struct xt_counters *pcpu_counters = t->counters[cpu];
seqcount_t *s = &per_cpu(xt_recseq, cpu);
- i = 0;
- xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ for (i = 0; i < t->number; i++) {
u64 bcnt, pcnt;
unsigned int start;
do {
start = read_seqcount_begin(s);
- bcnt = iter->counters.bcnt;
- pcnt = iter->counters.pcnt;
+ bcnt = pcpu_counters[i].bcnt;
+ pcnt = pcpu_counters[i].pcnt;
} while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt);
- ++i;
}
}
}
@@ -1114,7 +1117,7 @@ static int do_add_counters(struct net *net, const void __user *user,
{
unsigned int i, curcpu;
struct xt_counters_info tmp;
- struct xt_counters *paddc;
+ struct xt_counters *paddc, *pcpu_counters;
unsigned int num_counters;
const char *name;
int size;
@@ -1122,8 +1125,6 @@ static int do_add_counters(struct net *net, const void __user *user,
struct xt_table *t;
const struct xt_table_info *private;
int ret = 0;
- void *loc_cpu_entry;
- struct arpt_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT
struct compat_xt_counters_info compat_tmp;
@@ -1180,12 +1181,10 @@ static int do_add_counters(struct net *net, const void __user *user,
i = 0;
/* Choose the copy that is on our node */
curcpu = smp_processor_id();
- loc_cpu_entry = private->entries[curcpu];
+ pcpu_counters = private->counters[curcpu];
addend = xt_write_recseq_begin();
- xt_entry_foreach(iter, loc_cpu_entry, private->size) {
- ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
- ++i;
- }
+ for (i = 0; i < private->number ; i++)
+ ADD_COUNTER(pcpu_counters[i], paddc[i].bcnt, paddc[i].pcnt);
xt_write_recseq_end(addend);
unlock_up_free:
local_bh_enable();
@@ -1416,6 +1415,8 @@ static int translate_compat_table(const char *name,
ret = check_target(iter1, name);
if (ret != 0)
break;
+ /* overload comefrom to index into percpu counters array */
+ iter1->comefrom = i;
++i;
if (strcmp(arpt_get_target(iter1)->u.user.name,
XT_ERROR_TARGET) == 0)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index 583779f..a68c377 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -301,6 +301,7 @@ ipt_do_table(struct sk_buff *skb,
unsigned int *stackptr, origptr, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
+ struct xt_counters *counters;
unsigned int addend;
/* Initialization */
@@ -335,6 +336,7 @@ ipt_do_table(struct sk_buff *skb,
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
stackptr = per_cpu_ptr(private->stackptr, cpu);
origptr = *stackptr;
+ counters = private->counters[cpu];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -361,7 +363,7 @@ ipt_do_table(struct sk_buff *skb,
goto no_match;
}
- ADD_COUNTER(e->counters, skb->len, 1);
+ ADD_COUNTER(counters[e->comefrom], skb->len, 1);
t = ipt_get_target(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -854,6 +856,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
ret = find_check_entry(iter, net, repl->name, repl->size);
if (ret != 0)
break;
+ /* overload comefrom to index into percpu counters array */
+ iter->comefrom = i;
++i;
}
@@ -879,26 +883,24 @@ static void
get_counters(const struct xt_table_info *t,
struct xt_counters counters[])
{
- struct ipt_entry *iter;
unsigned int cpu;
unsigned int i;
for_each_possible_cpu(cpu) {
+ struct xt_counters *pcpu_counters = t->counters[cpu];
seqcount_t *s = &per_cpu(xt_recseq, cpu);
- i = 0;
- xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ for (i = 0; i < t->number; i++) {
u64 bcnt, pcnt;
unsigned int start;
do {
start = read_seqcount_begin(s);
- bcnt = iter->counters.bcnt;
- pcnt = iter->counters.pcnt;
+ bcnt = pcpu_counters[i].bcnt;
+ pcnt = pcpu_counters[i].pcnt;
} while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt);
- ++i; /* macro does multi eval of i */
}
}
}
@@ -1302,7 +1304,7 @@ do_add_counters(struct net *net, const void __user *user,
{
unsigned int i, curcpu;
struct xt_counters_info tmp;
- struct xt_counters *paddc;
+ struct xt_counters *paddc, *pcpu_counters;
unsigned int num_counters;
const char *name;
int size;
@@ -1310,8 +1312,6 @@ do_add_counters(struct net *net, const void __user *user,
struct xt_table *t;
const struct xt_table_info *private;
int ret = 0;
- void *loc_cpu_entry;
- struct ipt_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT
struct compat_xt_counters_info compat_tmp;
@@ -1365,15 +1365,12 @@ do_add_counters(struct net *net, const void __user *user,
goto unlock_up_free;
}
- i = 0;
/* Choose the copy that is on our node */
curcpu = smp_processor_id();
- loc_cpu_entry = private->entries[curcpu];
+ pcpu_counters = private->counters[curcpu];
addend = xt_write_recseq_begin();
- xt_entry_foreach(iter, loc_cpu_entry, private->size) {
- ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
- ++i;
- }
+ for (i = 0; i < private->number ; i++)
+ ADD_COUNTER(pcpu_counters[i], paddc[i].bcnt, paddc[i].pcnt);
xt_write_recseq_end(addend);
unlock_up_free:
local_bh_enable();
@@ -1736,6 +1733,8 @@ translate_compat_table(struct net *net,
ret = compat_check_entry(iter1, net, name);
if (ret != 0)
break;
+ /* overload comefrom to index into percpu counters array */
+ iter1->comefrom = i;
++i;
if (strcmp(ipt_get_target(iter1)->u.user.name,
XT_ERROR_TARGET) == 0)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index d54f049..69aec1d 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -329,6 +329,7 @@ ip6t_do_table(struct sk_buff *skb,
unsigned int *stackptr, origptr, cpu;
const struct xt_table_info *private;
struct xt_action_param acpar;
+ struct xt_counters *counters;
unsigned int addend;
/* Initialization */
@@ -361,6 +362,7 @@ ip6t_do_table(struct sk_buff *skb,
jumpstack = (struct ip6t_entry **)private->jumpstack[cpu];
stackptr = per_cpu_ptr(private->stackptr, cpu);
origptr = *stackptr;
+ counters = private->counters[cpu];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -384,7 +386,7 @@ ip6t_do_table(struct sk_buff *skb,
goto no_match;
}
- ADD_COUNTER(e->counters, skb->len, 1);
+ ADD_COUNTER(counters[e->comefrom], skb->len, 1);
t = ip6t_get_target_c(e);
IP_NF_ASSERT(t->u.kernel.target);
@@ -867,6 +869,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
ret = find_check_entry(iter, net, repl->name, repl->size);
if (ret != 0)
break;
+ /* overload comefrom to index into percpu counters array */
+ iter->comefrom = i;
++i;
}
@@ -892,26 +896,24 @@ static void
get_counters(const struct xt_table_info *t,
struct xt_counters counters[])
{
- struct ip6t_entry *iter;
unsigned int cpu;
unsigned int i;
for_each_possible_cpu(cpu) {
+ struct xt_counters *pcpu_counters = t->counters[cpu];
seqcount_t *s = &per_cpu(xt_recseq, cpu);
- i = 0;
- xt_entry_foreach(iter, t->entries[cpu], t->size) {
+ for (i = 0; i < t->number; i++) {
u64 bcnt, pcnt;
unsigned int start;
do {
start = read_seqcount_begin(s);
- bcnt = iter->counters.bcnt;
- pcnt = iter->counters.pcnt;
+ bcnt = pcpu_counters[i].bcnt;
+ pcnt = pcpu_counters[i].pcnt;
} while (read_seqcount_retry(s, start));
ADD_COUNTER(counters[i], bcnt, pcnt);
- ++i;
}
}
}
@@ -1315,7 +1317,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
{
unsigned int i, curcpu;
struct xt_counters_info tmp;
- struct xt_counters *paddc;
+ struct xt_counters *paddc, *pcpu_counters;
unsigned int num_counters;
char *name;
int size;
@@ -1323,8 +1325,6 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
struct xt_table *t;
const struct xt_table_info *private;
int ret = 0;
- const void *loc_cpu_entry;
- struct ip6t_entry *iter;
unsigned int addend;
#ifdef CONFIG_COMPAT
struct compat_xt_counters_info compat_tmp;
@@ -1379,17 +1379,13 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len,
goto unlock_up_free;
}
- i = 0;
/* Choose the copy that is on our node */
curcpu = smp_processor_id();
+ pcpu_counters = private->counters[curcpu];
addend = xt_write_recseq_begin();
- loc_cpu_entry = private->entries[curcpu];
- xt_entry_foreach(iter, loc_cpu_entry, private->size) {
- ADD_COUNTER(iter->counters, paddc[i].bcnt, paddc[i].pcnt);
- ++i;
- }
+ for (i = 0; i < private->number ; i++)
+ ADD_COUNTER(pcpu_counters[i], paddc[i].bcnt, paddc[i].pcnt);
xt_write_recseq_end(addend);
-
unlock_up_free:
local_bh_enable();
xt_table_unlock(t);
@@ -1749,6 +1745,8 @@ translate_compat_table(struct net *net,
ret = compat_check_entry(iter1, net, name);
if (ret != 0)
break;
+ /* overload comefrom to index into percpu counters array */
+ iter1->comefrom = i;
++i;
if (strcmp(ip6t_get_target(iter1)->u.user.name,
XT_ERROR_TARGET) == 0)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8303246..28e3396 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -697,6 +697,12 @@ void xt_free_table_info(struct xt_table_info *info)
for_each_possible_cpu(cpu)
kvfree(info->entries[cpu]);
+ if (info->counters != NULL) {
+ for_each_possible_cpu(cpu)
+ kvfree(info->counters[cpu]);
+ kvfree(info->counters);
+ }
+
if (info->jumpstack != NULL) {
for_each_possible_cpu(cpu)
kvfree(info->jumpstack[cpu]);
@@ -747,6 +753,36 @@ EXPORT_SYMBOL_GPL(xt_compat_unlock);
DEFINE_PER_CPU(seqcount_t, xt_recseq);
EXPORT_PER_CPU_SYMBOL_GPL(xt_recseq);
+static int xt_counters_alloc(struct xt_table_info *i)
+{
+ unsigned int size;
+ int cpu;
+
+ size = sizeof(void *) * nr_cpu_ids;
+ if (size > PAGE_SIZE)
+ i->counters = vzalloc(size);
+ else
+ i->counters = kzalloc(size, GFP_KERNEL);
+
+ if (i->counters == NULL)
+ return -ENOMEM;
+
+ size = sizeof(struct xt_counters) * i->number;
+
+ for_each_possible_cpu(cpu) {
+ if (size > PAGE_SIZE)
+ i->counters[cpu] = vzalloc_node(size,
+ cpu_to_node(cpu));
+ else
+ i->counters[cpu] = kzalloc_node(size,
+ GFP_KERNEL, cpu_to_node(cpu));
+ if (i->counters[cpu] == NULL)
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
static int xt_jumpstack_alloc(struct xt_table_info *i)
{
unsigned int size;
@@ -794,6 +830,12 @@ xt_replace_table(struct xt_table *table,
struct xt_table_info *private;
int ret;
+ ret = xt_counters_alloc(newinfo);
+ if (ret < 0) {
+ *error = ret;
+ return NULL;
+ }
+
ret = xt_jumpstack_alloc(newinfo);
if (ret < 0) {
*error = ret;
--
2.0.5
* [PATCH v2 -next 2/2] netfilter: store rules per NUMA node instead of per cpu
From: Florian Westphal @ 2015-05-28 20:51 UTC (permalink / raw)
To: netfilter-devel; +Cc: Florian Westphal
We store the rule blob per (possible) cpu. Unfortunately this means we can
waste a lot of memory on big smp machines. The ipt_entry structure ('rule
head') is 112 bytes, so e.g. with maxcpu=64 a single rule eats close to 8k
of RAM.
Since the previous patch moved the counters to a separate percpu blob, it
appears there is nothing left in the rule blob that must be percpu.
Thus only duplicate the rule blob for each NUMA node.
On my test system (144 possible cpus, one numa node, 400k dummy rules) this
change saves close to 9 gigabytes of RAM.
Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
---
No changes since v1.
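As a rough sanity check of the ~9 GB figure above (an estimate only, assuming
a minimal rule of ipt_entry plus a standard target, i.e. roughly 150 bytes
per rule in the blob):

400,000 rules * ~150 bytes ~= 60 MB per copy of the blob
143 redundant copies * ~60 MB ~= 8.6 GB

which is in the same ballpark as the number quoted above.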
include/linux/netfilter/x_tables.h | 2 +-
net/ipv4/netfilter/arp_tables.c | 21 ++++++++++-----------
net/ipv4/netfilter/ip_tables.c | 30 +++++++++++++++---------------
net/ipv6/netfilter/ip6_tables.c | 30 +++++++++++++++---------------
net/netfilter/x_tables.c | 20 +++++++++-----------
5 files changed, 50 insertions(+), 53 deletions(-)
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index e50ba76..ff25664 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -230,7 +230,7 @@ struct xt_table_info {
*/
struct xt_counters **counters;
- /* ipt_entry tables: one per CPU */
+ /* ipt_entry tables: one per NUMA node */
/* Note : this field MUST be the last one, see XT_TABLE_INFO_SZ */
void *entries[1];
};
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index d92f8a6..301a147 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -277,7 +277,7 @@ unsigned int arpt_do_table(struct sk_buff *skb,
* pointer.
*/
smp_read_barrier_depends();
- table_base = private->entries[smp_processor_id()];
+ table_base = private->entries[cpu_to_node(cpu)];
counters = private->counters[cpu];
e = get_entry(table_base, private->hook_entry[hook]);
@@ -777,7 +777,7 @@ static int copy_entries_to_user(unsigned int total_size,
if (IS_ERR(counters))
return PTR_ERR(counters);
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries[numa_node_id()];
/* ... then copy entire thing ... */
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
@@ -875,7 +875,7 @@ static int compat_table_info(const struct xt_table_info *info,
/* we dont care about newinfo->entries[] */
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
- loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ loc_cpu_entry = info->entries[numa_node_id()];
xt_compat_init_offsets(NFPROTO_ARP, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1040,7 +1040,7 @@ static int __do_replace(struct net *net, const char *name,
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ loc_cpu_old_entry = oldinfo->entries[numa_node_id()];
xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
cleanup_entry(iter);
@@ -1085,7 +1085,7 @@ static int do_replace(struct net *net, const void __user *user,
return -ENOMEM;
/* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries[numa_node_id()];
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1178,7 +1178,6 @@ static int do_add_counters(struct net *net, const void __user *user,
goto unlock_up_free;
}
- i = 0;
/* Choose the copy that is on our node */
curcpu = smp_processor_id();
pcpu_counters = private->counters[curcpu];
@@ -1392,7 +1391,7 @@ static int translate_compat_table(const char *name,
newinfo->hook_entry[i] = info->hook_entry[i];
newinfo->underflow[i] = info->underflow[i];
}
- entry1 = newinfo->entries[raw_smp_processor_id()];
+ entry1 = newinfo->entries[numa_node_id()];
pos = entry1;
size = total_size;
xt_entry_foreach(iter0, entry0, total_size) {
@@ -1507,7 +1506,7 @@ static int compat_do_replace(struct net *net, void __user *user,
return -ENOMEM;
/* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries[numa_node_id()];
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) {
ret = -EFAULT;
goto free_newinfo;
@@ -1613,7 +1612,7 @@ static int compat_copy_entries_to_user(unsigned int total_size,
return PTR_ERR(counters);
/* choose the copy on our node/cpu */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries[numa_node_id()];
pos = userptr;
size = total_size;
xt_entry_foreach(iter, loc_cpu_entry, total_size) {
@@ -1786,7 +1785,7 @@ struct xt_table *arpt_register_table(struct net *net,
}
/* choose the copy on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries[numa_node_id()];
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(newinfo, loc_cpu_entry, repl);
@@ -1817,7 +1816,7 @@ void arpt_unregister_table(struct xt_table *table)
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries[numa_node_id()];
xt_entry_foreach(iter, loc_cpu_entry, private->size)
cleanup_entry(iter);
if (private->number > private->initial_entries)
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a68c377..6fa6213 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -261,7 +261,7 @@ static void trace_packet(const struct sk_buff *skb,
unsigned int rulenum = 0;
struct net *net = dev_net(in ? in : out);
- table_base = private->entries[smp_processor_id()];
+ table_base = private->entries[numa_node_id()];
root = get_entry(table_base, private->hook_entry[hook]);
hookname = chainname = hooknames[hook];
@@ -332,7 +332,7 @@ ipt_do_table(struct sk_buff *skb,
* pointer.
*/
smp_read_barrier_depends();
- table_base = private->entries[cpu];
+ table_base = private->entries[cpu_to_node(cpu)];
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
stackptr = per_cpu_ptr(private->stackptr, cpu);
origptr = *stackptr;
@@ -870,8 +870,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
+ /* And one copy for every other NUMA node */
+ for_each_node(i) {
if (newinfo->entries[i] && newinfo->entries[i] != entry0)
memcpy(newinfo->entries[i], entry0, newinfo->size);
}
@@ -945,7 +945,7 @@ copy_entries_to_user(unsigned int total_size,
* This choice is lazy (because current thread is
* allowed to migrate to another cpu)
*/
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries[numa_node_id()];
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
@@ -1062,7 +1062,7 @@ static int compat_table_info(const struct xt_table_info *info,
/* we dont care about newinfo->entries[] */
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
- loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ loc_cpu_entry = info->entries[numa_node_id()];
xt_compat_init_offsets(AF_INET, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1226,7 +1226,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ loc_cpu_old_entry = oldinfo->entries[numa_node_id()];
xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
cleanup_entry(iter, net);
@@ -1271,7 +1271,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
return -ENOMEM;
/* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries[numa_node_id()];
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1710,7 +1710,7 @@ translate_compat_table(struct net *net,
newinfo->hook_entry[i] = info->hook_entry[i];
newinfo->underflow[i] = info->underflow[i];
}
- entry1 = newinfo->entries[raw_smp_processor_id()];
+ entry1 = newinfo->entries[numa_node_id()];
pos = entry1;
size = total_size;
xt_entry_foreach(iter0, entry0, total_size) {
@@ -1764,8 +1764,8 @@ translate_compat_table(struct net *net,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i)
+ /* And one copy for every other NUMA node */
+ for_each_node(i)
if (newinfo->entries[i] && newinfo->entries[i] != entry1)
memcpy(newinfo->entries[i], entry1, newinfo->size);
@@ -1813,7 +1813,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -ENOMEM;
/* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries[numa_node_id()];
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1896,7 +1896,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
* This choice is lazy (because current thread is
* allowed to migrate to another cpu)
*/
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries[numa_node_id()];
pos = userptr;
size = total_size;
xt_entry_foreach(iter, loc_cpu_entry, total_size) {
@@ -2075,7 +2075,7 @@ struct xt_table *ipt_register_table(struct net *net,
}
/* choose the copy on our node/cpu, but dont care about preemption */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries[numa_node_id()];
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
@@ -2106,7 +2106,7 @@ void ipt_unregister_table(struct net *net, struct xt_table *table)
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries[numa_node_id()];
xt_entry_foreach(iter, loc_cpu_entry, private->size)
cleanup_entry(iter, net);
if (private->number > private->initial_entries)
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 69aec1d..7491315 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -290,7 +290,7 @@ static void trace_packet(const struct sk_buff *skb,
unsigned int rulenum = 0;
struct net *net = dev_net(in ? in : out);
- table_base = private->entries[smp_processor_id()];
+ table_base = private->entries[numa_node_id()];
root = get_entry(table_base, private->hook_entry[hook]);
hookname = chainname = hooknames[hook];
@@ -358,7 +358,7 @@ ip6t_do_table(struct sk_buff *skb,
*/
smp_read_barrier_depends();
cpu = smp_processor_id();
- table_base = private->entries[cpu];
+ table_base = private->entries[cpu_to_node(cpu)];
jumpstack = (struct ip6t_entry **)private->jumpstack[cpu];
stackptr = per_cpu_ptr(private->stackptr, cpu);
origptr = *stackptr;
@@ -883,8 +883,8 @@ translate_table(struct net *net, struct xt_table_info *newinfo, void *entry0,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i) {
+ /* And one copy for every other NUMA node */
+ for_each_node(i) {
if (newinfo->entries[i] && newinfo->entries[i] != entry0)
memcpy(newinfo->entries[i], entry0, newinfo->size);
}
@@ -958,7 +958,7 @@ copy_entries_to_user(unsigned int total_size,
* This choice is lazy (because current thread is
* allowed to migrate to another cpu)
*/
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries[numa_node_id()];
if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) {
ret = -EFAULT;
goto free_counters;
@@ -1075,7 +1075,7 @@ static int compat_table_info(const struct xt_table_info *info,
/* we dont care about newinfo->entries[] */
memcpy(newinfo, info, offsetof(struct xt_table_info, entries));
newinfo->initial_entries = 0;
- loc_cpu_entry = info->entries[raw_smp_processor_id()];
+ loc_cpu_entry = info->entries[numa_node_id()];
xt_compat_init_offsets(AF_INET6, info->number);
xt_entry_foreach(iter, loc_cpu_entry, info->size) {
ret = compat_calc_entry(iter, info, loc_cpu_entry, newinfo);
@@ -1239,7 +1239,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
get_counters(oldinfo, counters);
/* Decrease module usage counts and free resource */
- loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()];
+ loc_cpu_old_entry = oldinfo->entries[numa_node_id()];
xt_entry_foreach(iter, loc_cpu_old_entry, oldinfo->size)
cleanup_entry(iter, net);
@@ -1284,7 +1284,7 @@ do_replace(struct net *net, const void __user *user, unsigned int len)
return -ENOMEM;
/* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries[numa_node_id()];
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1722,7 +1722,7 @@ translate_compat_table(struct net *net,
newinfo->hook_entry[i] = info->hook_entry[i];
newinfo->underflow[i] = info->underflow[i];
}
- entry1 = newinfo->entries[raw_smp_processor_id()];
+ entry1 = newinfo->entries[numa_node_id()];
pos = entry1;
size = total_size;
xt_entry_foreach(iter0, entry0, total_size) {
@@ -1776,8 +1776,8 @@ translate_compat_table(struct net *net,
return ret;
}
- /* And one copy for every other CPU */
- for_each_possible_cpu(i)
+ /* And one copy for every other NUMA node */
+ for_each_node(i)
if (newinfo->entries[i] && newinfo->entries[i] != entry1)
memcpy(newinfo->entries[i], entry1, newinfo->size);
@@ -1825,7 +1825,7 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
return -ENOMEM;
/* choose the copy that is on our node/cpu */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries[numa_node_id()];
if (copy_from_user(loc_cpu_entry, user + sizeof(tmp),
tmp.size) != 0) {
ret = -EFAULT;
@@ -1908,7 +1908,7 @@ compat_copy_entries_to_user(unsigned int total_size, struct xt_table *table,
* This choice is lazy (because current thread is
* allowed to migrate to another cpu)
*/
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries[numa_node_id()];
pos = userptr;
size = total_size;
xt_entry_foreach(iter, loc_cpu_entry, total_size) {
@@ -2087,7 +2087,7 @@ struct xt_table *ip6t_register_table(struct net *net,
}
/* choose the copy on our node/cpu, but dont care about preemption */
- loc_cpu_entry = newinfo->entries[raw_smp_processor_id()];
+ loc_cpu_entry = newinfo->entries[numa_node_id()];
memcpy(loc_cpu_entry, repl->entries, repl->size);
ret = translate_table(net, newinfo, loc_cpu_entry, repl);
@@ -2117,7 +2117,7 @@ void ip6t_unregister_table(struct net *net, struct xt_table *table)
private = xt_unregister_table(table);
/* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries[raw_smp_processor_id()];
+ loc_cpu_entry = private->entries[numa_node_id()];
xt_entry_foreach(iter, loc_cpu_entry, private->size)
cleanup_entry(iter, net);
if (private->number > private->initial_entries)
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 28e3396..01549c1 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -659,7 +659,7 @@ EXPORT_SYMBOL_GPL(xt_compat_target_to_user);
struct xt_table_info *xt_alloc_table_info(unsigned int size)
{
struct xt_table_info *newinfo;
- int cpu;
+ int node;
/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
@@ -671,16 +671,14 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
newinfo->size = size;
- for_each_possible_cpu(cpu) {
+ for_each_node(node) {
if (size <= PAGE_SIZE)
- newinfo->entries[cpu] = kmalloc_node(size,
- GFP_KERNEL,
- cpu_to_node(cpu));
+ newinfo->entries[node] = kmalloc_node(size,
+ GFP_KERNEL, node);
else
- newinfo->entries[cpu] = vmalloc_node(size,
- cpu_to_node(cpu));
+ newinfo->entries[node] = vmalloc_node(size, node);
- if (newinfo->entries[cpu] == NULL) {
+ if (newinfo->entries[node] == NULL) {
xt_free_table_info(newinfo);
return NULL;
}
@@ -692,10 +690,10 @@ EXPORT_SYMBOL(xt_alloc_table_info);
void xt_free_table_info(struct xt_table_info *info)
{
- int cpu;
+ int cpu, node;
- for_each_possible_cpu(cpu)
- kvfree(info->entries[cpu]);
+ for_each_node(node)
+ kvfree(info->entries[node]);
if (info->counters != NULL) {
for_each_possible_cpu(cpu)
--
2.0.5
* Re: [PATCH v2 -next 1/2] netfilter: iptables: separate counters from iptables rules
From: Eric Dumazet @ 2015-05-28 21:33 UTC (permalink / raw)
To: Florian Westphal; +Cc: netfilter-devel
On Thu, 2015-05-28 at 22:51 +0200, Florian Westphal wrote:
> The binary arp/ip/ip6tables ruleset is stored per cpu.
>
> The only reason left as to why we need percpu duplication are the rule
> counters embedded into ipt_entry et al -- since each cpu has its own copy
> of the rules, all counters can be lockless.
>
> The downside is that the more cpus are supported, the more memory is
> required. Rules are not just duplicated per online cpu but for each
> possible cpu, i.e. if maxcpu is 144, then rule is duplicated 144 times,
> not for the e.g. 64 cores present.
>
> To save some memory and also allow cpus with shared caches to make
> better use of available cache size, it would be preferable to only
> store a copy of the rule blob for each numa node.
>
> So we first need to separate counters and the rule blob.
>
> We create array of struct xt_counters for each possible cpu and
> index them from the main blob via the (unused after validation)
> ->comefrom member.
Yeah, I remember trying to work on this exact idea some years ago,
but gave up (maybe because nftables was coming).
>
> Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
> Changes since v1:
> - add ->comefrom comment in arptables, too
>
> include/linux/netfilter/x_tables.h | 6 ++++++
> net/ipv4/netfilter/arp_tables.c | 33 +++++++++++++++---------------
> net/ipv4/netfilter/ip_tables.c | 31 ++++++++++++++--------------
> net/ipv6/netfilter/ip6_tables.c | 32 ++++++++++++++---------------
> net/netfilter/x_tables.c | 42 ++++++++++++++++++++++++++++++++++++++
> 5 files changed, 95 insertions(+), 49 deletions(-)
>
> diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
> index 09f3820..e50ba76 100644
> --- a/include/linux/netfilter/x_tables.h
> +++ b/include/linux/netfilter/x_tables.h
> @@ -224,6 +224,12 @@ struct xt_table_info {
> unsigned int stacksize;
> unsigned int __percpu *stackptr;
> void ***jumpstack;
> +
> + /* pointer to array of counters, one per CPU
> + * each rule maps 1:1 to an entry in the percpu counter array.
> + */
> + struct xt_counters **counters;
> +
You could avoid using this array if you use alloc_percpu(struct
xt_counters) per counter.
In the rules, instead of storing the index of each counter, store the
percpu address.
This would avoid yet another indirection in iptables.
And it would be nice to avoid this stuff on non-SMP kernels, maybe?
Thanks !
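For illustration, a minimal sketch of the per-rule alloc_percpu() idea above
(sketch only; 'pcpu_counter' is a hypothetical ipt_entry member that does not
exist in this series):

/* at translate/load time: one percpu counter pair per rule */
static int example_alloc_rule_counter(struct ipt_entry *e)
{
	e->pcpu_counter = alloc_percpu(struct xt_counters);
	return e->pcpu_counter == NULL ? -ENOMEM : 0;
}

/* fast path (runs with BH disabled, as ipt_do_table does):
 * the address is in the rule itself, no table-wide counter array.
 */
static void example_count(struct ipt_entry *e, const struct sk_buff *skb)
{
	struct xt_counters *ctr = this_cpu_ptr(e->pcpu_counter);

	ADD_COUNTER(*ctr, skb->len, 1);
}

On dump, each rule's percpu values would be summed into one struct
xt_counters before copying to userspace.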
* Re: [PATCH v2 -next 2/2] netfilter: store rules per NUMA node instead of per cpu
From: Eric Dumazet @ 2015-05-28 21:38 UTC (permalink / raw)
To: Florian Westphal; +Cc: netfilter-devel
On Thu, 2015-05-28 at 22:51 +0200, Florian Westphal wrote:
> We store rule blob per (possible) cpu. Unfortunately this means we can
> waste lot of memory on big smp machines. ipt_entry structure ('rule head')
> is 112 byte, so e.g. with maxcpu=64 one single rule eats close to 8k RAM.
>
> Since previous patch moved counters to separate percpu blob, it appears
> there is nothing left in the rule blob that must be percpu.
>
> Thus only duplicate the rule blob for each NUMA node.
>
> On my test system (144 possible cpus, one numa node, 400k dummy rules) this
> change saves close to 9 Gigabyte of RAM.
>
> Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> ---
Really, if the program is now read-only, I would keep a single copy in
memory.
Are we copying kernel text to each NUMA node? ;)
* Re: [PATCH v2 -next 1/2] netfilter: iptables: separate counters from iptables rules
From: Florian Westphal @ 2015-05-28 21:45 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Florian Westphal, netfilter-devel
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > +
> > + /* pointer to array of counters, one per CPU
> > + * each rule maps 1:1 to an entry in the percpu counter array.
> > + */
> > + struct xt_counters **counters;
> > +
>
> You could avoid using this array, if you use alloc_percpu(struct
> xt_counter) per counter.
I used this since it fits with the jumpstack allocation that we already
have.
Is there an inherent advantage to alloc_percpu?
[ I'm asking to see if it makes sense to convert jump stack too ].
> In the rules, instead of storing the index of each counter, store the
> percpu address.
-v
How? What address? You mean relative offset to counter start?
> This would avoid yet another indirection in iptables.
I don't see how I can avoid it.
When rule x matches, I need to increment the corresponding counter
for that rule.
But there is no 1:1 mapping of addresses, and I found no way to infer
the correct counter address to use for the rule just by looking at
the ipt_entry address.
That's why the entry stores the 'counter' index: to find the counter to
increment based on the current cpu's counter array and the rule number.
> And it would be nice avoiding this stuff on non SMP kernels maybe ?
Hmm. Will have to think about this to minimize ifdef kludgery.
But, yes, I agree. Will fix it in v3.
Thanks Eric.
* Re: [PATCH v2 -next 2/2] netfilter: store rules per NUMA node instead of per cpu
From: Florian Westphal @ 2015-05-28 21:52 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Florian Westphal, netfilter-devel
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2015-05-28 at 22:51 +0200, Florian Westphal wrote:
> > We store rule blob per (possible) cpu. Unfortunately this means we can
> > waste lot of memory on big smp machines. ipt_entry structure ('rule head')
> > is 112 byte, so e.g. with maxcpu=64 one single rule eats close to 8k RAM.
> >
> > Since previous patch moved counters to separate percpu blob, it appears
> > there is nothing left in the rule blob that must be percpu.
> >
> > Thus only duplicate the rule blob for each NUMA node.
> >
> > On my test system (144 possible cpus, one numa node, 400k dummy rules) this
> > change saves close to 9 Gigabyte of RAM.
> >
> > Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > Signed-off-by: Florian Westphal <fw@strlen.de>
> > ---
>
> Really if the program is now readonly, I would keep a single copy in
> memory.
Some matches (limit, for instance) store a kernel data ptr in their
matchinfo data (from the checkentry hook, not the per-packet match
function), so it's not 100% read-only.
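(For reference, the gist of what the limit match does at checkentry time,
simplified from xt_limit.c with details omitted:)

static int limit_mt_check(const struct xt_mtchk_param *par)
{
	struct xt_rateinfo *r = par->matchinfo;
	struct xt_limit_priv *priv;

	priv = kmalloc(sizeof(*priv), GFP_KERNEL);
	if (priv == NULL)
		return -ENOMEM;

	/* kernel pointer written into the rule blob itself */
	r->master = priv;
	return 0;
}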
> Are we copying kernel text to each NUMA node ? ;)
Beats me. I was under the impression that a cpu accessing memory on another
node takes an access penalty, that's why I changed it to per-node
allocation.
If so, I can respin it w/o the numa duplication; we can still add it
back later if needed.
* Re: [PATCH v2 -next 1/2] netfilter: iptables: separate counters from iptables rules
From: Eric Dumazet @ 2015-05-28 21:54 UTC (permalink / raw)
To: Florian Westphal; +Cc: netfilter-devel
On Thu, 2015-05-28 at 23:45 +0200, Florian Westphal wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > > +
> > > + /* pointer to array of counters, one per CPU
> > > + * each rule maps 1:1 to an entry in the percpu counter array.
> > > + */
> > > + struct xt_counters **counters;
> > > +
> >
> > You could avoid using this array, if you use alloc_percpu(struct
> > xt_counter) per counter.
>
> I used this since it fits with the jumpstack allocation that we already
> have.
>
> Is there an inherent advantage to alloc_percpu?
>
> [ I'm asking to see if it makes sense to convert jump stack too ].
>
> > In the rules, instead of storing the index of each counter, store the
> > percpu address.
>
> -v
>
> How? What address? You mean relative offset to counter start?
No, the address itself. This is what I coded years ago.
>
> > This would avoid yet another indirection in iptables.
>
> I don't see how I can avoid it.
>
> when rule x matches, I need to increment the corresponding counter
> for that rule.
Right. You have in the rule itself the storage for xt_counters. That's 16
bytes. You only need 8 bytes (on a 64bit arch) to store the percpu
address.
Instead of leaving nothing in it, place the percpu address of the
corresponding counter. Access to it is in the cache line needed to
analyze the entry anyway. It's free.
>
> But there is no 1:1 mapping of addresses, and I found no way to infer
> the correct counter address to use for the rule just by looking at
> ipt_entry address.
>
> Thats why the entry stores the 'counter' index: to find the counter to
> increment based on current cpu counter array and the rule number.
>
> > And it would be nice avoiding this stuff on non SMP kernels maybe ?
>
> Hmm. Will have to think about this to minimize ifdef kludgery.
> But, yes, I agree. Will fix it in v3.
>
> Thanks Eric.
* Re: [PATCH v2 -next 2/2] netfilter: store rules per NUMA node instead of per cpu
From: Eric Dumazet @ 2015-05-28 22:04 UTC (permalink / raw)
To: Florian Westphal; +Cc: netfilter-devel
On Thu, 2015-05-28 at 23:52 +0200, Florian Westphal wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > On Thu, 2015-05-28 at 22:51 +0200, Florian Westphal wrote:
> > > We store rule blob per (possible) cpu. Unfortunately this means we can
> > > waste lot of memory on big smp machines. ipt_entry structure ('rule head')
> > > is 112 byte, so e.g. with maxcpu=64 one single rule eats close to 8k RAM.
> > >
> > > Since previous patch moved counters to separate percpu blob, it appears
> > > there is nothing left in the rule blob that must be percpu.
> > >
> > > Thus only duplicate the rule blob for each NUMA node.
> > >
> > > On my test system (144 possible cpus, one numa node, 400k dummy rules) this
> > > change saves close to 9 Gigabyte of RAM.
> > >
> > > Reported-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
> > > Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
> > > Signed-off-by: Florian Westphal <fw@strlen.de>
> > > ---
> >
> > Really if the program is now readonly, I would keep a single copy in
> > memory.
>
> Some matches (limit for instance) store kernel data ptr in their
> matchinfo data (from checkentry hook, not per packet match function),
> so its not 100% readonly.
Let's change 'read only' to 'read mostly' then ;)
>
> > Are we copying kernel text to each NUMA node ? ;)
>
> Beats me. I was under impression that cpu accessing memory on other node
> takes access penalty, thats why I changed it to per node allocation.
Well, it depends. If one core is busy while others are idle, then
fetching data from 2 NUMA nodes is actually faster. (Some workloads are
actually faster with 'random' NUMA interleaving.)
If you constrain all memory accesses to the local node, then you
might lose total bandwidth. In practice, intensive workloads will
populate the L1/L2/L3 caches, and the actual memory location does not
really matter.
>
> Is it insignificant in practice?
Well, it's a trade-off, I guess.
You made clear that some people had 400k rules ;)
* Re: [PATCH v2 -next 2/2] netfilter: store rules per NUMA node instead of per cpu
From: Florian Westphal @ 2015-05-29 9:41 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Florian Westphal, netfilter-devel
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> On Thu, 2015-05-28 at 23:52 +0200, Florian Westphal wrote:
> > Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > > Are we copying kernel text to each NUMA node ? ;)
> >
> > Beats me. I was under impression that cpu accessing memory on other node
> > takes access penalty, thats why I changed it to per node allocation.
>
> Well, it depends. If one core is busy while others are idle, then
> fetching data from 2 NUMA nodes is actually faster. (Some workloads are
> actually faster with 'random' NUMA interleaving)
>
> If you constrain all memory access being done from local node, then you
> might loose total bandwidth. In practice, intensive workloads will
> populate L1/L2/L3 cache, and actual memory location does not really
> matter.
OK; convinced. I'll kill the per-numa part of the change, thanks Eric.
* Re: [PATCH v2 -next 1/2] netfilter: iptables: separate counters from iptables rules
From: Florian Westphal @ 2015-05-29 10:05 UTC (permalink / raw)
To: Eric Dumazet; +Cc: Florian Westphal, netfilter-devel
Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > How? What address? You mean relative offset to counter start?
>
> No, the address itself. This is what I coded years ago.
> > when rule x matches, I need to increment the corresponding counter
> > for that rule.
>
> Right. You have in the rule itself the storage for xt_counter. Thats 16
> bytes. You only need 8 bytes (on 64bit arch) to store the percpu
> address.
> Instead of leaving nothing in it, place the percpu address of the
> corresponding counter. access to it is in the cache line needed to
> analyze the entry anyway. Its free.
I'm dense. So, what you're saying is that I should
alloc_percpu(sizeof(struct xt_counters));
for each rule and store the resulting address in ipt_entry?
Then it would be possible to get the correct location for the current cpu
when the counter has to be incremented.
Thanks Eric.
* Re: [PATCH v2 -next 1/2] netfilter: iptables: separate counters from iptables rules
From: Eric Dumazet @ 2015-05-29 10:32 UTC (permalink / raw)
To: Florian Westphal; +Cc: netfilter-devel
On Fri, 2015-05-29 at 12:05 +0200, Florian Westphal wrote:
> Eric Dumazet <eric.dumazet@gmail.com> wrote:
> > > How? What address? You mean relative offset to counter start?
> >
> > No, the address itself. This is what I coded years ago.
> > > when rule x matches, I need to increment the corresponding counter
> > > for that rule.
> >
> > Right. You have in the rule itself the storage for xt_counter. Thats 16
> > bytes. You only need 8 bytes (on 64bit arch) to store the percpu
> > address.
>
> > Instead of leaving nothing in it, place the percpu address of the
> > corresponding counter. access to it is in the cache line needed to
> > analyze the entry anyway. Its free.
>
> I'm dense. So, what you're saying is that I should
>
> alloc_percpu(sizeof(struct xt_counters));
>
> for each rule and store the resulting address in ipt_entry?
> Then it would be possible to get the correct location for current cpu
> when counter has to be incremented.
>
> Thanks Eric.
That was my plan.
When looking at perf record/report, ipt_do_table spends a lot of time in
the setup phase fetching all these indirections:
IP_NF_ASSERT(table->valid_hooks & (1 << hook));
local_bh_disable();
addend = xt_write_recseq_begin();
private = table->private;
cpu = smp_processor_id();
/*
* Ensure we load private-> members after we've fetched the base
* pointer.
*/
smp_read_barrier_depends();
table_base = private->entries[cpu];
jumpstack = (struct ipt_entry **)private->jumpstack[cpu];
stackptr = per_cpu_ptr(private->stackptr, cpu);
origptr = *stackptr;
By storing the pointer to the percpu storage inside the ipt_entry,
you do not need an extra dereference on x86_64.
- ADD_COUNTER(e->counters, skb->len, 1);
+ ADD_COUNTER(counters[e->comefrom], skb->len, 1);
Would be:
ADD_COUNTER(e->pcpu_addr, skb->len, 1);
If you alias/union pcpu_addr and counters
You might need temporary arrays at load/dump times, but not in fast path.
x86_64 would then use
load e->pcpu_addr into register R (assume it is %rax)
load skb->len into %rdx
incq %gs:(%rax) // pcnt++
addq %rdx,%gs:8(%rax) // bcnt += skb->len
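A minimal C-level sketch of that aliasing (hypothetical names, not code from
this series; note ADD_COUNTER would need a this_cpu_add()-style variant to
actually emit the %gs-relative asm above):

/* sketch only -- hypothetical layout, not the uapi struct ipt_entry */
union example_counter_slot {
	struct xt_counters counters;            /* what userspace sees at dump time */
	struct xt_counters __percpu *pcpu_addr; /* what the fast path dereferences */
};

/* fast path, assuming 'slot' sits where e->counters sits today */
static void example_count(union example_counter_slot *slot,
			  const struct sk_buff *skb)
{
	struct xt_counters *ctr = this_cpu_ptr(slot->pcpu_addr);

	ADD_COUNTER(*ctr, skb->len, 1);
}

At dump time the percpu values would be summed into a plain xt_counters,
which is roughly what the temporary load/dump arrays mentioned above would
be for.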
* Re: [PATCH v2 -next 1/2] netfilter: iptables: separate counters from iptables rules
From: Patrick Schaaf @ 2015-05-29 11:32 UTC (permalink / raw)
To: Florian Westphal; +Cc: netfilter-devel
Hi Florian (+ list), (resend without HTML part...)
would it be feasible to have sysctl knobs to disable the counters?
The easiest approach might be to keep all the counter memory allocation
as it is (or as it is changed with your current work), and just not count at
packet processing time. That should make things a bit faster (no
cache pollution from the RMW counter access of any matching rules).
A more complicated approach might even save the whole counter
memory consumption, faking 0 values when returning counters to
userlevel, and ignoring userlevel-supplied values (iptables-restore).
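(A minimal sketch of the 'easiest approach' above, assuming a hypothetical
sysctl-backed flag -- no such knob exists today:)

/* sketch only: hypothetical knob, e.g. net.netfilter.xt_do_count */
static bool xt_do_count __read_mostly = true;

/* in ipt_do_table(), instead of counting unconditionally: */
if (READ_ONCE(xt_do_count))
	ADD_COUNTER(counters[e->comefrom], skb->len, 1);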
best regards
Patrick
* Re: [PATCH v2 -next 1/2] netfilter: iptables: separate counters from iptables rules
From: Florian Westphal @ 2015-06-05 12:28 UTC (permalink / raw)
To: Patrick Schaaf; +Cc: Florian Westphal, netfilter-devel
Patrick Schaaf <netdev@bof.de> wrote:
> Hi Florian (+ list), (resend without HTML part...)
>
> would it be feasible to have sysctl knobs to disable the counters?
>
> Easiest approach might be to keep all the counter memory allocation
> as it is (or as it is changed with your current work), and just not count at
> packet processing time. Which should make things a bit faster (no
> cache pollution for the RMW counter access of any matching rules.)
>
> More complicated approach might even save the whole counter
> memory consumption, faking 0 values when returning counters to
> userlevel, and ignoring userlevel supplied values (iptables-restore)
I'm not sure it's worth doing; nftables does the right thing already.
I'm merely looking at the percpu rule deduplication because the increase
in the number of cpus is starting to make it problematic from a memory
consumption point of view, and ip(6)tables isn't going to disappear anytime
soon.
Thread overview: 13+ messages
2015-05-28 20:51 [PATCH v2 -next 1/2] netfilter: iptables: separate counters from iptables rules Florian Westphal
2015-05-28 20:51 ` [PATCH v2 -next 2/2] netfilter: store rules per NUMA node instead of per cpu Florian Westphal
2015-05-28 21:38 ` Eric Dumazet
2015-05-28 21:52 ` Florian Westphal
2015-05-28 22:04 ` Eric Dumazet
2015-05-29 9:41 ` Florian Westphal
2015-05-28 21:33 ` [PATCH v2 -next 1/2] netfilter: iptables: separate counters from iptables rules Eric Dumazet
2015-05-28 21:45 ` Florian Westphal
2015-05-28 21:54 ` Eric Dumazet
2015-05-29 10:05 ` Florian Westphal
2015-05-29 10:32 ` Eric Dumazet
2015-05-29 11:32 ` Patrick Schaaf
2015-06-05 12:28 ` Florian Westphal