* [PATCH 0/3] netfilter fixes for 2.6.37
From: pablo @ 2011-01-11 23:26 UTC
To: davem; +Cc: netfilter-devel, Pablo Neira Ayuso
From: Pablo Neira Ayuso <pablo@netfilter.org>
Hi David,
The following patchset contains Netfilter bugfixes for 2.6.37.
You can pull them from:
git://1984.lsi.us.es/net-2.6 master
Please, apply!
Eric Dumazet (1):
netfilter: x_tables: dont block BH while reading counters
Florian Westphal (1):
netfilter: ebtables: make broute table work again
Stephen Hemminger (1):
netfilter: fix race in conntrack between dump_table and destroy
include/linux/if_bridge.h | 2 +-
include/linux/netfilter/x_tables.h | 10 ++++----
net/ipv4/netfilter/arp_tables.c | 45 ++++++++++-----------------------
net/ipv4/netfilter/ip_tables.c | 45 ++++++++++-----------------------
net/ipv6/netfilter/ip6_tables.c | 45 ++++++++++-----------------------
net/netfilter/nf_conntrack_netlink.c | 14 ++++-------
net/netfilter/x_tables.c | 3 +-
7 files changed, 55 insertions(+), 109 deletions(-)
* [PATCH 1/3] netfilter: x_tables: dont block BH while reading counters
From: pablo @ 2011-01-11 23:26 UTC
To: davem; +Cc: netfilter-devel, Eric Dumazet, Patrick McHardy, Pablo Neira Ayuso
From: Eric Dumazet <eric.dumazet@gmail.com>
Using "iptables -L" with a lot of rules have a too big BH latency.
Jesper mentioned ~6 ms and worried of frame drops.
Switch to a per_cpu seqlock scheme, so that taking a snapshot of
counters doesnt need to block BH (for this cpu, but also other cpus).
This adds two increments on seqlock sequence per ipt_do_table() call,
its a reasonable cost for allowing "iptables -L" not block BH
processing.
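As a rough illustration of the new read side, here is a minimal
user-space sketch of the same retry pattern. The struct and function
names are invented for this example, and the kernel's seqlock_t
primitives additionally handle memory barriers and lockdep details
that this model glosses over:

#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

struct counter {
        atomic_uint seq;        /* even = stable, odd = update in progress */
        uint64_t bcnt;          /* byte count */
        uint64_t pcnt;          /* packet count */
};

/* Writer (the packet path): make seq odd, update, make it even again. */
static void counter_add(struct counter *c, uint64_t bytes)
{
        atomic_fetch_add_explicit(&c->seq, 1, memory_order_acq_rel);
        c->bcnt += bytes;
        c->pcnt += 1;
        atomic_fetch_add_explicit(&c->seq, 1, memory_order_release);
}

/* Reader ("iptables -L"): retry until the snapshot is consistent. */
static void counter_snapshot(struct counter *c, uint64_t *bcnt, uint64_t *pcnt)
{
        unsigned int start;

        do {
                start = atomic_load_explicit(&c->seq, memory_order_acquire);
                *bcnt = c->bcnt;
                *pcnt = c->pcnt;
        } while ((start & 1) ||
                 atomic_load_explicit(&c->seq, memory_order_acquire) != start);
}

int main(void)
{
        struct counter c;
        uint64_t b, p;

        atomic_init(&c.seq, 0);
        c.bcnt = c.pcnt = 0;
        counter_add(&c, 1500);
        counter_snapshot(&c, &b, &p);
        printf("bcnt=%" PRIu64 " pcnt=%" PRIu64 "\n", b, p);
        return 0;
}

The writer never blocks: a reader that observes an odd or changed
sequence simply retries, which is why the dump side can now run
without disabling BH.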
Reported-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
CC: Patrick McHardy <kaber@trash.net>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Jesper Dangaard Brouer <hawk@comx.dk>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/linux/netfilter/x_tables.h | 10 ++++----
net/ipv4/netfilter/arp_tables.c | 45 +++++++++++------------------------
net/ipv4/netfilter/ip_tables.c | 45 +++++++++++------------------------
net/ipv6/netfilter/ip6_tables.c | 45 +++++++++++------------------------
net/netfilter/x_tables.c | 3 +-
5 files changed, 49 insertions(+), 99 deletions(-)
diff --git a/include/linux/netfilter/x_tables.h b/include/linux/netfilter/x_tables.h
index 742bec0..6712e71 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -472,7 +472,7 @@ extern void xt_free_table_info(struct xt_table_info *info);
* necessary for reading the counters.
*/
struct xt_info_lock {
- spinlock_t lock;
+ seqlock_t lock;
unsigned char readers;
};
DECLARE_PER_CPU(struct xt_info_lock, xt_info_locks);
@@ -497,7 +497,7 @@ static inline void xt_info_rdlock_bh(void)
local_bh_disable();
lock = &__get_cpu_var(xt_info_locks);
if (likely(!lock->readers++))
- spin_lock(&lock->lock);
+ write_seqlock(&lock->lock);
}
static inline void xt_info_rdunlock_bh(void)
@@ -505,7 +505,7 @@ static inline void xt_info_rdunlock_bh(void)
struct xt_info_lock *lock = &__get_cpu_var(xt_info_locks);
if (likely(!--lock->readers))
- spin_unlock(&lock->lock);
+ write_sequnlock(&lock->lock);
local_bh_enable();
}
@@ -516,12 +516,12 @@ static inline void xt_info_rdunlock_bh(void)
*/
static inline void xt_info_wrlock(unsigned int cpu)
{
- spin_lock(&per_cpu(xt_info_locks, cpu).lock);
+ write_seqlock(&per_cpu(xt_info_locks, cpu).lock);
}
static inline void xt_info_wrunlock(unsigned int cpu)
{
- spin_unlock(&per_cpu(xt_info_locks, cpu).lock);
+ write_sequnlock(&per_cpu(xt_info_locks, cpu).lock);
}
/*
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 3fac340..e855fff 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -710,42 +710,25 @@ static void get_counters(const struct xt_table_info *t,
struct arpt_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu = get_cpu();
-
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU
- *
- * Bottom half has to be disabled to prevent deadlock
- * if new softirq were to run and call ipt_do_table
- */
- local_bh_disable();
- i = 0;
- xt_entry_foreach(iter, t->entries[curcpu], t->size) {
- SET_COUNTER(counters[i], iter->counters.bcnt,
- iter->counters.pcnt);
- ++i;
- }
- local_bh_enable();
- /* Processing counters from other cpus, we can let bottom half enabled,
- * (preemption is disabled)
- */
for_each_possible_cpu(cpu) {
- if (cpu == curcpu)
- continue;
+ seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
i = 0;
- local_bh_disable();
- xt_info_wrlock(cpu);
xt_entry_foreach(iter, t->entries[cpu], t->size) {
- ADD_COUNTER(counters[i], iter->counters.bcnt,
- iter->counters.pcnt);
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ do {
+ start = read_seqbegin(lock);
+ bcnt = iter->counters.bcnt;
+ pcnt = iter->counters.pcnt;
+ } while (read_seqretry(lock, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
++i;
}
- xt_info_wrunlock(cpu);
- local_bh_enable();
}
- put_cpu();
}
static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -759,7 +742,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
* about).
*/
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc(countersize);
+ counters = vzalloc(countersize);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
@@ -1007,7 +990,7 @@ static int __do_replace(struct net *net, const char *name,
struct arpt_entry *iter;
ret = 0;
- counters = vmalloc(num_counters * sizeof(struct xt_counters));
+ counters = vzalloc(num_counters * sizeof(struct xt_counters));
if (!counters) {
ret = -ENOMEM;
goto out;
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index a846d63..652efea 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -884,42 +884,25 @@ get_counters(const struct xt_table_info *t,
struct ipt_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu = get_cpu();
-
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU.
- *
- * Bottom half has to be disabled to prevent deadlock
- * if new softirq were to run and call ipt_do_table
- */
- local_bh_disable();
- i = 0;
- xt_entry_foreach(iter, t->entries[curcpu], t->size) {
- SET_COUNTER(counters[i], iter->counters.bcnt,
- iter->counters.pcnt);
- ++i;
- }
- local_bh_enable();
- /* Processing counters from other cpus, we can let bottom half enabled,
- * (preemption is disabled)
- */
for_each_possible_cpu(cpu) {
- if (cpu == curcpu)
- continue;
+ seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
i = 0;
- local_bh_disable();
- xt_info_wrlock(cpu);
xt_entry_foreach(iter, t->entries[cpu], t->size) {
- ADD_COUNTER(counters[i], iter->counters.bcnt,
- iter->counters.pcnt);
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ do {
+ start = read_seqbegin(lock);
+ bcnt = iter->counters.bcnt;
+ pcnt = iter->counters.pcnt;
+ } while (read_seqretry(lock, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
++i; /* macro does multi eval of i */
}
- xt_info_wrunlock(cpu);
- local_bh_enable();
}
- put_cpu();
}
static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -932,7 +915,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc(countersize);
+ counters = vzalloc(countersize);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
@@ -1203,7 +1186,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
struct ipt_entry *iter;
ret = 0;
- counters = vmalloc(num_counters * sizeof(struct xt_counters));
+ counters = vzalloc(num_counters * sizeof(struct xt_counters));
if (!counters) {
ret = -ENOMEM;
goto out;
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 4555823..7d227c6 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -897,42 +897,25 @@ get_counters(const struct xt_table_info *t,
struct ip6t_entry *iter;
unsigned int cpu;
unsigned int i;
- unsigned int curcpu = get_cpu();
-
- /* Instead of clearing (by a previous call to memset())
- * the counters and using adds, we set the counters
- * with data used by 'current' CPU
- *
- * Bottom half has to be disabled to prevent deadlock
- * if new softirq were to run and call ipt_do_table
- */
- local_bh_disable();
- i = 0;
- xt_entry_foreach(iter, t->entries[curcpu], t->size) {
- SET_COUNTER(counters[i], iter->counters.bcnt,
- iter->counters.pcnt);
- ++i;
- }
- local_bh_enable();
- /* Processing counters from other cpus, we can let bottom half enabled,
- * (preemption is disabled)
- */
for_each_possible_cpu(cpu) {
- if (cpu == curcpu)
- continue;
+ seqlock_t *lock = &per_cpu(xt_info_locks, cpu).lock;
+
i = 0;
- local_bh_disable();
- xt_info_wrlock(cpu);
xt_entry_foreach(iter, t->entries[cpu], t->size) {
- ADD_COUNTER(counters[i], iter->counters.bcnt,
- iter->counters.pcnt);
+ u64 bcnt, pcnt;
+ unsigned int start;
+
+ do {
+ start = read_seqbegin(lock);
+ bcnt = iter->counters.bcnt;
+ pcnt = iter->counters.pcnt;
+ } while (read_seqretry(lock, start));
+
+ ADD_COUNTER(counters[i], bcnt, pcnt);
++i;
}
- xt_info_wrunlock(cpu);
- local_bh_enable();
}
- put_cpu();
}
static struct xt_counters *alloc_counters(const struct xt_table *table)
@@ -945,7 +928,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
(other than comefrom, which userspace doesn't care
about). */
countersize = sizeof(struct xt_counters) * private->number;
- counters = vmalloc(countersize);
+ counters = vzalloc(countersize);
if (counters == NULL)
return ERR_PTR(-ENOMEM);
@@ -1216,7 +1199,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
struct ip6t_entry *iter;
ret = 0;
- counters = vmalloc(num_counters * sizeof(struct xt_counters));
+ counters = vzalloc(num_counters * sizeof(struct xt_counters));
if (!counters) {
ret = -ENOMEM;
goto out;
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index 8046350..c942376 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -1325,7 +1325,8 @@ static int __init xt_init(void)
for_each_possible_cpu(i) {
struct xt_info_lock *lock = &per_cpu(xt_info_locks, i);
- spin_lock_init(&lock->lock);
+
+ seqlock_init(&lock->lock);
lock->readers = 0;
}
--
1.7.2.3
* [PATCH 2/3] netfilter: fix race in conntrack between dump_table and destroy
From: pablo @ 2011-01-11 23:26 UTC
To: davem; +Cc: netfilter-devel, Stephen Hemminger, Pablo Neira Ayuso
From: Stephen Hemminger <shemminger@vyatta.com>
The netlink interface to dump the connection tracking table has a race
when entries are deleted at the same time. A customer reported a crash
and the backtrace showed that ctnetlink_dump_table was running while a
conntrack entry was being destroyed.
(see https://bugzilla.vyatta.com/show_bug.cgi?id=6402).
According to RCU documentation, when using hlist_nulls the reader
must handle the case of seeing a deleted entry and not proceed
further down the linked list. The old code would continue anyway,
which caused the scan to walk into the free list.
This patch uses locking (rather than RCU) for this operation, which
is guaranteed safe, and it no longer requires taking a reference
while performing the dump operation.
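The nulls-list rule is easier to see with the trick spelled out: each
chain ends in an encoded "nulls" value identifying its bucket instead
of a plain NULL, so a reader can detect that a concurrently recycled
entry carried it into a foreign chain. A toy user-space model follows
(the names and encoding here are invented; the kernel uses
hlist_nulls_for_each_entry_rcu() together with is_a_nulls() and
get_nulls_value()):

#include <stdint.h>
#include <stdio.h>

#define NULLS(bucket)   ((struct node *)((((uintptr_t)(bucket)) << 1) | 1))
#define IS_NULLS(p)     (((uintptr_t)(p)) & 1)
#define NULLS_VALUE(p)  (((uintptr_t)(p)) >> 1)

struct node {
        struct node *next;
        int key;
};

/* Returns 1 if the walk ended on this bucket's own nulls marker,
 * 0 if it drifted into a foreign chain and must be restarted. */
static int walk_bucket(struct node *head, unsigned int bucket)
{
        struct node *p;

        for (p = head; !IS_NULLS(p); p = p->next)
                printf("bucket %u: key %d\n", bucket, p->key);

        return NULLS_VALUE(p) == bucket;
}

int main(void)
{
        struct node b = { .next = NULLS(7), .key = 2 };
        struct node a = { .next = &b, .key = 1 };

        if (!walk_bucket(&a, 7))
                printf("restart: raced with delete + reuse\n");
        return 0;
}

The old dump code skipped exactly this end-of-walk handling; taking
nf_conntrack_lock sidesteps the problem entirely.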
Signed-off-by: Stephen Hemminger <shemminger@vyatta.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nf_conntrack_netlink.c | 14 +++++---------
1 files changed, 5 insertions(+), 9 deletions(-)
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 7461402..5cb8d30 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -645,25 +645,23 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
struct nfgenmsg *nfmsg = nlmsg_data(cb->nlh);
u_int8_t l3proto = nfmsg->nfgen_family;
- rcu_read_lock();
+ spin_lock_bh(&nf_conntrack_lock);
last = (struct nf_conn *)cb->args[1];
for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
restart:
- hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[cb->args[0]],
+ hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
- if (!atomic_inc_not_zero(&ct->ct_general.use))
- continue;
/* Dump entries of a given L3 protocol number.
* If it is not specified, ie. l3proto == 0,
* then dump everything. */
if (l3proto && nf_ct_l3num(ct) != l3proto)
- goto releasect;
+ continue;
if (cb->args[1]) {
if (ct != last)
- goto releasect;
+ continue;
cb->args[1] = 0;
}
if (ctnetlink_fill_info(skb, NETLINK_CB(cb->skb).pid,
@@ -681,8 +679,6 @@ restart:
if (acct)
memset(acct, 0, sizeof(struct nf_conn_counter[IP_CT_DIR_MAX]));
}
-releasect:
- nf_ct_put(ct);
}
if (cb->args[1]) {
cb->args[1] = 0;
@@ -690,7 +686,7 @@ releasect:
}
}
out:
- rcu_read_unlock();
+ spin_unlock_bh(&nf_conntrack_lock);
if (last)
nf_ct_put(last);
--
1.7.2.3
* [PATCH 3/3] netfilter: ebtables: make broute table work again
From: pablo @ 2011-01-11 23:26 UTC
To: davem; +Cc: netfilter-devel, Florian Westphal, Pablo Neira Ayuso
From: Florian Westphal <fw@strlen.de>
The broute table init hook sets up the "br_should_route_hook" pointer,
which then gets called from br_input.
Commit a386f99025f13b32502fe5dedf223c20d7283826
("bridge: add proper RCU annotation to should_route_hook")
introduced a typedef, and then changed this to:
br_should_route_hook_t *rhook;
[..]
rhook = rcu_dereference(br_should_route_hook);
if (*rhook(skb))
The problem is that "br_should_route_hook" contains the address of the
function itself, so calling *rhook() results in a kernel panic.
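The distinction is easier to see in isolation. A small sketch with
hypothetical names, showing why the function typedef makes
"br_should_route_hook_t *rhook" an ordinary function pointer:

#include <stdio.h>

typedef int hook_fn(int);         /* function type, as in this fix */
typedef int (*hook_fn_ptr)(int);  /* pointer type, the old typedef */

static int double_it(int x)
{
        return x * 2;
}

int main(void)
{
        hook_fn *a = double_it;      /* pointer to function */
        hook_fn_ptr b = double_it;   /* the same pointer, spelled differently */

        /* Both calls reach double_it. The panic came from declaring the
         * RCU variable with the pointer typedef (making it a pointer to
         * a function pointer) while a plain function address was stored
         * in it, so the extra dereference on the call read the
         * function's code bytes as if they were a pointer. */
        printf("%d %d\n", a(21), (*b)(21));
        return 0;
}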
Signed-off-by: Florian Westphal <fw@strlen.de>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
include/linux/if_bridge.h | 2 +-
1 files changed, 1 insertions(+), 1 deletions(-)
diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h
index f7e73c3..dd3f201 100644
--- a/include/linux/if_bridge.h
+++ b/include/linux/if_bridge.h
@@ -103,7 +103,7 @@ struct __fdb_entry {
extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *));
-typedef int (*br_should_route_hook_t)(struct sk_buff *skb);
+typedef int br_should_route_hook_t(struct sk_buff *skb);
extern br_should_route_hook_t __rcu *br_should_route_hook;
#endif
--
1.7.2.3
* Re: [PATCH 0/3] netfilter fixes for 2.6.37
From: David Miller @ 2011-01-11 23:43 UTC
To: pablo; +Cc: netfilter-devel
From: pablo@netfilter.org
Date: Wed, 12 Jan 2011 00:26:15 +0100
> From: Pablo Neira Ayuso <pablo@netfilter.org>
>
> Hi David,
>
> The following patchset contains Netfilter bugfixes for 2.6.37.
>
> You can pull them from:
>
> git://1984.lsi.us.es/net-2.6 master
>
> Please, apply!
Pulled, thanks a lot!