* [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
@ 2017-11-12 20:23 Florian Westphal
2017-11-13 8:36 ` Greg KH
2017-11-14 7:07 ` Sebastian Gottschall
0 siblings, 2 replies; 14+ messages in thread
From: Florian Westphal @ 2017-11-12 20:23 UTC (permalink / raw)
To: stable; +Cc: g.nault, pablo, Florian Westphal
commit e1bf1687740ce1a3598a1c5e452b852ff2190682 upstream.
This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.
It was not a good idea. The custom hash table was a much better
fit for this purpose.
A fast lookup is not essential, in fact for most cases there is no lookup
at all because original tuple is not taken and can be used as-is.
What needs to be fast is insertion and deletion.
rhlist removal however requires a rhlist walk.
We can have thousands of entries in such a list if source port/addresses
are reused for multiple flows, if this happens removal requests are so
expensive that deletions of a few thousand flows can take several
seconds(!).
The advantages that we got from rhashtable are:
1) table auto-sizing
2) multiple locks
1) would be nice to have, but it is not essential as we have at
most one lookup per new flow, so even a million flows in the bysource
table are not a problem compared to current deletion cost.
2) is easy to add to custom hash table.
I tried to add hlist_node to rhlist to speed up rhltable_remove but this
isn't doable without changing semantics. rhltable_remove_fast will
check that the to-be-deleted object is part of the table and that
requires a list walk that we want to avoid.
Furthermore, using hlist_node increases size of struct rhlist_head, which
in turn increases nf_conn size.
Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821
Reported-by: Ivan Babrou <ibobrik@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
Hi Greg,
this is a 4.9.y backport of the same revert that already went into 4.13.y;
please consider applying this. I briefly tested this in kvm, at very least
it doesn't break nat redirect in obvious ways...
include/net/netfilter/nf_conntrack.h | 3 +-
include/net/netfilter/nf_nat.h | 1 -
net/netfilter/nf_nat_core.c | 133 +++++++++++++++--------------------
3 files changed, 57 insertions(+), 80 deletions(-)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index d9d52c020a70..9ae819e27940 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -17,7 +17,6 @@
#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/atomic.h>
-#include <linux/rhashtable.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
#include <linux/netfilter/nf_conntrack_dccp.h>
@@ -101,7 +100,7 @@ struct nf_conn {
possible_net_t ct_net;
#if IS_ENABLED(CONFIG_NF_NAT)
- struct rhlist_head nat_bysource;
+ struct hlist_node nat_bysource;
#endif
/* all members below initialized via memset */
u8 __nfct_init_offset[0];
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index c327a431a6f3..02515f7ed4cc 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -1,6 +1,5 @@
#ifndef _NF_NAT_H
#define _NF_NAT_H
-#include <linux/rhashtable.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter/nf_nat.h>
#include <net/netfilter/nf_conntrack_tuple.h>
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 2916f4815c9c..87d67e714dc4 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -30,19 +30,17 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>
+static DEFINE_SPINLOCK(nf_nat_lock);
+
static DEFINE_MUTEX(nf_nat_proto_mutex);
static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
__read_mostly;
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
__read_mostly;
-struct nf_nat_conn_key {
- const struct net *net;
- const struct nf_conntrack_tuple *tuple;
- const struct nf_conntrack_zone *zone;
-};
-
-static struct rhltable nf_nat_bysource_table;
+static struct hlist_head *nf_nat_bysource __read_mostly;
+static unsigned int nf_nat_htable_size __read_mostly;
+static unsigned int nf_nat_hash_rnd __read_mostly;
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
@@ -121,17 +119,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */
-static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed)
+/* We keep an extra hash for each conntrack, for fast searching. */
+static inline unsigned int
+hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
- const struct nf_conntrack_tuple *t;
- const struct nf_conn *ct = data;
+ unsigned int hash;
+
+ get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
- t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
/* Original src, to ensure we map it consistently if poss. */
+ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
+ tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
- seed ^= net_hash_mix(nf_ct_net(ct));
- return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
- t->dst.protonum ^ seed);
+ return reciprocal_scale(hash, nf_nat_htable_size);
}
/* Is this tuple already taken? (not by us) */
@@ -187,28 +187,6 @@ same_src(const struct nf_conn *ct,
t->src.u.all == tuple->src.u.all);
}
-static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct nf_nat_conn_key *key = arg->key;
- const struct nf_conn *ct = obj;
-
- if (!same_src(ct, key->tuple) ||
- !net_eq(nf_ct_net(ct), key->net) ||
- !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
- return 1;
-
- return 0;
-}
-
-static struct rhashtable_params nf_nat_bysource_params = {
- .head_offset = offsetof(struct nf_conn, nat_bysource),
- .obj_hashfn = nf_nat_bysource_hash,
- .obj_cmpfn = nf_nat_bysource_cmp,
- .nelem_hint = 256,
- .min_size = 1024,
-};
-
/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
@@ -219,26 +197,22 @@ find_appropriate_src(struct net *net,
struct nf_conntrack_tuple *result,
const struct nf_nat_range *range)
{
+ unsigned int h = hash_by_src(net, tuple);
const struct nf_conn *ct;
- struct nf_nat_conn_key key = {
- .net = net,
- .tuple = tuple,
- .zone = zone
- };
- struct rhlist_head *hl, *h;
-
- hl = rhltable_lookup(&nf_nat_bysource_table, &key,
- nf_nat_bysource_params);
- rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) {
- nf_ct_invert_tuplepr(result,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- result->dst = tuple->dst;
-
- if (in_range(l3proto, l4proto, result, range))
- return 1;
+ hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
+ if (same_src(ct, tuple) &&
+ net_eq(net, nf_ct_net(ct)) &&
+ nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
+ /* Copy source part from reply tuple. */
+ nf_ct_invert_tuplepr(result,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ result->dst = tuple->dst;
+
+ if (in_range(l3proto, l4proto, result, range))
+ return 1;
+ }
}
-
return 0;
}
@@ -411,6 +385,7 @@ nf_nat_setup_info(struct nf_conn *ct,
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype)
{
+ struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple;
struct nf_conn_nat *nat;
@@ -452,19 +427,16 @@ nf_nat_setup_info(struct nf_conn *ct,
}
if (maniptype == NF_NAT_MANIP_SRC) {
- struct nf_nat_conn_key key = {
- .net = nf_ct_net(ct),
- .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- .zone = nf_ct_zone(ct),
- };
- int err;
-
- err = rhltable_insert_key(&nf_nat_bysource_table,
- &key,
- &ct->nat_bysource,
- nf_nat_bysource_params);
- if (err)
- return NF_DROP;
+ unsigned int srchash;
+
+ srchash = hash_by_src(net,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ spin_lock_bh(&nf_nat_lock);
+ /* nf_conntrack_alter_reply might re-allocate extension aera */
+ nat = nfct_nat(ct);
+ hlist_add_head_rcu(&ct->nat_bysource,
+ &nf_nat_bysource[srchash]);
+ spin_unlock_bh(&nf_nat_lock);
}
/* It's done. */
@@ -578,9 +550,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
* Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
* will delete entry from already-freed table.
*/
+ spin_lock_bh(&nf_nat_lock);
+ hlist_del_rcu(&ct->nat_bysource);
ct->status &= ~IPS_NAT_DONE_MASK;
- rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
- nf_nat_bysource_params);
+ spin_unlock_bh(&nf_nat_lock);
/* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -710,8 +683,11 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
if (!nat)
return;
- rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
- nf_nat_bysource_params);
+ NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
+
+ spin_lock_bh(&nf_nat_lock);
+ hlist_del_rcu(&ct->nat_bysource);
+ spin_unlock_bh(&nf_nat_lock);
}
static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -846,13 +822,16 @@ static int __init nf_nat_init(void)
{
int ret;
- ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
- if (ret)
- return ret;
+ /* Leave them the same for the moment. */
+ nf_nat_htable_size = nf_conntrack_htable_size;
+
+ nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
+ if (!nf_nat_bysource)
+ return -ENOMEM;
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
- rhltable_destroy(&nf_nat_bysource_table);
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
return ret;
}
@@ -876,7 +855,7 @@ static int __init nf_nat_init(void)
return 0;
cleanup_extend:
- rhltable_destroy(&nf_nat_bysource_table);
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
nf_ct_extend_unregister(&nat_extend);
return ret;
}
@@ -896,8 +875,8 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
-
- rhltable_destroy(&nf_nat_bysource_table);
+ synchronize_net();
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
}
MODULE_LICENSE("GPL");
--
2.13.6
^ permalink raw reply related [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-12 20:23 [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable" Florian Westphal
@ 2017-11-13 8:36 ` Greg KH
2017-11-14 7:07 ` Sebastian Gottschall
1 sibling, 0 replies; 14+ messages in thread
From: Greg KH @ 2017-11-13 8:36 UTC (permalink / raw)
To: Florian Westphal; +Cc: stable, g.nault, pablo
On Sun, Nov 12, 2017 at 09:23:47PM +0100, Florian Westphal wrote:
> commit e1bf1687740ce1a3598a1c5e452b852ff2190682 upstream.
>
> This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.
>
> It was not a good idea. The custom hash table was a much better
> fit for this purpose.
>
> A fast lookup is not essential, in fact for most cases there is no lookup
> at all because original tuple is not taken and can be used as-is.
> What needs to be fast is insertion and deletion.
>
> rhlist removal however requires a rhlist walk.
> We can have thousands of entries in such a list if source port/addresses
> are reused for multiple flows, if this happens removal requests are so
> expensive that deletions of a few thousand flows can take several
> seconds(!).
>
> The advantages that we got from rhashtable are:
> 1) table auto-sizing
> 2) multiple locks
>
> 1) would be nice to have, but it is not essential as we have at
> most one lookup per new flow, so even a million flows in the bysource
> table are not a problem compared to current deletion cost.
> 2) is easy to add to custom hash table.
>
> I tried to add hlist_node to rhlist to speed up rhltable_remove but this
> isn't doable without changing semantics. rhltable_remove_fast will
> check that the to-be-deleted object is part of the table and that
> requires a list walk that we want to avoid.
>
> Furthermore, using hlist_node increases size of struct rhlist_head, which
> in turn increases nf_conn size.
>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821
> Reported-by: Ivan Babrou <ibobrik@gmail.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
> Hi Greg,
>
> this is a 4.9.y backport of the same revert that already went into 4.13.y;
> please consider applying this. I briefly tested this in kvm, at very least
> it doesn't break nat redirect in obvious ways...
Very nice, thanks for this, now queued up.
greg k-h
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-12 20:23 [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable" Florian Westphal
2017-11-13 8:36 ` Greg KH
@ 2017-11-14 7:07 ` Sebastian Gottschall
2017-11-14 11:05 ` Florian Westphal
1 sibling, 1 reply; 14+ messages in thread
From: Sebastian Gottschall @ 2017-11-14 7:07 UTC (permalink / raw)
To: Florian Westphal, stable; +Cc: g.nault, pablo
this should belong to you. with the revert the kernel crashes for me
[ 24.838120] Unable to handle kernel NULL pointer dereference at
virtual address 00000055
[ 24.846193] pgd = c0004000
[ 24.848893] [00000055] *pgd=00000000
[ 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
[ 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd
ohci_hcd ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base
qca_ssdk gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k
ath9k_common ath9k_hw ath mac80211 cfg80211 compat
[ 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90
[ 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
[ 24.892921] task: ef029ac0 task.stack: ef05a000
[ 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
[ 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
[ 24.907869] pc : [<c04858c8>] lr : [<c04858b4>] psr: 60000153
[ 24.907869] sp : ef05bb58 ip : ef05bb58 fp : ef05bb6c
[ 24.919317] r10: ed230cc0 r9 : ed230c00 r8 : edf45800
[ 24.924529] r7 : ebcd2f00 r6 : ec33739e r5 : c0892294 r4 : ebcd2f00
[ 24.931040] r3 : 00000000 r2 : 00000055 r1 : 00000000 r0 : c0892718
[ 24.937551] Flags: nZCv IRQs on FIQs off Mode SVC_32 ISA ARM
Segment user
[ 24.944755] Control: 10c5387d Table: 2bd1006a DAC: 00000055
[ 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
[ 24.956477] Stack: (0xef05bb58 to 0xef05c000)
Am 12.11.2017 um 21:23 schrieb Florian Westphal:
> commit e1bf1687740ce1a3598a1c5e452b852ff2190682 upstream.
>
> This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.
>
> It was not a good idea. The custom hash table was a much better
> fit for this purpose.
>
> A fast lookup is not essential, in fact for most cases there is no lookup
> at all because original tuple is not taken and can be used as-is.
> What needs to be fast is insertion and deletion.
>
> rhlist removal however requires a rhlist walk.
> We can have thousands of entries in such a list if source port/addresses
> are reused for multiple flows, if this happens removal requests are so
> expensive that deletions of a few thousand flows can take several
> seconds(!).
>
> The advantages that we got from rhashtable are:
> 1) table auto-sizing
> 2) multiple locks
>
> 1) would be nice to have, but it is not essential as we have at
> most one lookup per new flow, so even a million flows in the bysource
> table are not a problem compared to current deletion cost.
> 2) is easy to add to custom hash table.
>
> I tried to add hlist_node to rhlist to speed up rhltable_remove but this
> isn't doable without changing semantics. rhltable_remove_fast will
> check that the to-be-deleted object is part of the table and that
> requires a list walk that we want to avoid.
>
> Furthermore, using hlist_node increases size of struct rhlist_head, which
> in turn increases nf_conn size.
>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821
> Reported-by: Ivan Babrou <ibobrik@gmail.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
> Hi Greg,
>
> this is a 4.9.y backport of the same revert that already went into 4.13.y;
> please consider applying this. I briefly tested this in kvm, at very least
> it doesn't break nat redirect in obvious ways...
>
> include/net/netfilter/nf_conntrack.h | 3 +-
> include/net/netfilter/nf_nat.h | 1 -
> net/netfilter/nf_nat_core.c | 133 +++++++++++++++--------------------
> 3 files changed, 57 insertions(+), 80 deletions(-)
>
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index d9d52c020a70..9ae819e27940 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -17,7 +17,6 @@
> #include <linux/bitops.h>
> #include <linux/compiler.h>
> #include <linux/atomic.h>
> -#include <linux/rhashtable.h>
>
> #include <linux/netfilter/nf_conntrack_tcp.h>
> #include <linux/netfilter/nf_conntrack_dccp.h>
> @@ -101,7 +100,7 @@ struct nf_conn {
> possible_net_t ct_net;
>
> #if IS_ENABLED(CONFIG_NF_NAT)
> - struct rhlist_head nat_bysource;
> + struct hlist_node nat_bysource;
> #endif
> /* all members below initialized via memset */
> u8 __nfct_init_offset[0];
> diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
> index c327a431a6f3..02515f7ed4cc 100644
> --- a/include/net/netfilter/nf_nat.h
> +++ b/include/net/netfilter/nf_nat.h
> @@ -1,6 +1,5 @@
> #ifndef _NF_NAT_H
> #define _NF_NAT_H
> -#include <linux/rhashtable.h>
> #include <linux/netfilter_ipv4.h>
> #include <linux/netfilter/nf_nat.h>
> #include <net/netfilter/nf_conntrack_tuple.h>
> diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
> index 2916f4815c9c..87d67e714dc4 100644
> --- a/net/netfilter/nf_nat_core.c
> +++ b/net/netfilter/nf_nat_core.c
> @@ -30,19 +30,17 @@
> #include <net/netfilter/nf_conntrack_zones.h>
> #include <linux/netfilter/nf_nat.h>
>
> +static DEFINE_SPINLOCK(nf_nat_lock);
> +
> static DEFINE_MUTEX(nf_nat_proto_mutex);
> static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
> __read_mostly;
> static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
> __read_mostly;
>
> -struct nf_nat_conn_key {
> - const struct net *net;
> - const struct nf_conntrack_tuple *tuple;
> - const struct nf_conntrack_zone *zone;
> -};
> -
> -static struct rhltable nf_nat_bysource_table;
> +static struct hlist_head *nf_nat_bysource __read_mostly;
> +static unsigned int nf_nat_htable_size __read_mostly;
> +static unsigned int nf_nat_hash_rnd __read_mostly;
>
> inline const struct nf_nat_l3proto *
> __nf_nat_l3proto_find(u8 family)
> @@ -121,17 +119,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
> EXPORT_SYMBOL(nf_xfrm_me_harder);
> #endif /* CONFIG_XFRM */
>
> -static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed)
> +/* We keep an extra hash for each conntrack, for fast searching. */
> +static inline unsigned int
> +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
> {
> - const struct nf_conntrack_tuple *t;
> - const struct nf_conn *ct = data;
> + unsigned int hash;
> +
> + get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
>
> - t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
> /* Original src, to ensure we map it consistently if poss. */
> + hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
> + tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
>
> - seed ^= net_hash_mix(nf_ct_net(ct));
> - return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
> - t->dst.protonum ^ seed);
> + return reciprocal_scale(hash, nf_nat_htable_size);
> }
>
> /* Is this tuple already taken? (not by us) */
> @@ -187,28 +187,6 @@ same_src(const struct nf_conn *ct,
> t->src.u.all == tuple->src.u.all);
> }
>
> -static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
> - const void *obj)
> -{
> - const struct nf_nat_conn_key *key = arg->key;
> - const struct nf_conn *ct = obj;
> -
> - if (!same_src(ct, key->tuple) ||
> - !net_eq(nf_ct_net(ct), key->net) ||
> - !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
> - return 1;
> -
> - return 0;
> -}
> -
> -static struct rhashtable_params nf_nat_bysource_params = {
> - .head_offset = offsetof(struct nf_conn, nat_bysource),
> - .obj_hashfn = nf_nat_bysource_hash,
> - .obj_cmpfn = nf_nat_bysource_cmp,
> - .nelem_hint = 256,
> - .min_size = 1024,
> -};
> -
> /* Only called for SRC manip */
> static int
> find_appropriate_src(struct net *net,
> @@ -219,26 +197,22 @@ find_appropriate_src(struct net *net,
> struct nf_conntrack_tuple *result,
> const struct nf_nat_range *range)
> {
> + unsigned int h = hash_by_src(net, tuple);
> const struct nf_conn *ct;
> - struct nf_nat_conn_key key = {
> - .net = net,
> - .tuple = tuple,
> - .zone = zone
> - };
> - struct rhlist_head *hl, *h;
> -
> - hl = rhltable_lookup(&nf_nat_bysource_table, &key,
> - nf_nat_bysource_params);
>
> - rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) {
> - nf_ct_invert_tuplepr(result,
> - &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
> - result->dst = tuple->dst;
> -
> - if (in_range(l3proto, l4proto, result, range))
> - return 1;
> + hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
> + if (same_src(ct, tuple) &&
> + net_eq(net, nf_ct_net(ct)) &&
> + nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
> + /* Copy source part from reply tuple. */
> + nf_ct_invert_tuplepr(result,
> + &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
> + result->dst = tuple->dst;
> +
> + if (in_range(l3proto, l4proto, result, range))
> + return 1;
> + }
> }
> -
> return 0;
> }
>
> @@ -411,6 +385,7 @@ nf_nat_setup_info(struct nf_conn *ct,
> const struct nf_nat_range *range,
> enum nf_nat_manip_type maniptype)
> {
> + struct net *net = nf_ct_net(ct);
> struct nf_conntrack_tuple curr_tuple, new_tuple;
> struct nf_conn_nat *nat;
>
> @@ -452,19 +427,16 @@ nf_nat_setup_info(struct nf_conn *ct,
> }
>
> if (maniptype == NF_NAT_MANIP_SRC) {
> - struct nf_nat_conn_key key = {
> - .net = nf_ct_net(ct),
> - .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
> - .zone = nf_ct_zone(ct),
> - };
> - int err;
> -
> - err = rhltable_insert_key(&nf_nat_bysource_table,
> - &key,
> - &ct->nat_bysource,
> - nf_nat_bysource_params);
> - if (err)
> - return NF_DROP;
> + unsigned int srchash;
> +
> + srchash = hash_by_src(net,
> + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
> + spin_lock_bh(&nf_nat_lock);
> + /* nf_conntrack_alter_reply might re-allocate extension aera */
> + nat = nfct_nat(ct);
> + hlist_add_head_rcu(&ct->nat_bysource,
> + &nf_nat_bysource[srchash]);
> + spin_unlock_bh(&nf_nat_lock);
> }
>
> /* It's done. */
> @@ -578,9 +550,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
> * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
> * will delete entry from already-freed table.
> */
> + spin_lock_bh(&nf_nat_lock);
> + hlist_del_rcu(&ct->nat_bysource);
> ct->status &= ~IPS_NAT_DONE_MASK;
> - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
> - nf_nat_bysource_params);
> + spin_unlock_bh(&nf_nat_lock);
>
> /* don't delete conntrack. Although that would make things a lot
> * simpler, we'd end up flushing all conntracks on nat rmmod.
> @@ -710,8 +683,11 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
> if (!nat)
> return;
>
> - rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
> - nf_nat_bysource_params);
> + NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
> +
> + spin_lock_bh(&nf_nat_lock);
> + hlist_del_rcu(&ct->nat_bysource);
> + spin_unlock_bh(&nf_nat_lock);
> }
>
> static struct nf_ct_ext_type nat_extend __read_mostly = {
> @@ -846,13 +822,16 @@ static int __init nf_nat_init(void)
> {
> int ret;
>
> - ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
> - if (ret)
> - return ret;
> + /* Leave them the same for the moment. */
> + nf_nat_htable_size = nf_conntrack_htable_size;
> +
> + nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
> + if (!nf_nat_bysource)
> + return -ENOMEM;
>
> ret = nf_ct_extend_register(&nat_extend);
> if (ret < 0) {
> - rhltable_destroy(&nf_nat_bysource_table);
> + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
> printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
> return ret;
> }
> @@ -876,7 +855,7 @@ static int __init nf_nat_init(void)
> return 0;
>
> cleanup_extend:
> - rhltable_destroy(&nf_nat_bysource_table);
> + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
> nf_ct_extend_unregister(&nat_extend);
> return ret;
> }
> @@ -896,8 +875,8 @@ static void __exit nf_nat_cleanup(void)
>
> for (i = 0; i < NFPROTO_NUMPROTO; i++)
> kfree(nf_nat_l4protos[i]);
> -
> - rhltable_destroy(&nf_nat_bysource_table);
> + synchronize_net();
> + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
> }
>
> MODULE_LICENSE("GPL");
--
Mit freundlichen Grüssen / Regards
Sebastian Gottschall / CTO
NewMedia-NET GmbH - DD-WRT
Firmensitz: Stubenwaldallee 21a, 64625 Bensheim
Registergericht: Amtsgericht Darmstadt, HRB 25473
Geschäftsführer: Peter Steinhäuser, Christian Scheele
http://www.dd-wrt.com
email: s.gottschall@dd-wrt.com
Tel.: +496251-582650 / Fax: +496251-5826565
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-14 7:07 ` Sebastian Gottschall
@ 2017-11-14 11:05 ` Florian Westphal
2017-11-14 11:54 ` Guillaume Nault
2017-11-14 19:30 ` Guillaume Nault
0 siblings, 2 replies; 14+ messages in thread
From: Florian Westphal @ 2017-11-14 11:05 UTC (permalink / raw)
To: Sebastian Gottschall; +Cc: Florian Westphal, stable, g.nault, pablo
Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
> this should belong to you. with the revert the kernel crashes for me
>
> [ᅵᅵ 24.838120] Unable to handle kernel NULL pointer dereference at virtual
> address 00000055
> [ᅵᅵ 24.846193] pgd = c0004000
> [ᅵᅵ 24.848893] [00000055] *pgd=00000000
> [ᅵᅵ 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
> [ᅵᅵ 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd
> ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk
> gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common
> ath9k_hw ath mac80211 cfg80211 compat
> [ᅵᅵ 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90
> [ᅵᅵ 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
> [ᅵᅵ 24.892921] task: ef029ac0 task.stack: ef05a000
> [ᅵᅵ 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
> [ᅵᅵ 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
> [ᅵᅵ 24.907869] pc : [<c04858c8>]ᅵᅵᅵ lr : [<c04858b4>]ᅵᅵᅵ psr: 60000153
> [ᅵᅵ 24.907869] sp : ef05bb58ᅵ ip : ef05bb58ᅵ fp : ef05bb6c
> [ᅵᅵ 24.919317] r10: ed230cc0ᅵ r9 : ed230c00ᅵ r8 : edf45800
> [ᅵᅵ 24.924529] r7 : ebcd2f00ᅵ r6 : ec33739eᅵ r5 : c0892294ᅵ r4 : ebcd2f00
> [ᅵᅵ 24.931040] r3 : 00000000ᅵ r2 : 00000055ᅵ r1 : 00000000ᅵ r0 : c0892718
> [ᅵᅵ 24.937551] Flags: nZCvᅵ IRQs onᅵ FIQs offᅵ Mode SVC_32ᅵ ISA ARMᅵ Segment
> user
> [ᅵᅵ 24.944755] Control: 10c5387dᅵ Table: 2bd1006aᅵ DAC: 00000055
> [ᅵᅵ 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
> [ᅵᅵ 24.956477] Stack: (0xef05bb58 to 0xef05c000)
Guillaume, can you check what I missed?
IIRC you said you had a revert for your 4.9 tree, correct?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-14 11:05 ` Florian Westphal
@ 2017-11-14 11:54 ` Guillaume Nault
2017-11-14 19:30 ` Guillaume Nault
1 sibling, 0 replies; 14+ messages in thread
From: Guillaume Nault @ 2017-11-14 11:54 UTC (permalink / raw)
To: Florian Westphal; +Cc: Sebastian Gottschall, stable, pablo
On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
>
>
> > this should belong to you. with the revert the kernel crashes for me
> >
> > [�� 24.838120] Unable to handle kernel NULL pointer dereference at virtual
> > address 00000055
> > [�� 24.846193] pgd = c0004000
> > [�� 24.848893] [00000055] *pgd=00000000
> > [�� 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
> > [�� 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd
> > ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk
> > gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common
> > ath9k_hw ath mac80211 cfg80211 compat
> > [�� 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90
> > [�� 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
> > [�� 24.892921] task: ef029ac0 task.stack: ef05a000
> > [�� 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
> > [�� 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
> > [�� 24.907869] pc : [<c04858c8>]��� lr : [<c04858b4>]��� psr: 60000153
> > [�� 24.907869] sp : ef05bb58� ip : ef05bb58� fp : ef05bb6c
> > [�� 24.919317] r10: ed230cc0� r9 : ed230c00� r8 : edf45800
> > [�� 24.924529] r7 : ebcd2f00� r6 : ec33739e� r5 : c0892294� r4 : ebcd2f00
> > [�� 24.931040] r3 : 00000000� r2 : 00000055� r1 : 00000000� r0 : c0892718
> > [�� 24.937551] Flags: nZCv� IRQs on� FIQs off� Mode SVC_32� ISA ARM� Segment
> > user
> > [�� 24.944755] Control: 10c5387d� Table: 2bd1006a� DAC: 00000055
> > [�� 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
> > [�� 24.956477] Stack: (0xef05bb58 to 0xef05c000)
>
> Guillaume, can you check what I missed?
> IIRC you said you had a revert for your 4.9 tree, correct?
>
Yes, but my backport was on 4.9.18. IIRC some other nat fixes have been
applied in the mean time.
I just arrived back home yesterday night. I'm going look at this in the
afternoon.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-14 11:05 ` Florian Westphal
2017-11-14 11:54 ` Guillaume Nault
@ 2017-11-14 19:30 ` Guillaume Nault
2017-11-14 22:16 ` Sebastian Gottschall
1 sibling, 1 reply; 14+ messages in thread
From: Guillaume Nault @ 2017-11-14 19:30 UTC (permalink / raw)
To: Florian Westphal; +Cc: Sebastian Gottschall, stable, pablo
On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
>
>
> > this should belong to you. with the revert the kernel crashes for me
> >
> > [�� 24.838120] Unable to handle kernel NULL pointer dereference at virtual
> > address 00000055
> > [�� 24.846193] pgd = c0004000
> > [�� 24.848893] [00000055] *pgd=00000000
> > [�� 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
> > [�� 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd
> > ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk
> > gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common
> > ath9k_hw ath mac80211 cfg80211 compat
> > [�� 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90
> > [�� 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
> > [�� 24.892921] task: ef029ac0 task.stack: ef05a000
> > [�� 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
> > [�� 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
> > [�� 24.907869] pc : [<c04858c8>]��� lr : [<c04858b4>]��� psr: 60000153
> > [�� 24.907869] sp : ef05bb58� ip : ef05bb58� fp : ef05bb6c
> > [�� 24.919317] r10: ed230cc0� r9 : ed230c00� r8 : edf45800
> > [�� 24.924529] r7 : ebcd2f00� r6 : ec33739e� r5 : c0892294� r4 : ebcd2f00
> > [�� 24.931040] r3 : 00000000� r2 : 00000055� r1 : 00000000� r0 : c0892718
> > [�� 24.937551] Flags: nZCv� IRQs on� FIQs off� Mode SVC_32� ISA ARM� Segment
> > user
> > [�� 24.944755] Control: 10c5387d� Table: 2bd1006a� DAC: 00000055
> > [�� 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
> > [�� 24.956477] Stack: (0xef05bb58 to 0xef05c000)
>
> Guillaume, can you check what I missed?
> IIRC you said you had a revert for your 4.9 tree, correct?
>
Actually I had forgotten some details when I told you about it. What I
did is simpler than a revert. I just cherry-picked the following
commits:
124dffea9e8e ("netfilter: nat: use atomic bit op to clear the _SRC_NAT_DONE_BIT")
6e699867f84c ("netfilter: nat: avoid use of nf_conn_nat extension")
e1bf1687740c ("netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"")
Then I just had to fix one remaining call:
@@ -842,7 +842,7 @@ static int __init nf_nat_init(void)
return 0;
cleanup_extend:
- rhltable_destroy(&nf_nat_bysource_table);
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
nf_ct_extend_unregister(&nat_extend);
return ret;
}
My original approach was to revert commit
870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable")
and fix the conflicts by hand. But I had crashes when testing the
result. After a bit of debugging, I decided to simply try cherry-picking
commit e1bf1687740c and its dependencies. Since I got better results
with this approach, I dropped my original revert code (so,
unfortunately, I can't compare with yours and I don't remember if the
crash was similar to Sebastian's).
We have the cherry-picked patches running on several machines since
September. No problem found so far.
Now, looking at Sebastian's report, I suspect that using
nf_ct_ext_find(NF_CT_EXT_NAT) in nf_nat_cleanup_conntrack() might not
be appropriate anymore. My approach uses the IPS_SRC_NAT_DONE bit
instead, which is now set atomically. Theses changes weren't
intentional, but introduced by the dependency on commits 124dffea9e8e
and 6e699867f84c.
Does that make sense to you? I can look more deeply into this tomorrow.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-14 19:30 ` Guillaume Nault
@ 2017-11-14 22:16 ` Sebastian Gottschall
2017-11-14 22:44 ` Sebastian Gottschall
0 siblings, 1 reply; 14+ messages in thread
From: Sebastian Gottschall @ 2017-11-14 22:16 UTC (permalink / raw)
To: Guillaume Nault, Florian Westphal; +Cc: stable, pablo
Am 14.11.2017 um 20:30 schrieb Guillaume Nault:
> On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
>> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
>>
>>
>>> this should belong to you. with the revert the kernel crashes for me
>>>
>>> [ 24.838120] Unable to handle kernel NULL pointer dereference at virtual
>>> address 00000055
>>> [ 24.846193] pgd = c0004000
>>> [ 24.848893] [00000055] *pgd=00000000
>>> [ 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
>>> [ 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd
>>> ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk
>>> gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common
>>> ath9k_hw ath mac80211 cfg80211 compat
>>> [ 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90
>>> [ 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
>>> [ 24.892921] task: ef029ac0 task.stack: ef05a000
>>> [ 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
>>> [ 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
>>> [ 24.907869] pc : [<c04858c8>] lr : [<c04858b4>] psr: 60000153
>>> [ 24.907869] sp : ef05bb58 ip : ef05bb58 fp : ef05bb6c
>>> [ 24.919317] r10: ed230cc0 r9 : ed230c00 r8 : edf45800
>>> [ 24.924529] r7 : ebcd2f00 r6 : ec33739e r5 : c0892294 r4 : ebcd2f00
>>> [ 24.931040] r3 : 00000000 r2 : 00000055 r1 : 00000000 r0 : c0892718
>>> [ 24.937551] Flags: nZCv IRQs on FIQs off Mode SVC_32 ISA ARM Segment
>>> user
>>> [ 24.944755] Control: 10c5387d Table: 2bd1006a DAC: 00000055
>>> [ 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
>>> [ 24.956477] Stack: (0xef05bb58 to 0xef05c000)
>>
>> Guillaume, can you check what I missed?
>> IIRC you said you had a revert for your 4.9 tree, correct?
>>
> Actually I had forgotten some details when I told you about it. What I
> did is simpler than a revert. I just cherry-picked the following
> commits:
> 124dffea9e8e ("netfilter: nat: use atomic bit op to clear the _SRC_NAT_DONE_BIT")
> 6e699867f84c ("netfilter: nat: avoid use of nf_conn_nat extension")
> e1bf1687740c ("netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"")
>
> Then I just had to fix one remaining call:
> @@ -842,7 +842,7 @@ static int __init nf_nat_init(void)
> return 0;
>
> cleanup_extend:
> - rhltable_destroy(&nf_nat_bysource_table);
> + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
> nf_ct_extend_unregister(&nat_extend);
> return ret;
> }
>
>
> My original approach was to revert commit
> 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable")
> and fix the conflicts by hand. But I had crashes when testing the
> result. After a bit of debugging, I decided to simply try cherry-picking
> commit e1bf1687740c and its dependencies. Since I got better results
> with this approach, I dropped my original revert code (so,
> unfortunately, I can't compare with yours and I don't remember if the
> crash was similar to Sebastian's).
> We have the cherry-picked patches running on several machines since
> September. No problem found so far.
>
>
> Now, looking at Sebastian's report, I suspect that using
> nf_ct_ext_find(NF_CT_EXT_NAT) in nf_nat_cleanup_conntrack() might not
> be appropriate anymore. My approach uses the IPS_SRC_NAT_DONE bit
> instead, which is now set atomically. Theses changes weren't
> intentional, but introduced by the dependency on commits 124dffea9e8e
> and 6e699867f84c.
>
> Does that make sense to you? I can look more deeply into this tomorrow.
>
i love crashing my devices. let me test this approach
Sebastian
--
Mit freundlichen Grüssen / Regards
Sebastian Gottschall / CTO
NewMedia-NET GmbH - DD-WRT
Firmensitz: Stubenwaldallee 21a, 64625 Bensheim
Registergericht: Amtsgericht Darmstadt, HRB 25473
Geschäftsführer: Peter Steinhäuser, Christian Scheele
http://www.dd-wrt.com
email: s.gottschall@dd-wrt.com
Tel.: +496251-582650 / Fax: +496251-5826565
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-14 22:16 ` Sebastian Gottschall
@ 2017-11-14 22:44 ` Sebastian Gottschall
2017-11-15 10:13 ` Florian Westphal
0 siblings, 1 reply; 14+ messages in thread
From: Sebastian Gottschall @ 2017-11-14 22:44 UTC (permalink / raw)
To: Guillaume Nault, Florian Westphal; +Cc: stable, pablo, Greg Kroah-Hartman
the following patch based on the suggestion by Guillaume works good in
my tests and does not crash anymore
--- nf_nat_core.c (Revision 33766)
+++ nf_nat_core.c (Arbeitskopie)
@@ -678,16 +678,11 @@
/* No one using conntrack by the time this called. */
static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
- struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
-
- if (!nat)
- return;
-
- NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
-
- spin_lock_bh(&nf_nat_lock);
- hlist_del_rcu(&ct->nat_bysource);
- spin_unlock_bh(&nf_nat_lock);
+ if (ct->status & IPS_SRC_NAT_DONE) {
+ spin_lock_bh(&nf_nat_lock);
+ hlist_del_rcu(&ct->nat_bysource);
+ spin_unlock_bh(&nf_nat_lock);
+ }
}
static struct nf_ct_ext_type nat_extend __read_mostly = {
Am 14.11.2017 um 23:16 schrieb Sebastian Gottschall:
> Am 14.11.2017 um 20:30 schrieb Guillaume Nault:
>> On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
>>> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
>>>
>>>
>>>> this should belong to you. with the revert the kernel crashes for me
>>>>
>>>> [ 24.838120] Unable to handle kernel NULL pointer dereference at
>>>> virtual
>>>> address 00000055
>>>> [ 24.846193] pgd = c0004000
>>>> [ 24.848893] [00000055] *pgd=00000000
>>>> [ 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
>>>> [ 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd
>>>> ohci_hcd
>>>> ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk
>>>> gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k
>>>> ath9k_common
>>>> ath9k_hw ath mac80211 cfg80211 compat
>>>> [ 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1
>>>> #90
>>>> [ 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
>>>> [ 24.892921] task: ef029ac0 task.stack: ef05a000
>>>> [ 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
>>>> [ 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
>>>> [ 24.907869] pc : [<c04858c8>] lr : [<c04858b4>] psr: 60000153
>>>> [ 24.907869] sp : ef05bb58 ip : ef05bb58 fp : ef05bb6c
>>>> [ 24.919317] r10: ed230cc0 r9 : ed230c00 r8 : edf45800
>>>> [ 24.924529] r7 : ebcd2f00 r6 : ec33739e r5 : c0892294 r4 :
>>>> ebcd2f00
>>>> [ 24.931040] r3 : 00000000 r2 : 00000055 r1 : 00000000 r0 :
>>>> c0892718
>>>> [ 24.937551] Flags: nZCv IRQs on FIQs off Mode SVC_32 ISA ARM
>>>> Segment
>>>> user
>>>> [ 24.944755] Control: 10c5387d Table: 2bd1006a DAC: 00000055
>>>> [ 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
>>>> [ 24.956477] Stack: (0xef05bb58 to 0xef05c000)
>>> Guillaume, can you check what I missed?
>>> IIRC you said you had a revert for your 4.9 tree, correct?
>>>
>> Actually I had forgotten some details when I told you about it. What I
>> did is simpler than a revert. I just cherry-picked the following
>> commits:
>> 124dffea9e8e ("netfilter: nat: use atomic bit op to clear the
>> _SRC_NAT_DONE_BIT")
>> 6e699867f84c ("netfilter: nat: avoid use of nf_conn_nat extension")
>> e1bf1687740c ("netfilter: nat: Revert "netfilter: nat: convert nat
>> bysrc hash to rhashtable"")
>>
>> Then I just had to fix one remaining call:
>> @@ -842,7 +842,7 @@ static int __init nf_nat_init(void)
>> return 0;
>> cleanup_extend:
>> - rhltable_destroy(&nf_nat_bysource_table);
>> + nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
>> nf_ct_extend_unregister(&nat_extend);
>> return ret;
>> }
>>
>>
>> My original approach was to revert commit
>> 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable")
>> and fix the conflicts by hand. But I had crashes when testing the
>> result. After a bit of debugging, I decided to simply try cherry-picking
>> commit e1bf1687740c and its dependencies. Since I got better results
>> with this approach, I dropped my original revert code (so,
>> unfortunately, I can't compare with yours and I don't remember if the
>> crash was similar to Sebastian's).
>> We have the cherry-picked patches running on several machines since
>> September. No problem found so far.
>>
>>
>> Now, looking at Sebastian's report, I suspect that using
>> nf_ct_ext_find(NF_CT_EXT_NAT) in nf_nat_cleanup_conntrack() might not
>> be appropriate anymore. My approach uses the IPS_SRC_NAT_DONE bit
>> instead, which is now set atomically. Theses changes weren't
>> intentional, but introduced by the dependency on commits 124dffea9e8e
>> and 6e699867f84c.
>>
>> Does that make sense to you? I can look more deeply into this tomorrow.
>>
> i love crashing my devices. let me test this approach
>
>
> Sebastian
>
>
>
--
Mit freundlichen Grüssen / Regards
Sebastian Gottschall / CTO
NewMedia-NET GmbH - DD-WRT
Firmensitz: Stubenwaldallee 21a, 64625 Bensheim
Registergericht: Amtsgericht Darmstadt, HRB 25473
Geschäftsführer: Peter Steinhäuser, Christian Scheele
http://www.dd-wrt.com
email: s.gottschall@dd-wrt.com
Tel.: +496251-582650 / Fax: +496251-5826565
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-14 22:44 ` Sebastian Gottschall
@ 2017-11-15 10:13 ` Florian Westphal
2017-11-15 14:04 ` Greg Kroah-Hartman
2017-11-15 14:11 ` Pablo Neira Ayuso
0 siblings, 2 replies; 14+ messages in thread
From: Florian Westphal @ 2017-11-15 10:13 UTC (permalink / raw)
To: Sebastian Gottschall
Cc: Guillaume Nault, Florian Westphal, stable, pablo,
Greg Kroah-Hartman
Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
> the following patch based on the suggestion by Guillaume works good in my
> tests and does not crash anymore
>
> --- nf_nat_core.cᅵᅵᅵᅵᅵᅵ (Revision 33766)
> +++ nf_nat_core.cᅵᅵᅵᅵᅵᅵ (Arbeitskopie)
> @@ -678,16 +678,11 @@
> ᅵ/* No one using conntrack by the time this called. */
> ᅵstatic void nf_nat_cleanup_conntrack(struct nf_conn *ct)
> ᅵ{
> -ᅵᅵᅵᅵᅵᅵ struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
> -
> -ᅵᅵᅵᅵᅵᅵ if (!nat)
> -ᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵ return;
> -
> -ᅵᅵᅵᅵᅵᅵ NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
> -
> -ᅵᅵᅵᅵᅵᅵ spin_lock_bh(&nf_nat_lock);
> -ᅵᅵᅵᅵᅵᅵ hlist_del_rcu(&ct->nat_bysource);
> -ᅵᅵᅵᅵᅵᅵ spin_unlock_bh(&nf_nat_lock);
> +ᅵᅵᅵᅵᅵᅵ if (ct->status & IPS_SRC_NAT_DONE) {
> +ᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵ spin_lock_bh(&nf_nat_lock);
> +ᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵ hlist_del_rcu(&ct->nat_bysource);
> +ᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵ spin_unlock_bh(&nf_nat_lock);
> +ᅵᅵᅵᅵᅵᅵ }
> ᅵ}
FWIW I can now reproduce the crash on 4.9-stable + backport.
This crash can be triggered by a simple
iptables -I INPUT 1 -j DROP
on first (new) incoming packet.
Problem is that as not SRC nat transformation is there for
inbound connections such conntrack isn't yet on nat_bysource list
so hlist_del_rcu() gets garbage input.
bug was added in
7c9664351980aaa6a4b8837a314360b3a4ad382a
("netfilter: move nat hlist_head to nf_conn").
... and then 'masked' by the rhashtable conversion (removing non-existing
rhashtable element has no ill effects).
The revert upstream and the 4.13.y one had no such issue because of earlier
change 6e699867f84c0f358fed233fe6162173aca28e04
("netfilter: nat: avoid use of nf_conn_nat extension"), which replaces
if (!nat) condition with the if (ct->status & IPS_SRC_NAT_DONE) test
quoted above.
I will have an updated patch shortly, I think best solution is to just
cherry-pick 6e699867f84c0f358fed233fe6162173aca28e04 in 4.9.y too
and update this backport accordingly.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-15 10:13 ` Florian Westphal
@ 2017-11-15 14:04 ` Greg Kroah-Hartman
2017-11-15 14:16 ` Pablo Neira Ayuso
2017-11-15 14:11 ` Pablo Neira Ayuso
1 sibling, 1 reply; 14+ messages in thread
From: Greg Kroah-Hartman @ 2017-11-15 14:04 UTC (permalink / raw)
To: Florian Westphal; +Cc: Sebastian Gottschall, Guillaume Nault, stable, pablo
On Wed, Nov 15, 2017 at 11:13:00AM +0100, Florian Westphal wrote:
> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
> > the following patch based on the suggestion by Guillaume works good in my
> > tests and does not crash anymore
> >
> > --- nf_nat_core.c������ (Revision 33766)
> > +++ nf_nat_core.c������ (Arbeitskopie)
> > @@ -678,16 +678,11 @@
> > �/* No one using conntrack by the time this called. */
> > �static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
> > �{
> > -������ struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
> > -
> > -������ if (!nat)
> > -�������������� return;
> > -
> > -������ NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
> > -
> > -������ spin_lock_bh(&nf_nat_lock);
> > -������ hlist_del_rcu(&ct->nat_bysource);
> > -������ spin_unlock_bh(&nf_nat_lock);
> > +������ if (ct->status & IPS_SRC_NAT_DONE) {
> > +�������������� spin_lock_bh(&nf_nat_lock);
> > +�������������� hlist_del_rcu(&ct->nat_bysource);
> > +�������������� spin_unlock_bh(&nf_nat_lock);
> > +������ }
> > �}
>
> FWIW I can now reproduce the crash on 4.9-stable + backport.
> This crash can be triggered by a simple
>
> iptables -I INPUT 1 -j DROP
>
> on first (new) incoming packet.
> Problem is that as not SRC nat transformation is there for
> inbound connections such conntrack isn't yet on nat_bysource list
> so hlist_del_rcu() gets garbage input.
>
> bug was added in
> 7c9664351980aaa6a4b8837a314360b3a4ad382a
> ("netfilter: move nat hlist_head to nf_conn").
>
> ... and then 'masked' by the rhashtable conversion (removing non-existing
> rhashtable element has no ill effects).
>
> The revert upstream and the 4.13.y one had no such issue because of earlier
> change 6e699867f84c0f358fed233fe6162173aca28e04
> ("netfilter: nat: avoid use of nf_conn_nat extension"), which replaces
> if (!nat) condition with the if (ct->status & IPS_SRC_NAT_DONE) test
> quoted above.
>
> I will have an updated patch shortly, I think best solution is to just
> cherry-pick 6e699867f84c0f358fed233fe6162173aca28e04 in 4.9.y too
> and update this backport accordingly.
Ok, let me just drop this patch, get this 4.9 release out, and then can
you all send me a patch, or series of patches, that you feel should be
applied to 4.9 to resolve this issue?
thanks,
greg k-h
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-15 10:13 ` Florian Westphal
2017-11-15 14:04 ` Greg Kroah-Hartman
@ 2017-11-15 14:11 ` Pablo Neira Ayuso
1 sibling, 0 replies; 14+ messages in thread
From: Pablo Neira Ayuso @ 2017-11-15 14:11 UTC (permalink / raw)
To: Florian Westphal
Cc: Sebastian Gottschall, Guillaume Nault, stable, Greg Kroah-Hartman
On Wed, Nov 15, 2017 at 11:13:00AM +0100, Florian Westphal wrote:
> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
> > the following patch based on the suggestion by Guillaume works good in my
> > tests and does not crash anymore
> >
> > --- nf_nat_core.c������ (Revision 33766)
> > +++ nf_nat_core.c������ (Arbeitskopie)
> > @@ -678,16 +678,11 @@
> > �/* No one using conntrack by the time this called. */
> > �static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
> > �{
> > -������ struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
> > -
> > -������ if (!nat)
> > -�������������� return;
> > -
> > -������ NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
> > -
> > -������ spin_lock_bh(&nf_nat_lock);
> > -������ hlist_del_rcu(&ct->nat_bysource);
> > -������ spin_unlock_bh(&nf_nat_lock);
> > +������ if (ct->status & IPS_SRC_NAT_DONE) {
> > +�������������� spin_lock_bh(&nf_nat_lock);
> > +�������������� hlist_del_rcu(&ct->nat_bysource);
> > +�������������� spin_unlock_bh(&nf_nat_lock);
> > +������ }
> > �}
>
> FWIW I can now reproduce the crash on 4.9-stable + backport.
> This crash can be triggered by a simple
>
> iptables -I INPUT 1 -j DROP
>
> on first (new) incoming packet.
> Problem is that as not SRC nat transformation is there for
> inbound connections such conntrack isn't yet on nat_bysource list
> so hlist_del_rcu() gets garbage input.
>
> bug was added in
> 7c9664351980aaa6a4b8837a314360b3a4ad382a
> ("netfilter: move nat hlist_head to nf_conn").
>
> ... and then 'masked' by the rhashtable conversion (removing non-existing
> rhashtable element has no ill effects).
>
> The revert upstream and the 4.13.y one had no such issue because of earlier
> change 6e699867f84c0f358fed233fe6162173aca28e04
> ("netfilter: nat: avoid use of nf_conn_nat extension"), which replaces
> if (!nat) condition with the if (ct->status & IPS_SRC_NAT_DONE) test
> quoted above.
>
> I will have an updated patch shortly, I think best solution is to just
> cherry-pick 6e699867f84c0f358fed233fe6162173aca28e04 in 4.9.y too
> and update this backport accordingly.
That's fine. Let's just make sure we get them both patches in together
in the same go.
So I think it's better if we just keep back this cherry-pick until we
have the necessary backport in place.
OK?
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-15 14:04 ` Greg Kroah-Hartman
@ 2017-11-15 14:16 ` Pablo Neira Ayuso
2017-11-15 14:51 ` Greg Kroah-Hartman
2017-11-15 22:57 ` Sebastian Gottschall
0 siblings, 2 replies; 14+ messages in thread
From: Pablo Neira Ayuso @ 2017-11-15 14:16 UTC (permalink / raw)
To: Greg Kroah-Hartman
Cc: Florian Westphal, Sebastian Gottschall, Guillaume Nault, stable
On Wed, Nov 15, 2017 at 03:04:06PM +0100, Greg Kroah-Hartman wrote:
> Ok, let me just drop this patch, get this 4.9 release out, and then can
> you all send me a patch, or series of patches, that you feel should be
> applied to 4.9 to resolve this issue?
Please, indeed drop it for 4.9. We'll take the time here to sort out
things.
Thanks Greg.
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-15 14:16 ` Pablo Neira Ayuso
@ 2017-11-15 14:51 ` Greg Kroah-Hartman
2017-11-15 22:57 ` Sebastian Gottschall
1 sibling, 0 replies; 14+ messages in thread
From: Greg Kroah-Hartman @ 2017-11-15 14:51 UTC (permalink / raw)
To: Pablo Neira Ayuso
Cc: Florian Westphal, Sebastian Gottschall, Guillaume Nault, stable
On Wed, Nov 15, 2017 at 03:16:48PM +0100, Pablo Neira Ayuso wrote:
> On Wed, Nov 15, 2017 at 03:04:06PM +0100, Greg Kroah-Hartman wrote:
> > Ok, let me just drop this patch, get this 4.9 release out, and then can
> > you all send me a patch, or series of patches, that you feel should be
> > applied to 4.9 to resolve this issue?
>
> Please, indeed drop it for 4.9. We'll take the time here to sort out
> things.
Now dropped, thanks all.
greg k-h
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
2017-11-15 14:16 ` Pablo Neira Ayuso
2017-11-15 14:51 ` Greg Kroah-Hartman
@ 2017-11-15 22:57 ` Sebastian Gottschall
1 sibling, 0 replies; 14+ messages in thread
From: Sebastian Gottschall @ 2017-11-15 22:57 UTC (permalink / raw)
To: Pablo Neira Ayuso, Greg Kroah-Hartman
Cc: Florian Westphal, Guillaume Nault, stable
[-- Attachment #1: Type: text/plain, Size: 939 bytes --]
Am 15.11.2017 um 15:16 schrieb Pablo Neira Ayuso:
> On Wed, Nov 15, 2017 at 03:04:06PM +0100, Greg Kroah-Hartman wrote:
>> Ok, let me just drop this patch, get this 4.9 release out, and then can
>> you all send me a patch, or series of patches, that you feel should be
>> applied to 4.9 to resolve this issue?
> Please, indeed drop it for 4.9. We'll take the time here to sort out
> things.
>
> Thanks Greg.
>
@florian:
this is the cherry picked variant which applies clean to the current 4.9
series. since i'm not the author, please resubmit it from your side
using the correct format
--
Mit freundlichen Grüssen / Regards
Sebastian Gottschall / CTO
NewMedia-NET GmbH - DD-WRT
Firmensitz: Stubenwaldallee 21a, 64625 Bensheim
Registergericht: Amtsgericht Darmstadt, HRB 25473
Geschäftsführer: Peter Steinhäuser, Christian Scheele
http://www.dd-wrt.com
email: s.gottschall@dd-wrt.com
Tel.: +496251-582650 / Fax: +496251-5826565
[-- Attachment #2: revert_nat_bysrc_hash_to_rhashtable.patch --]
[-- Type: text/plain, Size: 8996 bytes --]
--- include/net/netfilter/nf_conntrack.h (revision 33719)
+++ include/net/netfilter/nf_conntrack.h (working copy)
@@ -17,7 +17,6 @@
#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/atomic.h>
-#include <linux/rhashtable.h>
#include <linux/netfilter/nf_conntrack_tcp.h>
#include <linux/netfilter/nf_conntrack_dccp.h>
@@ -101,7 +100,7 @@
possible_net_t ct_net;
#if IS_ENABLED(CONFIG_NF_NAT)
- struct rhlist_head nat_bysource;
+ struct hlist_node nat_bysource;
#endif
/* all members below initialized via memset */
u8 __nfct_init_offset[0];
Index: include/net/netfilter/nf_nat.h
===================================================================
--- include/net/netfilter/nf_nat.h (revision 33719)
+++ include/net/netfilter/nf_nat.h (working copy)
@@ -1,6 +1,5 @@
#ifndef _NF_NAT_H
#define _NF_NAT_H
-#include <linux/rhashtable.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter/nf_nat.h>
#include <net/netfilter/nf_conntrack_tuple.h>
--- net/netfilter/nf_nat_core.c (revision 33719)
+++ net/netfilter/nf_nat_core.c (working copy)
@@ -30,6 +30,8 @@
#include <net/netfilter/nf_conntrack_zones.h>
#include <linux/netfilter/nf_nat.h>
+static DEFINE_SPINLOCK(nf_nat_lock);
+
static DEFINE_MUTEX(nf_nat_proto_mutex);
static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
__read_mostly;
@@ -36,14 +38,10 @@
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
__read_mostly;
-struct nf_nat_conn_key {
- const struct net *net;
- const struct nf_conntrack_tuple *tuple;
- const struct nf_conntrack_zone *zone;
-};
+static struct hlist_head *nf_nat_bysource __read_mostly;
+static unsigned int nf_nat_htable_size __read_mostly;
+static unsigned int nf_nat_hash_rnd __read_mostly;
-static struct rhltable nf_nat_bysource_table;
-
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
{
@@ -121,17 +119,19 @@
EXPORT_SYMBOL(nf_xfrm_me_harder);
#endif /* CONFIG_XFRM */
-static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed)
+/* We keep an extra hash for each conntrack, for fast searching. */
+static inline unsigned int
+hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
- const struct nf_conntrack_tuple *t;
- const struct nf_conn *ct = data;
+ unsigned int hash;
- t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+ get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+
/* Original src, to ensure we map it consistently if poss. */
+ hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
+ tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
- seed ^= net_hash_mix(nf_ct_net(ct));
- return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
- t->dst.protonum ^ seed);
+ return reciprocal_scale(hash, nf_nat_htable_size);
}
/* Is this tuple already taken? (not by us) */
@@ -187,28 +187,6 @@
t->src.u.all == tuple->src.u.all);
}
-static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
- const void *obj)
-{
- const struct nf_nat_conn_key *key = arg->key;
- const struct nf_conn *ct = obj;
-
- if (!same_src(ct, key->tuple) ||
- !net_eq(nf_ct_net(ct), key->net) ||
- !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
- return 1;
-
- return 0;
-}
-
-static struct rhashtable_params nf_nat_bysource_params = {
- .head_offset = offsetof(struct nf_conn, nat_bysource),
- .obj_hashfn = nf_nat_bysource_hash,
- .obj_cmpfn = nf_nat_bysource_cmp,
- .nelem_hint = 256,
- .min_size = 1024,
-};
-
/* Only called for SRC manip */
static int
find_appropriate_src(struct net *net,
@@ -219,26 +197,22 @@
struct nf_conntrack_tuple *result,
const struct nf_nat_range *range)
{
+ unsigned int h = hash_by_src(net, tuple);
const struct nf_conn *ct;
- struct nf_nat_conn_key key = {
- .net = net,
- .tuple = tuple,
- .zone = zone
- };
- struct rhlist_head *hl, *h;
- hl = rhltable_lookup(&nf_nat_bysource_table, &key,
- nf_nat_bysource_params);
+ hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
+ if (same_src(ct, tuple) &&
+ net_eq(net, nf_ct_net(ct)) &&
+ nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
+ /* Copy source part from reply tuple. */
+ nf_ct_invert_tuplepr(result,
+ &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+ result->dst = tuple->dst;
- rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) {
- nf_ct_invert_tuplepr(result,
- &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
- result->dst = tuple->dst;
-
- if (in_range(l3proto, l4proto, result, range))
- return 1;
+ if (in_range(l3proto, l4proto, result, range))
+ return 1;
+ }
}
-
return 0;
}
@@ -411,6 +385,7 @@
const struct nf_nat_range *range,
enum nf_nat_manip_type maniptype)
{
+ struct net *net = nf_ct_net(ct);
struct nf_conntrack_tuple curr_tuple, new_tuple;
struct nf_conn_nat *nat;
@@ -452,19 +427,16 @@
}
if (maniptype == NF_NAT_MANIP_SRC) {
- struct nf_nat_conn_key key = {
- .net = nf_ct_net(ct),
- .tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
- .zone = nf_ct_zone(ct),
- };
- int err;
+ unsigned int srchash;
- err = rhltable_insert_key(&nf_nat_bysource_table,
- &key,
- &ct->nat_bysource,
- nf_nat_bysource_params);
- if (err)
- return NF_DROP;
+ srchash = hash_by_src(net,
+ &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+ spin_lock_bh(&nf_nat_lock);
+ /* nf_conntrack_alter_reply might re-allocate extension aera */
+ nat = nfct_nat(ct);
+ hlist_add_head_rcu(&ct->nat_bysource,
+ &nf_nat_bysource[srchash]);
+ spin_unlock_bh(&nf_nat_lock);
}
/* It's done. */
@@ -550,11 +522,7 @@
static int nf_nat_proto_remove(struct nf_conn *i, void *data)
{
const struct nf_nat_proto_clean *clean = data;
- struct nf_conn_nat *nat = nfct_nat(i);
- if (!nat)
- return 0;
-
if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
(clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
return 0;
@@ -564,12 +532,10 @@
static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
{
- struct nf_conn_nat *nat = nfct_nat(ct);
-
if (nf_nat_proto_remove(ct, data))
return 1;
- if (!nat)
+ if ((ct->status & IPS_SRC_NAT_DONE) == 0)
return 0;
/* This netns is being destroyed, and conntrack has nat null binding.
@@ -578,9 +544,10 @@
* Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
* will delete entry from already-freed table.
*/
+ spin_lock_bh(&nf_nat_lock);
+ hlist_del_rcu(&ct->nat_bysource);
ct->status &= ~IPS_NAT_DONE_MASK;
- rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
- nf_nat_bysource_params);
+ spin_unlock_bh(&nf_nat_lock);
/* don't delete conntrack. Although that would make things a lot
* simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -705,13 +672,11 @@
/* No one using conntrack by the time this called. */
static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
{
- struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
-
- if (!nat)
- return;
-
- rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
- nf_nat_bysource_params);
+ if (ct->status & IPS_SRC_NAT_DONE) {
+ spin_lock_bh(&nf_nat_lock);
+ hlist_del_rcu(&ct->nat_bysource);
+ spin_unlock_bh(&nf_nat_lock);
+ }
}
static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -846,13 +811,16 @@
{
int ret;
- ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
- if (ret)
- return ret;
+ /* Leave them the same for the moment. */
+ nf_nat_htable_size = nf_conntrack_htable_size;
+ nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
+ if (!nf_nat_bysource)
+ return -ENOMEM;
+
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
- rhltable_destroy(&nf_nat_bysource_table);
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
return ret;
}
@@ -876,7 +844,7 @@
return 0;
cleanup_extend:
- rhltable_destroy(&nf_nat_bysource_table);
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
nf_ct_extend_unregister(&nat_extend);
return ret;
}
@@ -896,8 +864,8 @@
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
-
- rhltable_destroy(&nf_nat_bysource_table);
+ synchronize_net();
+ nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
}
MODULE_LICENSE("GPL");
Index: net/netfilter/nf_conntrack_core.c
===================================================================
--- net/netfilter/nf_conntrack_core.c (revision 33719)
+++ net/netfilter/nf_conntrack_core.c (working copy)
@@ -698,7 +698,7 @@
l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
if (l4proto->allow_clash &&
- !nfct_nat(ct) &&
+ ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
!nf_ct_is_dying(ct) &&
atomic_inc_not_zero(&ct->ct_general.use)) {
nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);
^ permalink raw reply [flat|nested] 14+ messages in thread
end of thread, other threads:[~2017-11-15 22:57 UTC | newest]
Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-11-12 20:23 [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable" Florian Westphal
2017-11-13 8:36 ` Greg KH
2017-11-14 7:07 ` Sebastian Gottschall
2017-11-14 11:05 ` Florian Westphal
2017-11-14 11:54 ` Guillaume Nault
2017-11-14 19:30 ` Guillaume Nault
2017-11-14 22:16 ` Sebastian Gottschall
2017-11-14 22:44 ` Sebastian Gottschall
2017-11-15 10:13 ` Florian Westphal
2017-11-15 14:04 ` Greg Kroah-Hartman
2017-11-15 14:16 ` Pablo Neira Ayuso
2017-11-15 14:51 ` Greg Kroah-Hartman
2017-11-15 22:57 ` Sebastian Gottschall
2017-11-15 14:11 ` Pablo Neira Ayuso
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).