stable.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
@ 2017-11-12 20:23 Florian Westphal
  2017-11-13  8:36 ` Greg KH
  2017-11-14  7:07 ` Sebastian Gottschall
  0 siblings, 2 replies; 14+ messages in thread
From: Florian Westphal @ 2017-11-12 20:23 UTC (permalink / raw)
  To: stable; +Cc: g.nault, pablo, Florian Westphal

commit e1bf1687740ce1a3598a1c5e452b852ff2190682 upstream.

This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.

It was not a good idea. The custom hash table was a much better
fit for this purpose.

A fast lookup is not essential, in fact for most cases there is no lookup
at all because original tuple is not taken and can be used as-is.
What needs to be fast is insertion and deletion.

rhlist removal however requires a rhlist walk.
We can have thousands of entries in such a list if source port/addresses
are reused for multiple flows, if this happens removal requests are so
expensive that deletions of a few thousand flows can take several
seconds(!).

The advantages that we got from rhashtable are:
1) table auto-sizing
2) multiple locks

1) would be nice to have, but it is not essential as we have at
most one lookup per new flow, so even a million flows in the bysource
table are not a problem compared to current deletion cost.
2) is easy to add to custom hash table.

I tried to add hlist_node to rhlist to speed up rhltable_remove but this
isn't doable without changing semantics.  rhltable_remove_fast will
check that the to-be-deleted object is part of the table and that
requires a list walk that we want to avoid.

Furthermore, using hlist_node increases size of struct rhlist_head, which
in turn increases nf_conn size.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821
Reported-by: Ivan Babrou <ibobrik@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Hi Greg,

 this is a 4.9.y backport of the same revert that already went into 4.13.y;
 please consider applying this.  I briefly tested this in kvm, at very least
 it doesn't break nat redirect in obvious ways...

 include/net/netfilter/nf_conntrack.h |   3 +-
 include/net/netfilter/nf_nat.h       |   1 -
 net/netfilter/nf_nat_core.c          | 133 +++++++++++++++--------------------
 3 files changed, 57 insertions(+), 80 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index d9d52c020a70..9ae819e27940 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -17,7 +17,6 @@
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/atomic.h>
-#include <linux/rhashtable.h>
 
 #include <linux/netfilter/nf_conntrack_tcp.h>
 #include <linux/netfilter/nf_conntrack_dccp.h>
@@ -101,7 +100,7 @@ struct nf_conn {
 	possible_net_t ct_net;
 
 #if IS_ENABLED(CONFIG_NF_NAT)
-	struct rhlist_head nat_bysource;
+	struct hlist_node	nat_bysource;
 #endif
 	/* all members below initialized via memset */
 	u8 __nfct_init_offset[0];
diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
index c327a431a6f3..02515f7ed4cc 100644
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -1,6 +1,5 @@
 #ifndef _NF_NAT_H
 #define _NF_NAT_H
-#include <linux/rhashtable.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter/nf_nat.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 2916f4815c9c..87d67e714dc4 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -30,19 +30,17 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <linux/netfilter/nf_nat.h>
 
+static DEFINE_SPINLOCK(nf_nat_lock);
+
 static DEFINE_MUTEX(nf_nat_proto_mutex);
 static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
 						__read_mostly;
 static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
 						__read_mostly;
 
-struct nf_nat_conn_key {
-	const struct net *net;
-	const struct nf_conntrack_tuple *tuple;
-	const struct nf_conntrack_zone *zone;
-};
-
-static struct rhltable nf_nat_bysource_table;
+static struct hlist_head *nf_nat_bysource __read_mostly;
+static unsigned int nf_nat_htable_size __read_mostly;
+static unsigned int nf_nat_hash_rnd __read_mostly;
 
 inline const struct nf_nat_l3proto *
 __nf_nat_l3proto_find(u8 family)
@@ -121,17 +119,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
 EXPORT_SYMBOL(nf_xfrm_me_harder);
 #endif /* CONFIG_XFRM */
 
-static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed)
+/* We keep an extra hash for each conntrack, for fast searching. */
+static inline unsigned int
+hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
 {
-	const struct nf_conntrack_tuple *t;
-	const struct nf_conn *ct = data;
+	unsigned int hash;
+
+	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
 
-	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
 	/* Original src, to ensure we map it consistently if poss. */
+	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
+		      tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
 
-	seed ^= net_hash_mix(nf_ct_net(ct));
-	return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
-		      t->dst.protonum ^ seed);
+	return reciprocal_scale(hash, nf_nat_htable_size);
 }
 
 /* Is this tuple already taken? (not by us) */
@@ -187,28 +187,6 @@ same_src(const struct nf_conn *ct,
 		t->src.u.all == tuple->src.u.all);
 }
 
-static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
-			       const void *obj)
-{
-	const struct nf_nat_conn_key *key = arg->key;
-	const struct nf_conn *ct = obj;
-
-	if (!same_src(ct, key->tuple) ||
-	    !net_eq(nf_ct_net(ct), key->net) ||
-	    !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
-		return 1;
-
-	return 0;
-}
-
-static struct rhashtable_params nf_nat_bysource_params = {
-	.head_offset = offsetof(struct nf_conn, nat_bysource),
-	.obj_hashfn = nf_nat_bysource_hash,
-	.obj_cmpfn = nf_nat_bysource_cmp,
-	.nelem_hint = 256,
-	.min_size = 1024,
-};
-
 /* Only called for SRC manip */
 static int
 find_appropriate_src(struct net *net,
@@ -219,26 +197,22 @@ find_appropriate_src(struct net *net,
 		     struct nf_conntrack_tuple *result,
 		     const struct nf_nat_range *range)
 {
+	unsigned int h = hash_by_src(net, tuple);
 	const struct nf_conn *ct;
-	struct nf_nat_conn_key key = {
-		.net = net,
-		.tuple = tuple,
-		.zone = zone
-	};
-	struct rhlist_head *hl, *h;
-
-	hl = rhltable_lookup(&nf_nat_bysource_table, &key,
-			     nf_nat_bysource_params);
 
-	rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) {
-		nf_ct_invert_tuplepr(result,
-				     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-		result->dst = tuple->dst;
-
-		if (in_range(l3proto, l4proto, result, range))
-			return 1;
+	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
+		if (same_src(ct, tuple) &&
+		    net_eq(net, nf_ct_net(ct)) &&
+		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
+			/* Copy source part from reply tuple. */
+			nf_ct_invert_tuplepr(result,
+				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+			result->dst = tuple->dst;
+
+			if (in_range(l3proto, l4proto, result, range))
+				return 1;
+		}
 	}
-
 	return 0;
 }
 
@@ -411,6 +385,7 @@ nf_nat_setup_info(struct nf_conn *ct,
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype)
 {
+	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_tuple curr_tuple, new_tuple;
 	struct nf_conn_nat *nat;
 
@@ -452,19 +427,16 @@ nf_nat_setup_info(struct nf_conn *ct,
 	}
 
 	if (maniptype == NF_NAT_MANIP_SRC) {
-		struct nf_nat_conn_key key = {
-			.net = nf_ct_net(ct),
-			.tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
-			.zone = nf_ct_zone(ct),
-		};
-		int err;
-
-		err = rhltable_insert_key(&nf_nat_bysource_table,
-					  &key,
-					  &ct->nat_bysource,
-					  nf_nat_bysource_params);
-		if (err)
-			return NF_DROP;
+		unsigned int srchash;
+
+		srchash = hash_by_src(net,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		spin_lock_bh(&nf_nat_lock);
+		/* nf_conntrack_alter_reply might re-allocate extension aera */
+		nat = nfct_nat(ct);
+		hlist_add_head_rcu(&ct->nat_bysource,
+				   &nf_nat_bysource[srchash]);
+		spin_unlock_bh(&nf_nat_lock);
 	}
 
 	/* It's done. */
@@ -578,9 +550,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
 	 * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
 	 * will delete entry from already-freed table.
 	 */
+	spin_lock_bh(&nf_nat_lock);
+	hlist_del_rcu(&ct->nat_bysource);
 	ct->status &= ~IPS_NAT_DONE_MASK;
-	rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
-			nf_nat_bysource_params);
+	spin_unlock_bh(&nf_nat_lock);
 
 	/* don't delete conntrack.  Although that would make things a lot
 	 * simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -710,8 +683,11 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 	if (!nat)
 		return;
 
-	rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
-			nf_nat_bysource_params);
+	NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
+
+	spin_lock_bh(&nf_nat_lock);
+	hlist_del_rcu(&ct->nat_bysource);
+	spin_unlock_bh(&nf_nat_lock);
 }
 
 static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -846,13 +822,16 @@ static int __init nf_nat_init(void)
 {
 	int ret;
 
-	ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
-	if (ret)
-		return ret;
+	/* Leave them the same for the moment. */
+	nf_nat_htable_size = nf_conntrack_htable_size;
+
+	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
+	if (!nf_nat_bysource)
+		return -ENOMEM;
 
 	ret = nf_ct_extend_register(&nat_extend);
 	if (ret < 0) {
-		rhltable_destroy(&nf_nat_bysource_table);
+		nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
 		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
 		return ret;
 	}
@@ -876,7 +855,7 @@ static int __init nf_nat_init(void)
 	return 0;
 
  cleanup_extend:
-	rhltable_destroy(&nf_nat_bysource_table);
+	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
 	nf_ct_extend_unregister(&nat_extend);
 	return ret;
 }
@@ -896,8 +875,8 @@ static void __exit nf_nat_cleanup(void)
 
 	for (i = 0; i < NFPROTO_NUMPROTO; i++)
 		kfree(nf_nat_l4protos[i]);
-
-	rhltable_destroy(&nf_nat_bysource_table);
+	synchronize_net();
+	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
 }
 
 MODULE_LICENSE("GPL");
-- 
2.13.6

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-12 20:23 [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable" Florian Westphal
@ 2017-11-13  8:36 ` Greg KH
  2017-11-14  7:07 ` Sebastian Gottschall
  1 sibling, 0 replies; 14+ messages in thread
From: Greg KH @ 2017-11-13  8:36 UTC (permalink / raw)
  To: Florian Westphal; +Cc: stable, g.nault, pablo

On Sun, Nov 12, 2017 at 09:23:47PM +0100, Florian Westphal wrote:
> commit e1bf1687740ce1a3598a1c5e452b852ff2190682 upstream.
> 
> This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.
> 
> It was not a good idea. The custom hash table was a much better
> fit for this purpose.
> 
> A fast lookup is not essential, in fact for most cases there is no lookup
> at all because original tuple is not taken and can be used as-is.
> What needs to be fast is insertion and deletion.
> 
> rhlist removal however requires a rhlist walk.
> We can have thousands of entries in such a list if source port/addresses
> are reused for multiple flows, if this happens removal requests are so
> expensive that deletions of a few thousand flows can take several
> seconds(!).
> 
> The advantages that we got from rhashtable are:
> 1) table auto-sizing
> 2) multiple locks
> 
> 1) would be nice to have, but it is not essential as we have at
> most one lookup per new flow, so even a million flows in the bysource
> table are not a problem compared to current deletion cost.
> 2) is easy to add to custom hash table.
> 
> I tried to add hlist_node to rhlist to speed up rhltable_remove but this
> isn't doable without changing semantics.  rhltable_remove_fast will
> check that the to-be-deleted object is part of the table and that
> requires a list walk that we want to avoid.
> 
> Furthermore, using hlist_node increases size of struct rhlist_head, which
> in turn increases nf_conn size.
> 
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821
> Reported-by: Ivan Babrou <ibobrik@gmail.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
>  Hi Greg,
> 
>  this is a 4.9.y backport of the same revert that already went into 4.13.y;
>  please consider applying this.  I briefly tested this in kvm, at very least
>  it doesn't break nat redirect in obvious ways...

Very nice, thanks for this, now queued up.

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-12 20:23 [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable" Florian Westphal
  2017-11-13  8:36 ` Greg KH
@ 2017-11-14  7:07 ` Sebastian Gottschall
  2017-11-14 11:05   ` Florian Westphal
  1 sibling, 1 reply; 14+ messages in thread
From: Sebastian Gottschall @ 2017-11-14  7:07 UTC (permalink / raw)
  To: Florian Westphal, stable; +Cc: g.nault, pablo

this should belong to you. with the revert the kernel crashes for me

[   24.838120] Unable to handle kernel NULL pointer dereference at 
virtual address 00000055
[   24.846193] pgd = c0004000
[   24.848893] [00000055] *pgd=00000000
[   24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
[   24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd 
ohci_hcd ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base 
qca_ssdk gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k 
ath9k_common ath9k_hw ath mac80211 cfg80211 compat
[   24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90
[   24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
[   24.892921] task: ef029ac0 task.stack: ef05a000
[   24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
[   24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
[   24.907869] pc : [<c04858c8>]    lr : [<c04858b4>]    psr: 60000153
[   24.907869] sp : ef05bb58  ip : ef05bb58  fp : ef05bb6c
[   24.919317] r10: ed230cc0  r9 : ed230c00  r8 : edf45800
[   24.924529] r7 : ebcd2f00  r6 : ec33739e  r5 : c0892294  r4 : ebcd2f00
[   24.931040] r3 : 00000000  r2 : 00000055  r1 : 00000000  r0 : c0892718
[   24.937551] Flags: nZCv  IRQs on  FIQs off  Mode SVC_32  ISA ARM  
Segment user
[   24.944755] Control: 10c5387d  Table: 2bd1006a  DAC: 00000055
[   24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
[   24.956477] Stack: (0xef05bb58 to 0xef05c000)



Am 12.11.2017 um 21:23 schrieb Florian Westphal:
> commit e1bf1687740ce1a3598a1c5e452b852ff2190682 upstream.
>
> This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.
>
> It was not a good idea. The custom hash table was a much better
> fit for this purpose.
>
> A fast lookup is not essential, in fact for most cases there is no lookup
> at all because original tuple is not taken and can be used as-is.
> What needs to be fast is insertion and deletion.
>
> rhlist removal however requires a rhlist walk.
> We can have thousands of entries in such a list if source port/addresses
> are reused for multiple flows, if this happens removal requests are so
> expensive that deletions of a few thousand flows can take several
> seconds(!).
>
> The advantages that we got from rhashtable are:
> 1) table auto-sizing
> 2) multiple locks
>
> 1) would be nice to have, but it is not essential as we have at
> most one lookup per new flow, so even a million flows in the bysource
> table are not a problem compared to current deletion cost.
> 2) is easy to add to custom hash table.
>
> I tried to add hlist_node to rhlist to speed up rhltable_remove but this
> isn't doable without changing semantics.  rhltable_remove_fast will
> check that the to-be-deleted object is part of the table and that
> requires a list walk that we want to avoid.
>
> Furthermore, using hlist_node increases size of struct rhlist_head, which
> in turn increases nf_conn size.
>
> Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821
> Reported-by: Ivan Babrou <ibobrik@gmail.com>
> Signed-off-by: Florian Westphal <fw@strlen.de>
> Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
> ---
>   Hi Greg,
>
>   this is a 4.9.y backport of the same revert that already went into 4.13.y;
>   please consider applying this.  I briefly tested this in kvm, at very least
>   it doesn't break nat redirect in obvious ways...
>
>   include/net/netfilter/nf_conntrack.h |   3 +-
>   include/net/netfilter/nf_nat.h       |   1 -
>   net/netfilter/nf_nat_core.c          | 133 +++++++++++++++--------------------
>   3 files changed, 57 insertions(+), 80 deletions(-)
>
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index d9d52c020a70..9ae819e27940 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -17,7 +17,6 @@
>   #include <linux/bitops.h>
>   #include <linux/compiler.h>
>   #include <linux/atomic.h>
> -#include <linux/rhashtable.h>
>   
>   #include <linux/netfilter/nf_conntrack_tcp.h>
>   #include <linux/netfilter/nf_conntrack_dccp.h>
> @@ -101,7 +100,7 @@ struct nf_conn {
>   	possible_net_t ct_net;
>   
>   #if IS_ENABLED(CONFIG_NF_NAT)
> -	struct rhlist_head nat_bysource;
> +	struct hlist_node	nat_bysource;
>   #endif
>   	/* all members below initialized via memset */
>   	u8 __nfct_init_offset[0];
> diff --git a/include/net/netfilter/nf_nat.h b/include/net/netfilter/nf_nat.h
> index c327a431a6f3..02515f7ed4cc 100644
> --- a/include/net/netfilter/nf_nat.h
> +++ b/include/net/netfilter/nf_nat.h
> @@ -1,6 +1,5 @@
>   #ifndef _NF_NAT_H
>   #define _NF_NAT_H
> -#include <linux/rhashtable.h>
>   #include <linux/netfilter_ipv4.h>
>   #include <linux/netfilter/nf_nat.h>
>   #include <net/netfilter/nf_conntrack_tuple.h>
> diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
> index 2916f4815c9c..87d67e714dc4 100644
> --- a/net/netfilter/nf_nat_core.c
> +++ b/net/netfilter/nf_nat_core.c
> @@ -30,19 +30,17 @@
>   #include <net/netfilter/nf_conntrack_zones.h>
>   #include <linux/netfilter/nf_nat.h>
>   
> +static DEFINE_SPINLOCK(nf_nat_lock);
> +
>   static DEFINE_MUTEX(nf_nat_proto_mutex);
>   static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
>   						__read_mostly;
>   static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
>   						__read_mostly;
>   
> -struct nf_nat_conn_key {
> -	const struct net *net;
> -	const struct nf_conntrack_tuple *tuple;
> -	const struct nf_conntrack_zone *zone;
> -};
> -
> -static struct rhltable nf_nat_bysource_table;
> +static struct hlist_head *nf_nat_bysource __read_mostly;
> +static unsigned int nf_nat_htable_size __read_mostly;
> +static unsigned int nf_nat_hash_rnd __read_mostly;
>   
>   inline const struct nf_nat_l3proto *
>   __nf_nat_l3proto_find(u8 family)
> @@ -121,17 +119,19 @@ int nf_xfrm_me_harder(struct net *net, struct sk_buff *skb, unsigned int family)
>   EXPORT_SYMBOL(nf_xfrm_me_harder);
>   #endif /* CONFIG_XFRM */
>   
> -static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed)
> +/* We keep an extra hash for each conntrack, for fast searching. */
> +static inline unsigned int
> +hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
>   {
> -	const struct nf_conntrack_tuple *t;
> -	const struct nf_conn *ct = data;
> +	unsigned int hash;
> +
> +	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
>   
> -	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
>   	/* Original src, to ensure we map it consistently if poss. */
> +	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
> +		      tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
>   
> -	seed ^= net_hash_mix(nf_ct_net(ct));
> -	return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
> -		      t->dst.protonum ^ seed);
> +	return reciprocal_scale(hash, nf_nat_htable_size);
>   }
>   
>   /* Is this tuple already taken? (not by us) */
> @@ -187,28 +187,6 @@ same_src(const struct nf_conn *ct,
>   		t->src.u.all == tuple->src.u.all);
>   }
>   
> -static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
> -			       const void *obj)
> -{
> -	const struct nf_nat_conn_key *key = arg->key;
> -	const struct nf_conn *ct = obj;
> -
> -	if (!same_src(ct, key->tuple) ||
> -	    !net_eq(nf_ct_net(ct), key->net) ||
> -	    !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
> -		return 1;
> -
> -	return 0;
> -}
> -
> -static struct rhashtable_params nf_nat_bysource_params = {
> -	.head_offset = offsetof(struct nf_conn, nat_bysource),
> -	.obj_hashfn = nf_nat_bysource_hash,
> -	.obj_cmpfn = nf_nat_bysource_cmp,
> -	.nelem_hint = 256,
> -	.min_size = 1024,
> -};
> -
>   /* Only called for SRC manip */
>   static int
>   find_appropriate_src(struct net *net,
> @@ -219,26 +197,22 @@ find_appropriate_src(struct net *net,
>   		     struct nf_conntrack_tuple *result,
>   		     const struct nf_nat_range *range)
>   {
> +	unsigned int h = hash_by_src(net, tuple);
>   	const struct nf_conn *ct;
> -	struct nf_nat_conn_key key = {
> -		.net = net,
> -		.tuple = tuple,
> -		.zone = zone
> -	};
> -	struct rhlist_head *hl, *h;
> -
> -	hl = rhltable_lookup(&nf_nat_bysource_table, &key,
> -			     nf_nat_bysource_params);
>   
> -	rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) {
> -		nf_ct_invert_tuplepr(result,
> -				     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
> -		result->dst = tuple->dst;
> -
> -		if (in_range(l3proto, l4proto, result, range))
> -			return 1;
> +	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
> +		if (same_src(ct, tuple) &&
> +		    net_eq(net, nf_ct_net(ct)) &&
> +		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
> +			/* Copy source part from reply tuple. */
> +			nf_ct_invert_tuplepr(result,
> +				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
> +			result->dst = tuple->dst;
> +
> +			if (in_range(l3proto, l4proto, result, range))
> +				return 1;
> +		}
>   	}
> -
>   	return 0;
>   }
>   
> @@ -411,6 +385,7 @@ nf_nat_setup_info(struct nf_conn *ct,
>   		  const struct nf_nat_range *range,
>   		  enum nf_nat_manip_type maniptype)
>   {
> +	struct net *net = nf_ct_net(ct);
>   	struct nf_conntrack_tuple curr_tuple, new_tuple;
>   	struct nf_conn_nat *nat;
>   
> @@ -452,19 +427,16 @@ nf_nat_setup_info(struct nf_conn *ct,
>   	}
>   
>   	if (maniptype == NF_NAT_MANIP_SRC) {
> -		struct nf_nat_conn_key key = {
> -			.net = nf_ct_net(ct),
> -			.tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
> -			.zone = nf_ct_zone(ct),
> -		};
> -		int err;
> -
> -		err = rhltable_insert_key(&nf_nat_bysource_table,
> -					  &key,
> -					  &ct->nat_bysource,
> -					  nf_nat_bysource_params);
> -		if (err)
> -			return NF_DROP;
> +		unsigned int srchash;
> +
> +		srchash = hash_by_src(net,
> +				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
> +		spin_lock_bh(&nf_nat_lock);
> +		/* nf_conntrack_alter_reply might re-allocate extension aera */
> +		nat = nfct_nat(ct);
> +		hlist_add_head_rcu(&ct->nat_bysource,
> +				   &nf_nat_bysource[srchash]);
> +		spin_unlock_bh(&nf_nat_lock);
>   	}
>   
>   	/* It's done. */
> @@ -578,9 +550,10 @@ static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
>   	 * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
>   	 * will delete entry from already-freed table.
>   	 */
> +	spin_lock_bh(&nf_nat_lock);
> +	hlist_del_rcu(&ct->nat_bysource);
>   	ct->status &= ~IPS_NAT_DONE_MASK;
> -	rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
> -			nf_nat_bysource_params);
> +	spin_unlock_bh(&nf_nat_lock);
>   
>   	/* don't delete conntrack.  Although that would make things a lot
>   	 * simpler, we'd end up flushing all conntracks on nat rmmod.
> @@ -710,8 +683,11 @@ static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
>   	if (!nat)
>   		return;
>   
> -	rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
> -			nf_nat_bysource_params);
> +	NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
> +
> +	spin_lock_bh(&nf_nat_lock);
> +	hlist_del_rcu(&ct->nat_bysource);
> +	spin_unlock_bh(&nf_nat_lock);
>   }
>   
>   static struct nf_ct_ext_type nat_extend __read_mostly = {
> @@ -846,13 +822,16 @@ static int __init nf_nat_init(void)
>   {
>   	int ret;
>   
> -	ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
> -	if (ret)
> -		return ret;
> +	/* Leave them the same for the moment. */
> +	nf_nat_htable_size = nf_conntrack_htable_size;
> +
> +	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
> +	if (!nf_nat_bysource)
> +		return -ENOMEM;
>   
>   	ret = nf_ct_extend_register(&nat_extend);
>   	if (ret < 0) {
> -		rhltable_destroy(&nf_nat_bysource_table);
> +		nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
>   		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
>   		return ret;
>   	}
> @@ -876,7 +855,7 @@ static int __init nf_nat_init(void)
>   	return 0;
>   
>    cleanup_extend:
> -	rhltable_destroy(&nf_nat_bysource_table);
> +	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
>   	nf_ct_extend_unregister(&nat_extend);
>   	return ret;
>   }
> @@ -896,8 +875,8 @@ static void __exit nf_nat_cleanup(void)
>   
>   	for (i = 0; i < NFPROTO_NUMPROTO; i++)
>   		kfree(nf_nat_l4protos[i]);
> -
> -	rhltable_destroy(&nf_nat_bysource_table);
> +	synchronize_net();
> +	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
>   }
>   
>   MODULE_LICENSE("GPL");


-- 
Mit freundlichen Grüssen / Regards

Sebastian Gottschall / CTO

NewMedia-NET GmbH - DD-WRT
Firmensitz:  Stubenwaldallee 21a, 64625 Bensheim
Registergericht: Amtsgericht Darmstadt, HRB 25473
Geschäftsführer: Peter Steinhäuser, Christian Scheele
http://www.dd-wrt.com
email: s.gottschall@dd-wrt.com
Tel.: +496251-582650 / Fax: +496251-5826565

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-14  7:07 ` Sebastian Gottschall
@ 2017-11-14 11:05   ` Florian Westphal
  2017-11-14 11:54     ` Guillaume Nault
  2017-11-14 19:30     ` Guillaume Nault
  0 siblings, 2 replies; 14+ messages in thread
From: Florian Westphal @ 2017-11-14 11:05 UTC (permalink / raw)
  To: Sebastian Gottschall; +Cc: Florian Westphal, stable, g.nault, pablo

Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:


> this should belong to you. with the revert the kernel crashes for me
> 
> [ᅵᅵ 24.838120] Unable to handle kernel NULL pointer dereference at virtual
> address 00000055
> [ᅵᅵ 24.846193] pgd = c0004000
> [ᅵᅵ 24.848893] [00000055] *pgd=00000000
> [ᅵᅵ 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
> [ᅵᅵ 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd
> ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk
> gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common
> ath9k_hw ath mac80211 cfg80211 compat
> [ᅵᅵ 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90
> [ᅵᅵ 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
> [ᅵᅵ 24.892921] task: ef029ac0 task.stack: ef05a000
> [ᅵᅵ 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
> [ᅵᅵ 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
> [ᅵᅵ 24.907869] pc : [<c04858c8>]ᅵᅵᅵ lr : [<c04858b4>]ᅵᅵᅵ psr: 60000153
> [ᅵᅵ 24.907869] sp : ef05bb58ᅵ ip : ef05bb58ᅵ fp : ef05bb6c
> [ᅵᅵ 24.919317] r10: ed230cc0ᅵ r9 : ed230c00ᅵ r8 : edf45800
> [ᅵᅵ 24.924529] r7 : ebcd2f00ᅵ r6 : ec33739eᅵ r5 : c0892294ᅵ r4 : ebcd2f00
> [ᅵᅵ 24.931040] r3 : 00000000ᅵ r2 : 00000055ᅵ r1 : 00000000ᅵ r0 : c0892718
> [ᅵᅵ 24.937551] Flags: nZCvᅵ IRQs onᅵ FIQs offᅵ Mode SVC_32ᅵ ISA ARMᅵ Segment
> user
> [ᅵᅵ 24.944755] Control: 10c5387dᅵ Table: 2bd1006aᅵ DAC: 00000055
> [ᅵᅵ 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
> [ᅵᅵ 24.956477] Stack: (0xef05bb58 to 0xef05c000)
 
Guillaume, can you check what I missed?
IIRC you said you had a revert for your 4.9 tree, correct?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-14 11:05   ` Florian Westphal
@ 2017-11-14 11:54     ` Guillaume Nault
  2017-11-14 19:30     ` Guillaume Nault
  1 sibling, 0 replies; 14+ messages in thread
From: Guillaume Nault @ 2017-11-14 11:54 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Sebastian Gottschall, stable, pablo

On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
> 
> 
> > this should belong to you. with the revert the kernel crashes for me
> > 
> > [�� 24.838120] Unable to handle kernel NULL pointer dereference at virtual
> > address 00000055
> > [�� 24.846193] pgd = c0004000
> > [�� 24.848893] [00000055] *pgd=00000000
> > [�� 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
> > [�� 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd
> > ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk
> > gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common
> > ath9k_hw ath mac80211 cfg80211 compat
> > [�� 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90
> > [�� 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
> > [�� 24.892921] task: ef029ac0 task.stack: ef05a000
> > [�� 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
> > [�� 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
> > [�� 24.907869] pc : [<c04858c8>]��� lr : [<c04858b4>]��� psr: 60000153
> > [�� 24.907869] sp : ef05bb58� ip : ef05bb58� fp : ef05bb6c
> > [�� 24.919317] r10: ed230cc0� r9 : ed230c00� r8 : edf45800
> > [�� 24.924529] r7 : ebcd2f00� r6 : ec33739e� r5 : c0892294� r4 : ebcd2f00
> > [�� 24.931040] r3 : 00000000� r2 : 00000055� r1 : 00000000� r0 : c0892718
> > [�� 24.937551] Flags: nZCv� IRQs on� FIQs off� Mode SVC_32� ISA ARM� Segment
> > user
> > [�� 24.944755] Control: 10c5387d� Table: 2bd1006a� DAC: 00000055
> > [�� 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
> > [�� 24.956477] Stack: (0xef05bb58 to 0xef05c000)
>  
> Guillaume, can you check what I missed?
> IIRC you said you had a revert for your 4.9 tree, correct?
> 
Yes, but my backport was on 4.9.18. IIRC some other nat fixes have been
applied in the mean time.
I just arrived back home yesterday night. I'm going look at this in the
afternoon.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-14 11:05   ` Florian Westphal
  2017-11-14 11:54     ` Guillaume Nault
@ 2017-11-14 19:30     ` Guillaume Nault
  2017-11-14 22:16       ` Sebastian Gottschall
  1 sibling, 1 reply; 14+ messages in thread
From: Guillaume Nault @ 2017-11-14 19:30 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Sebastian Gottschall, stable, pablo

On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
> 
> 
> > this should belong to you. with the revert the kernel crashes for me
> > 
> > [�� 24.838120] Unable to handle kernel NULL pointer dereference at virtual
> > address 00000055
> > [�� 24.846193] pgd = c0004000
> > [�� 24.848893] [00000055] *pgd=00000000
> > [�� 24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
> > [�� 24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd
> > ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk
> > gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common
> > ath9k_hw ath mac80211 cfg80211 compat
> > [�� 24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90
> > [�� 24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
> > [�� 24.892921] task: ef029ac0 task.stack: ef05a000
> > [�� 24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
> > [�� 24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
> > [�� 24.907869] pc : [<c04858c8>]��� lr : [<c04858b4>]��� psr: 60000153
> > [�� 24.907869] sp : ef05bb58� ip : ef05bb58� fp : ef05bb6c
> > [�� 24.919317] r10: ed230cc0� r9 : ed230c00� r8 : edf45800
> > [�� 24.924529] r7 : ebcd2f00� r6 : ec33739e� r5 : c0892294� r4 : ebcd2f00
> > [�� 24.931040] r3 : 00000000� r2 : 00000055� r1 : 00000000� r0 : c0892718
> > [�� 24.937551] Flags: nZCv� IRQs on� FIQs off� Mode SVC_32� ISA ARM� Segment
> > user
> > [�� 24.944755] Control: 10c5387d� Table: 2bd1006a� DAC: 00000055
> > [�� 24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
> > [�� 24.956477] Stack: (0xef05bb58 to 0xef05c000)
>  
> Guillaume, can you check what I missed?
> IIRC you said you had a revert for your 4.9 tree, correct?
> 
Actually I had forgotten some details when I told you about it. What I
did is simpler than a revert. I just cherry-picked the following
commits:
124dffea9e8e ("netfilter: nat: use atomic bit op to clear the _SRC_NAT_DONE_BIT")
6e699867f84c ("netfilter: nat: avoid use of nf_conn_nat extension")
e1bf1687740c ("netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"")

Then I just had to fix one remaining call:
@@ -842,7 +842,7 @@ static int __init nf_nat_init(void)
        return 0;
  
  cleanup_extend:
-       rhltable_destroy(&nf_nat_bysource_table);
+       nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
        nf_ct_extend_unregister(&nat_extend);
        return ret;
 }


My original approach was to revert commit
870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable")
and fix the conflicts by hand. But I had crashes when testing the
result. After a bit of debugging, I decided to simply try cherry-picking
commit e1bf1687740c and its dependencies. Since I got better results
with this approach, I dropped my original revert code (so,
unfortunately, I can't compare with yours and I don't remember if the
crash was similar to Sebastian's).
We have the cherry-picked patches running on several machines since
September. No problem found so far.


Now, looking at Sebastian's report, I suspect that using
nf_ct_ext_find(NF_CT_EXT_NAT) in nf_nat_cleanup_conntrack() might not
be appropriate anymore. My approach uses the IPS_SRC_NAT_DONE bit
instead, which is now set atomically. Theses changes weren't
intentional, but introduced by the dependency on commits 124dffea9e8e
and 6e699867f84c.

Does that make sense to you? I can look more deeply into this tomorrow.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-14 19:30     ` Guillaume Nault
@ 2017-11-14 22:16       ` Sebastian Gottschall
  2017-11-14 22:44         ` Sebastian Gottschall
  0 siblings, 1 reply; 14+ messages in thread
From: Sebastian Gottschall @ 2017-11-14 22:16 UTC (permalink / raw)
  To: Guillaume Nault, Florian Westphal; +Cc: stable, pablo

Am 14.11.2017 um 20:30 schrieb Guillaume Nault:
> On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
>> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
>>
>>
>>> this should belong to you. with the revert the kernel crashes for me
>>>
>>> [   24.838120] Unable to handle kernel NULL pointer dereference at virtual
>>> address 00000055
>>> [   24.846193] pgd = c0004000
>>> [   24.848893] [00000055] *pgd=00000000
>>> [   24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
>>> [   24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd ohci_hcd
>>> ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk
>>> gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k ath9k_common
>>> ath9k_hw ath mac80211 cfg80211 compat
>>> [   24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 #90
>>> [   24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
>>> [   24.892921] task: ef029ac0 task.stack: ef05a000
>>> [   24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
>>> [   24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
>>> [   24.907869] pc : [<c04858c8>]    lr : [<c04858b4>]    psr: 60000153
>>> [   24.907869] sp : ef05bb58  ip : ef05bb58  fp : ef05bb6c
>>> [   24.919317] r10: ed230cc0  r9 : ed230c00  r8 : edf45800
>>> [   24.924529] r7 : ebcd2f00  r6 : ec33739e  r5 : c0892294  r4 : ebcd2f00
>>> [   24.931040] r3 : 00000000  r2 : 00000055  r1 : 00000000  r0 : c0892718
>>> [   24.937551] Flags: nZCv  IRQs on  FIQs off  Mode SVC_32  ISA ARM  Segment
>>> user
>>> [   24.944755] Control: 10c5387d  Table: 2bd1006a  DAC: 00000055
>>> [   24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
>>> [   24.956477] Stack: (0xef05bb58 to 0xef05c000)
>>   
>> Guillaume, can you check what I missed?
>> IIRC you said you had a revert for your 4.9 tree, correct?
>>
> Actually I had forgotten some details when I told you about it. What I
> did is simpler than a revert. I just cherry-picked the following
> commits:
> 124dffea9e8e ("netfilter: nat: use atomic bit op to clear the _SRC_NAT_DONE_BIT")
> 6e699867f84c ("netfilter: nat: avoid use of nf_conn_nat extension")
> e1bf1687740c ("netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"")
>
> Then I just had to fix one remaining call:
> @@ -842,7 +842,7 @@ static int __init nf_nat_init(void)
>          return 0;
>    
>    cleanup_extend:
> -       rhltable_destroy(&nf_nat_bysource_table);
> +       nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
>          nf_ct_extend_unregister(&nat_extend);
>          return ret;
>   }
>
>
> My original approach was to revert commit
> 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable")
> and fix the conflicts by hand. But I had crashes when testing the
> result. After a bit of debugging, I decided to simply try cherry-picking
> commit e1bf1687740c and its dependencies. Since I got better results
> with this approach, I dropped my original revert code (so,
> unfortunately, I can't compare with yours and I don't remember if the
> crash was similar to Sebastian's).
> We have the cherry-picked patches running on several machines since
> September. No problem found so far.
>
>
> Now, looking at Sebastian's report, I suspect that using
> nf_ct_ext_find(NF_CT_EXT_NAT) in nf_nat_cleanup_conntrack() might not
> be appropriate anymore. My approach uses the IPS_SRC_NAT_DONE bit
> instead, which is now set atomically. Theses changes weren't
> intentional, but introduced by the dependency on commits 124dffea9e8e
> and 6e699867f84c.
>
> Does that make sense to you? I can look more deeply into this tomorrow.
>
i love crashing my devices. let me test this approach


Sebastian



-- 
Mit freundlichen Grüssen / Regards

Sebastian Gottschall / CTO

NewMedia-NET GmbH - DD-WRT
Firmensitz:  Stubenwaldallee 21a, 64625 Bensheim
Registergericht: Amtsgericht Darmstadt, HRB 25473
Geschäftsführer: Peter Steinhäuser, Christian Scheele
http://www.dd-wrt.com
email: s.gottschall@dd-wrt.com
Tel.: +496251-582650 / Fax: +496251-5826565

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-14 22:16       ` Sebastian Gottschall
@ 2017-11-14 22:44         ` Sebastian Gottschall
  2017-11-15 10:13           ` Florian Westphal
  0 siblings, 1 reply; 14+ messages in thread
From: Sebastian Gottschall @ 2017-11-14 22:44 UTC (permalink / raw)
  To: Guillaume Nault, Florian Westphal; +Cc: stable, pablo, Greg Kroah-Hartman

the following patch based on the suggestion by Guillaume works good in 
my tests and does not crash anymore

--- nf_nat_core.c       (Revision 33766)
+++ nf_nat_core.c       (Arbeitskopie)
@@ -678,16 +678,11 @@
  /* No one using conntrack by the time this called. */
  static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
  {
-       struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
-
-       if (!nat)
-               return;
-
-       NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
-
-       spin_lock_bh(&nf_nat_lock);
-       hlist_del_rcu(&ct->nat_bysource);
-       spin_unlock_bh(&nf_nat_lock);
+       if (ct->status & IPS_SRC_NAT_DONE) {
+               spin_lock_bh(&nf_nat_lock);
+               hlist_del_rcu(&ct->nat_bysource);
+               spin_unlock_bh(&nf_nat_lock);
+       }
  }

  static struct nf_ct_ext_type nat_extend __read_mostly = {


Am 14.11.2017 um 23:16 schrieb Sebastian Gottschall:
> Am 14.11.2017 um 20:30 schrieb Guillaume Nault:
>> On Tue, Nov 14, 2017 at 12:05:46PM +0100, Florian Westphal wrote:
>>> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
>>>
>>>
>>>> this should belong to you. with the revert the kernel crashes for me
>>>>
>>>> [   24.838120] Unable to handle kernel NULL pointer dereference at 
>>>> virtual
>>>> address 00000055
>>>> [   24.846193] pgd = c0004000
>>>> [   24.848893] [00000055] *pgd=00000000
>>>> [   24.852472] Internal error: Oops - BUG: 817 [#1] PREEMPT SMP ARM
>>>> [   24.858463] Modules linked in: xhci_plat_hcd xhci_pci xhci_hcd 
>>>> ohci_hcd
>>>> ehci_pci ehci_platform ehci_hcd usbcore usb_common nls_base qca_ssdk
>>>> gpio_pca953x mii_gpio wil6210 ath10k_pci ath10k_core ath9k 
>>>> ath9k_common
>>>> ath9k_hw ath mac80211 cfg80211 compat
>>>> [   24.880852] CPU: 2 PID: 0 Comm: swapper/2 Not tainted 4.9.62-rc1 
>>>> #90
>>>> [   24.887189] Hardware name: AnnapurnaLabs Alpine (Device Tree)
>>>> [   24.892921] task: ef029ac0 task.stack: ef05a000
>>>> [   24.897444] PC is at nf_nat_cleanup_conntrack+0x4c/0x74
>>>> [   24.902657] LR is at nf_nat_cleanup_conntrack+0x38/0x74
>>>> [   24.907869] pc : [<c04858c8>]    lr : [<c04858b4>]    psr: 60000153
>>>> [   24.907869] sp : ef05bb58  ip : ef05bb58  fp : ef05bb6c
>>>> [   24.919317] r10: ed230cc0  r9 : ed230c00  r8 : edf45800
>>>> [   24.924529] r7 : ebcd2f00  r6 : ec33739e  r5 : c0892294 r4 : 
>>>> ebcd2f00
>>>> [   24.931040] r3 : 00000000  r2 : 00000055  r1 : 00000000 r0 : 
>>>> c0892718
>>>> [   24.937551] Flags: nZCv  IRQs on  FIQs off  Mode SVC_32 ISA ARM  
>>>> Segment
>>>> user
>>>> [   24.944755] Control: 10c5387d  Table: 2bd1006a  DAC: 00000055
>>>> [   24.950486] Process swapper/2 (pid: 0, stack limit = 0xef05a210)
>>>> [   24.956477] Stack: (0xef05bb58 to 0xef05c000)
>>>   Guillaume, can you check what I missed?
>>> IIRC you said you had a revert for your 4.9 tree, correct?
>>>
>> Actually I had forgotten some details when I told you about it. What I
>> did is simpler than a revert. I just cherry-picked the following
>> commits:
>> 124dffea9e8e ("netfilter: nat: use atomic bit op to clear the 
>> _SRC_NAT_DONE_BIT")
>> 6e699867f84c ("netfilter: nat: avoid use of nf_conn_nat extension")
>> e1bf1687740c ("netfilter: nat: Revert "netfilter: nat: convert nat 
>> bysrc hash to rhashtable"")
>>
>> Then I just had to fix one remaining call:
>> @@ -842,7 +842,7 @@ static int __init nf_nat_init(void)
>>          return 0;
>>       cleanup_extend:
>> -       rhltable_destroy(&nf_nat_bysource_table);
>> +       nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
>>          nf_ct_extend_unregister(&nat_extend);
>>          return ret;
>>   }
>>
>>
>> My original approach was to revert commit
>> 870190a9ec90 ("netfilter: nat: convert nat bysrc hash to rhashtable")
>> and fix the conflicts by hand. But I had crashes when testing the
>> result. After a bit of debugging, I decided to simply try cherry-picking
>> commit e1bf1687740c and its dependencies. Since I got better results
>> with this approach, I dropped my original revert code (so,
>> unfortunately, I can't compare with yours and I don't remember if the
>> crash was similar to Sebastian's).
>> We have the cherry-picked patches running on several machines since
>> September. No problem found so far.
>>
>>
>> Now, looking at Sebastian's report, I suspect that using
>> nf_ct_ext_find(NF_CT_EXT_NAT) in nf_nat_cleanup_conntrack() might not
>> be appropriate anymore. My approach uses the IPS_SRC_NAT_DONE bit
>> instead, which is now set atomically. Theses changes weren't
>> intentional, but introduced by the dependency on commits 124dffea9e8e
>> and 6e699867f84c.
>>
>> Does that make sense to you? I can look more deeply into this tomorrow.
>>
> i love crashing my devices. let me test this approach
>
>
> Sebastian
>
>
>

-- 
Mit freundlichen Grüssen / Regards

Sebastian Gottschall / CTO

NewMedia-NET GmbH - DD-WRT
Firmensitz:  Stubenwaldallee 21a, 64625 Bensheim
Registergericht: Amtsgericht Darmstadt, HRB 25473
Geschäftsführer: Peter Steinhäuser, Christian Scheele
http://www.dd-wrt.com
email: s.gottschall@dd-wrt.com
Tel.: +496251-582650 / Fax: +496251-5826565

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-14 22:44         ` Sebastian Gottschall
@ 2017-11-15 10:13           ` Florian Westphal
  2017-11-15 14:04             ` Greg Kroah-Hartman
  2017-11-15 14:11             ` Pablo Neira Ayuso
  0 siblings, 2 replies; 14+ messages in thread
From: Florian Westphal @ 2017-11-15 10:13 UTC (permalink / raw)
  To: Sebastian Gottschall
  Cc: Guillaume Nault, Florian Westphal, stable, pablo,
	Greg Kroah-Hartman

Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
> the following patch based on the suggestion by Guillaume works good in my
> tests and does not crash anymore
> 
> --- nf_nat_core.cᅵᅵᅵᅵᅵᅵ (Revision 33766)
> +++ nf_nat_core.cᅵᅵᅵᅵᅵᅵ (Arbeitskopie)
> @@ -678,16 +678,11 @@
> ᅵ/* No one using conntrack by the time this called. */
> ᅵstatic void nf_nat_cleanup_conntrack(struct nf_conn *ct)
> ᅵ{
> -ᅵᅵᅵᅵᅵᅵ struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
> -
> -ᅵᅵᅵᅵᅵᅵ if (!nat)
> -ᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵ return;
> -
> -ᅵᅵᅵᅵᅵᅵ NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
> -
> -ᅵᅵᅵᅵᅵᅵ spin_lock_bh(&nf_nat_lock);
> -ᅵᅵᅵᅵᅵᅵ hlist_del_rcu(&ct->nat_bysource);
> -ᅵᅵᅵᅵᅵᅵ spin_unlock_bh(&nf_nat_lock);
> +ᅵᅵᅵᅵᅵᅵ if (ct->status & IPS_SRC_NAT_DONE) {
> +ᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵ spin_lock_bh(&nf_nat_lock);
> +ᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵ hlist_del_rcu(&ct->nat_bysource);
> +ᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵᅵ spin_unlock_bh(&nf_nat_lock);
> +ᅵᅵᅵᅵᅵᅵ }
> ᅵ}

FWIW I can now reproduce the crash on 4.9-stable + backport.
This crash can be triggered by a simple

iptables -I INPUT 1 -j DROP

on first (new) incoming packet.
Problem is that as not SRC nat transformation is there for
inbound connections such conntrack isn't yet on nat_bysource list
so hlist_del_rcu() gets garbage input.

bug was added in
7c9664351980aaa6a4b8837a314360b3a4ad382a
("netfilter: move nat hlist_head to nf_conn").

... and then 'masked' by the rhashtable conversion (removing non-existing
rhashtable element has no ill effects).

The revert upstream and the 4.13.y one had no such issue because of earlier
change 6e699867f84c0f358fed233fe6162173aca28e04
("netfilter: nat: avoid use of nf_conn_nat extension"), which replaces
if (!nat) condition with the if (ct->status & IPS_SRC_NAT_DONE) test
quoted above.

I will have an updated patch shortly, I think best solution is to just
cherry-pick 6e699867f84c0f358fed233fe6162173aca28e04 in 4.9.y too
and update this backport accordingly.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-15 10:13           ` Florian Westphal
@ 2017-11-15 14:04             ` Greg Kroah-Hartman
  2017-11-15 14:16               ` Pablo Neira Ayuso
  2017-11-15 14:11             ` Pablo Neira Ayuso
  1 sibling, 1 reply; 14+ messages in thread
From: Greg Kroah-Hartman @ 2017-11-15 14:04 UTC (permalink / raw)
  To: Florian Westphal; +Cc: Sebastian Gottschall, Guillaume Nault, stable, pablo

On Wed, Nov 15, 2017 at 11:13:00AM +0100, Florian Westphal wrote:
> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
> > the following patch based on the suggestion by Guillaume works good in my
> > tests and does not crash anymore
> > 
> > --- nf_nat_core.c������ (Revision 33766)
> > +++ nf_nat_core.c������ (Arbeitskopie)
> > @@ -678,16 +678,11 @@
> > �/* No one using conntrack by the time this called. */
> > �static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
> > �{
> > -������ struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
> > -
> > -������ if (!nat)
> > -�������������� return;
> > -
> > -������ NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
> > -
> > -������ spin_lock_bh(&nf_nat_lock);
> > -������ hlist_del_rcu(&ct->nat_bysource);
> > -������ spin_unlock_bh(&nf_nat_lock);
> > +������ if (ct->status & IPS_SRC_NAT_DONE) {
> > +�������������� spin_lock_bh(&nf_nat_lock);
> > +�������������� hlist_del_rcu(&ct->nat_bysource);
> > +�������������� spin_unlock_bh(&nf_nat_lock);
> > +������ }
> > �}
> 
> FWIW I can now reproduce the crash on 4.9-stable + backport.
> This crash can be triggered by a simple
> 
> iptables -I INPUT 1 -j DROP
> 
> on first (new) incoming packet.
> Problem is that as not SRC nat transformation is there for
> inbound connections such conntrack isn't yet on nat_bysource list
> so hlist_del_rcu() gets garbage input.
> 
> bug was added in
> 7c9664351980aaa6a4b8837a314360b3a4ad382a
> ("netfilter: move nat hlist_head to nf_conn").
> 
> ... and then 'masked' by the rhashtable conversion (removing non-existing
> rhashtable element has no ill effects).
> 
> The revert upstream and the 4.13.y one had no such issue because of earlier
> change 6e699867f84c0f358fed233fe6162173aca28e04
> ("netfilter: nat: avoid use of nf_conn_nat extension"), which replaces
> if (!nat) condition with the if (ct->status & IPS_SRC_NAT_DONE) test
> quoted above.
> 
> I will have an updated patch shortly, I think best solution is to just
> cherry-pick 6e699867f84c0f358fed233fe6162173aca28e04 in 4.9.y too
> and update this backport accordingly.

Ok, let me just drop this patch, get this 4.9 release out, and then can
you all send me a patch, or series of patches, that you feel should be
applied to 4.9 to resolve this issue?

thanks,

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-15 10:13           ` Florian Westphal
  2017-11-15 14:04             ` Greg Kroah-Hartman
@ 2017-11-15 14:11             ` Pablo Neira Ayuso
  1 sibling, 0 replies; 14+ messages in thread
From: Pablo Neira Ayuso @ 2017-11-15 14:11 UTC (permalink / raw)
  To: Florian Westphal
  Cc: Sebastian Gottschall, Guillaume Nault, stable, Greg Kroah-Hartman

On Wed, Nov 15, 2017 at 11:13:00AM +0100, Florian Westphal wrote:
> Sebastian Gottschall <s.gottschall@dd-wrt.com> wrote:
> > the following patch based on the suggestion by Guillaume works good in my
> > tests and does not crash anymore
> > 
> > --- nf_nat_core.c������ (Revision 33766)
> > +++ nf_nat_core.c������ (Arbeitskopie)
> > @@ -678,16 +678,11 @@
> > �/* No one using conntrack by the time this called. */
> > �static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
> > �{
> > -������ struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
> > -
> > -������ if (!nat)
> > -�������������� return;
> > -
> > -������ NF_CT_ASSERT(ct->status & IPS_SRC_NAT_DONE);
> > -
> > -������ spin_lock_bh(&nf_nat_lock);
> > -������ hlist_del_rcu(&ct->nat_bysource);
> > -������ spin_unlock_bh(&nf_nat_lock);
> > +������ if (ct->status & IPS_SRC_NAT_DONE) {
> > +�������������� spin_lock_bh(&nf_nat_lock);
> > +�������������� hlist_del_rcu(&ct->nat_bysource);
> > +�������������� spin_unlock_bh(&nf_nat_lock);
> > +������ }
> > �}
> 
> FWIW I can now reproduce the crash on 4.9-stable + backport.
> This crash can be triggered by a simple
> 
> iptables -I INPUT 1 -j DROP
> 
> on first (new) incoming packet.
> Problem is that as not SRC nat transformation is there for
> inbound connections such conntrack isn't yet on nat_bysource list
> so hlist_del_rcu() gets garbage input.
> 
> bug was added in
> 7c9664351980aaa6a4b8837a314360b3a4ad382a
> ("netfilter: move nat hlist_head to nf_conn").
> 
> ... and then 'masked' by the rhashtable conversion (removing non-existing
> rhashtable element has no ill effects).
> 
> The revert upstream and the 4.13.y one had no such issue because of earlier
> change 6e699867f84c0f358fed233fe6162173aca28e04
> ("netfilter: nat: avoid use of nf_conn_nat extension"), which replaces
> if (!nat) condition with the if (ct->status & IPS_SRC_NAT_DONE) test
> quoted above.
> 
> I will have an updated patch shortly, I think best solution is to just
> cherry-pick 6e699867f84c0f358fed233fe6162173aca28e04 in 4.9.y too
> and update this backport accordingly.

That's fine. Let's just make sure we get them both patches in together
in the same go.

So I think it's better if we just keep back this cherry-pick until we
have the necessary backport in place.

OK?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-15 14:04             ` Greg Kroah-Hartman
@ 2017-11-15 14:16               ` Pablo Neira Ayuso
  2017-11-15 14:51                 ` Greg Kroah-Hartman
  2017-11-15 22:57                 ` Sebastian Gottschall
  0 siblings, 2 replies; 14+ messages in thread
From: Pablo Neira Ayuso @ 2017-11-15 14:16 UTC (permalink / raw)
  To: Greg Kroah-Hartman
  Cc: Florian Westphal, Sebastian Gottschall, Guillaume Nault, stable

On Wed, Nov 15, 2017 at 03:04:06PM +0100, Greg Kroah-Hartman wrote:
> Ok, let me just drop this patch, get this 4.9 release out, and then can
> you all send me a patch, or series of patches, that you feel should be
> applied to 4.9 to resolve this issue?

Please, indeed drop it for 4.9. We'll take the time here to sort out
things.

Thanks Greg.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-15 14:16               ` Pablo Neira Ayuso
@ 2017-11-15 14:51                 ` Greg Kroah-Hartman
  2017-11-15 22:57                 ` Sebastian Gottschall
  1 sibling, 0 replies; 14+ messages in thread
From: Greg Kroah-Hartman @ 2017-11-15 14:51 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: Florian Westphal, Sebastian Gottschall, Guillaume Nault, stable

On Wed, Nov 15, 2017 at 03:16:48PM +0100, Pablo Neira Ayuso wrote:
> On Wed, Nov 15, 2017 at 03:04:06PM +0100, Greg Kroah-Hartman wrote:
> > Ok, let me just drop this patch, get this 4.9 release out, and then can
> > you all send me a patch, or series of patches, that you feel should be
> > applied to 4.9 to resolve this issue?
> 
> Please, indeed drop it for 4.9. We'll take the time here to sort out
> things.

Now dropped, thanks all.

greg k-h

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
  2017-11-15 14:16               ` Pablo Neira Ayuso
  2017-11-15 14:51                 ` Greg Kroah-Hartman
@ 2017-11-15 22:57                 ` Sebastian Gottschall
  1 sibling, 0 replies; 14+ messages in thread
From: Sebastian Gottschall @ 2017-11-15 22:57 UTC (permalink / raw)
  To: Pablo Neira Ayuso, Greg Kroah-Hartman
  Cc: Florian Westphal, Guillaume Nault, stable

[-- Attachment #1: Type: text/plain, Size: 939 bytes --]

Am 15.11.2017 um 15:16 schrieb Pablo Neira Ayuso:
> On Wed, Nov 15, 2017 at 03:04:06PM +0100, Greg Kroah-Hartman wrote:
>> Ok, let me just drop this patch, get this 4.9 release out, and then can
>> you all send me a patch, or series of patches, that you feel should be
>> applied to 4.9 to resolve this issue?
> Please, indeed drop it for 4.9. We'll take the time here to sort out
> things.
>
> Thanks Greg.
>
@florian:

this is the cherry picked variant which applies clean to the current 4.9 
series. since i'm not the author, please resubmit it from your side
using the correct format

-- 
Mit freundlichen Grüssen / Regards

Sebastian Gottschall / CTO

NewMedia-NET GmbH - DD-WRT
Firmensitz:  Stubenwaldallee 21a, 64625 Bensheim
Registergericht: Amtsgericht Darmstadt, HRB 25473
Geschäftsführer: Peter Steinhäuser, Christian Scheele
http://www.dd-wrt.com
email: s.gottschall@dd-wrt.com
Tel.: +496251-582650 / Fax: +496251-5826565


[-- Attachment #2: revert_nat_bysrc_hash_to_rhashtable.patch --]
[-- Type: text/plain, Size: 8996 bytes --]

--- include/net/netfilter/nf_conntrack.h	(revision 33719)
+++ include/net/netfilter/nf_conntrack.h	(working copy)
@@ -17,7 +17,6 @@
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/atomic.h>
-#include <linux/rhashtable.h>
 
 #include <linux/netfilter/nf_conntrack_tcp.h>
 #include <linux/netfilter/nf_conntrack_dccp.h>
@@ -101,7 +100,7 @@
 	possible_net_t ct_net;
 
 #if IS_ENABLED(CONFIG_NF_NAT)
-	struct rhlist_head nat_bysource;
+	struct hlist_node	nat_bysource;
 #endif
 	/* all members below initialized via memset */
 	u8 __nfct_init_offset[0];
Index: include/net/netfilter/nf_nat.h
===================================================================
--- include/net/netfilter/nf_nat.h	(revision 33719)
+++ include/net/netfilter/nf_nat.h	(working copy)
@@ -1,6 +1,5 @@
 #ifndef _NF_NAT_H
 #define _NF_NAT_H
-#include <linux/rhashtable.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter/nf_nat.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
--- net/netfilter/nf_nat_core.c	(revision 33719)
+++ net/netfilter/nf_nat_core.c	(working copy)
@@ -30,6 +30,8 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <linux/netfilter/nf_nat.h>
 
+static DEFINE_SPINLOCK(nf_nat_lock);
+
 static DEFINE_MUTEX(nf_nat_proto_mutex);
 static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
 						__read_mostly;
@@ -36,14 +38,10 @@
 static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
 						__read_mostly;
 
-struct nf_nat_conn_key {
-	const struct net *net;
-	const struct nf_conntrack_tuple *tuple;
-	const struct nf_conntrack_zone *zone;
-};
+static struct hlist_head *nf_nat_bysource __read_mostly;
+static unsigned int nf_nat_htable_size __read_mostly;
+static unsigned int nf_nat_hash_rnd __read_mostly;
 
-static struct rhltable nf_nat_bysource_table;
-
 inline const struct nf_nat_l3proto *
 __nf_nat_l3proto_find(u8 family)
 {
@@ -121,17 +119,19 @@
 EXPORT_SYMBOL(nf_xfrm_me_harder);
 #endif /* CONFIG_XFRM */
 
-static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed)
+/* We keep an extra hash for each conntrack, for fast searching. */
+static inline unsigned int
+hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
 {
-	const struct nf_conntrack_tuple *t;
-	const struct nf_conn *ct = data;
+	unsigned int hash;
 
-	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
+	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
+
 	/* Original src, to ensure we map it consistently if poss. */
+	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
+		      tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
 
-	seed ^= net_hash_mix(nf_ct_net(ct));
-	return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
-		      t->dst.protonum ^ seed);
+	return reciprocal_scale(hash, nf_nat_htable_size);
 }
 
 /* Is this tuple already taken? (not by us) */
@@ -187,28 +187,6 @@
 		t->src.u.all == tuple->src.u.all);
 }
 
-static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
-			       const void *obj)
-{
-	const struct nf_nat_conn_key *key = arg->key;
-	const struct nf_conn *ct = obj;
-
-	if (!same_src(ct, key->tuple) ||
-	    !net_eq(nf_ct_net(ct), key->net) ||
-	    !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
-		return 1;
-
-	return 0;
-}
-
-static struct rhashtable_params nf_nat_bysource_params = {
-	.head_offset = offsetof(struct nf_conn, nat_bysource),
-	.obj_hashfn = nf_nat_bysource_hash,
-	.obj_cmpfn = nf_nat_bysource_cmp,
-	.nelem_hint = 256,
-	.min_size = 1024,
-};
-
 /* Only called for SRC manip */
 static int
 find_appropriate_src(struct net *net,
@@ -219,26 +197,22 @@
 		     struct nf_conntrack_tuple *result,
 		     const struct nf_nat_range *range)
 {
+	unsigned int h = hash_by_src(net, tuple);
 	const struct nf_conn *ct;
-	struct nf_nat_conn_key key = {
-		.net = net,
-		.tuple = tuple,
-		.zone = zone
-	};
-	struct rhlist_head *hl, *h;
 
-	hl = rhltable_lookup(&nf_nat_bysource_table, &key,
-			     nf_nat_bysource_params);
+	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
+		if (same_src(ct, tuple) &&
+		    net_eq(net, nf_ct_net(ct)) &&
+		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
+			/* Copy source part from reply tuple. */
+			nf_ct_invert_tuplepr(result,
+				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+			result->dst = tuple->dst;
 
-	rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) {
-		nf_ct_invert_tuplepr(result,
-				     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-		result->dst = tuple->dst;
-
-		if (in_range(l3proto, l4proto, result, range))
-			return 1;
+			if (in_range(l3proto, l4proto, result, range))
+				return 1;
+		}
 	}
-
 	return 0;
 }
 
@@ -411,6 +385,7 @@
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype)
 {
+	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_tuple curr_tuple, new_tuple;
 	struct nf_conn_nat *nat;
 
@@ -452,19 +427,16 @@
 	}
 
 	if (maniptype == NF_NAT_MANIP_SRC) {
-		struct nf_nat_conn_key key = {
-			.net = nf_ct_net(ct),
-			.tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
-			.zone = nf_ct_zone(ct),
-		};
-		int err;
+		unsigned int srchash;
 
-		err = rhltable_insert_key(&nf_nat_bysource_table,
-					  &key,
-					  &ct->nat_bysource,
-					  nf_nat_bysource_params);
-		if (err)
-			return NF_DROP;
+		srchash = hash_by_src(net,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		spin_lock_bh(&nf_nat_lock);
+		/* nf_conntrack_alter_reply might re-allocate extension aera */
+		nat = nfct_nat(ct);
+		hlist_add_head_rcu(&ct->nat_bysource,
+				   &nf_nat_bysource[srchash]);
+		spin_unlock_bh(&nf_nat_lock);
 	}
 
 	/* It's done. */
@@ -550,11 +522,7 @@
 static int nf_nat_proto_remove(struct nf_conn *i, void *data)
 {
 	const struct nf_nat_proto_clean *clean = data;
-	struct nf_conn_nat *nat = nfct_nat(i);
 
-	if (!nat)
-		return 0;
-
 	if ((clean->l3proto && nf_ct_l3num(i) != clean->l3proto) ||
 	    (clean->l4proto && nf_ct_protonum(i) != clean->l4proto))
 		return 0;
@@ -564,12 +532,10 @@
 
 static int nf_nat_proto_clean(struct nf_conn *ct, void *data)
 {
-	struct nf_conn_nat *nat = nfct_nat(ct);
-
 	if (nf_nat_proto_remove(ct, data))
 		return 1;
 
-	if (!nat)
+	if ((ct->status & IPS_SRC_NAT_DONE) == 0)
 		return 0;
 
 	/* This netns is being destroyed, and conntrack has nat null binding.
@@ -578,9 +544,10 @@
 	 * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
 	 * will delete entry from already-freed table.
 	 */
+	spin_lock_bh(&nf_nat_lock);
+	hlist_del_rcu(&ct->nat_bysource);
 	ct->status &= ~IPS_NAT_DONE_MASK;
-	rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
-			nf_nat_bysource_params);
+	spin_unlock_bh(&nf_nat_lock);
 
 	/* don't delete conntrack.  Although that would make things a lot
 	 * simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -705,13 +672,11 @@
 /* No one using conntrack by the time this called. */
 static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 {
-	struct nf_conn_nat *nat = nf_ct_ext_find(ct, NF_CT_EXT_NAT);
-
-	if (!nat)
-		return;
-
-	rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
-			nf_nat_bysource_params);
+	if (ct->status & IPS_SRC_NAT_DONE) {
+		spin_lock_bh(&nf_nat_lock);
+		hlist_del_rcu(&ct->nat_bysource);
+		spin_unlock_bh(&nf_nat_lock);
+	}
 }
 
 static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -846,13 +811,16 @@
 {
 	int ret;
 
-	ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
-	if (ret)
-		return ret;
+	/* Leave them the same for the moment. */
+	nf_nat_htable_size = nf_conntrack_htable_size;
 
+	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
+	if (!nf_nat_bysource)
+		return -ENOMEM;
+
 	ret = nf_ct_extend_register(&nat_extend);
 	if (ret < 0) {
-		rhltable_destroy(&nf_nat_bysource_table);
+		nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
 		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
 		return ret;
 	}
@@ -876,7 +844,7 @@
 	return 0;
 
  cleanup_extend:
-	rhltable_destroy(&nf_nat_bysource_table);
+	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
 	nf_ct_extend_unregister(&nat_extend);
 	return ret;
 }
@@ -896,8 +864,8 @@
 
 	for (i = 0; i < NFPROTO_NUMPROTO; i++)
 		kfree(nf_nat_l4protos[i]);
-
-	rhltable_destroy(&nf_nat_bysource_table);
+	synchronize_net();
+	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
 }
 
 MODULE_LICENSE("GPL");
Index: net/netfilter/nf_conntrack_core.c
===================================================================
--- net/netfilter/nf_conntrack_core.c	(revision 33719)
+++ net/netfilter/nf_conntrack_core.c	(working copy)
@@ -698,7 +698,7 @@
 
 	l4proto = __nf_ct_l4proto_find(nf_ct_l3num(ct), nf_ct_protonum(ct));
 	if (l4proto->allow_clash &&
-	    !nfct_nat(ct) &&
+	    ((ct->status & IPS_NAT_DONE_MASK) == 0) &&
 	    !nf_ct_is_dying(ct) &&
 	    atomic_inc_not_zero(&ct->ct_general.use)) {
 		nf_ct_acct_merge(ct, ctinfo, (struct nf_conn *)skb->nfct);

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2017-11-15 22:57 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-11-12 20:23 [PATCH 4.9.y] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable" Florian Westphal
2017-11-13  8:36 ` Greg KH
2017-11-14  7:07 ` Sebastian Gottschall
2017-11-14 11:05   ` Florian Westphal
2017-11-14 11:54     ` Guillaume Nault
2017-11-14 19:30     ` Guillaume Nault
2017-11-14 22:16       ` Sebastian Gottschall
2017-11-14 22:44         ` Sebastian Gottschall
2017-11-15 10:13           ` Florian Westphal
2017-11-15 14:04             ` Greg Kroah-Hartman
2017-11-15 14:16               ` Pablo Neira Ayuso
2017-11-15 14:51                 ` Greg Kroah-Hartman
2017-11-15 22:57                 ` Sebastian Gottschall
2017-11-15 14:11             ` Pablo Neira Ayuso

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).