All of lore.kernel.org
 help / color / mirror / Atom feed
From: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
To: linux-kernel@vger.kernel.org
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>,
	stable@vger.kernel.org, Ivan Babrou <ibobrik@gmail.com>,
	Florian Westphal <fw@strlen.de>,
	Pablo Neira Ayuso <pablo@netfilter.org>
Subject: [PATCH 4.9 34/39] netfilter: nat: Revert "netfilter: nat: convert nat bysrc hash to rhashtable"
Date: Thu, 16 Nov 2017 18:42:53 +0100	[thread overview]
Message-ID: <20171116174214.695880582@linuxfoundation.org> (raw)
In-Reply-To: <20171116174213.321860523@linuxfoundation.org>

4.9-stable review patch.  If anyone has any objections, please let me know.

------------------

From: Florian Westphal <fw@strlen.de>

commit e1bf1687740ce1a3598a1c5e452b852ff2190682 upstream.

This reverts commit 870190a9ec9075205c0fa795a09fa931694a3ff1.

It was not a good idea. The custom hash table was a much better
fit for this purpose.

A fast lookup is not essential, in fact for most cases there is no lookup
at all because original tuple is not taken and can be used as-is.
What needs to be fast is insertion and deletion.

rhlist removal however requires a rhlist walk.
We can have thousands of entries in such a list if source port/addresses
are reused for multiple flows, if this happens removal requests are so
expensive that deletions of a few thousand flows can take several
seconds(!).

The advantages that we got from rhashtable are:
1) table auto-sizing
2) multiple locks

1) would be nice to have, but it is not essential as we have at
most one lookup per new flow, so even a million flows in the bysource
table are not a problem compared to current deletion cost.
2) is easy to add to custom hash table.

I tried to add hlist_node to rhlist to speed up rhltable_remove but this
isn't doable without changing semantics.  rhltable_remove_fast will
check that the to-be-deleted object is part of the table and that
requires a list walk that we want to avoid.

Furthermore, using hlist_node increases size of struct rhlist_head, which
in turn increases nf_conn size.

Link: https://bugzilla.kernel.org/show_bug.cgi?id=196821
Reported-by: Ivan Babrou <ibobrik@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 include/net/netfilter/nf_conntrack.h |    3 
 include/net/netfilter/nf_nat.h       |    1 
 net/netfilter/nf_nat_core.c          |  132 ++++++++++++++---------------------
 3 files changed, 56 insertions(+), 80 deletions(-)

--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -17,7 +17,6 @@
 #include <linux/bitops.h>
 #include <linux/compiler.h>
 #include <linux/atomic.h>
-#include <linux/rhashtable.h>
 
 #include <linux/netfilter/nf_conntrack_tcp.h>
 #include <linux/netfilter/nf_conntrack_dccp.h>
@@ -101,7 +100,7 @@ struct nf_conn {
 	possible_net_t ct_net;
 
 #if IS_ENABLED(CONFIG_NF_NAT)
-	struct rhlist_head nat_bysource;
+	struct hlist_node	nat_bysource;
 #endif
 	/* all members below initialized via memset */
 	u8 __nfct_init_offset[0];
--- a/include/net/netfilter/nf_nat.h
+++ b/include/net/netfilter/nf_nat.h
@@ -1,6 +1,5 @@
 #ifndef _NF_NAT_H
 #define _NF_NAT_H
-#include <linux/rhashtable.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/netfilter/nf_nat.h>
 #include <net/netfilter/nf_conntrack_tuple.h>
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -30,19 +30,17 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <linux/netfilter/nf_nat.h>
 
+static DEFINE_SPINLOCK(nf_nat_lock);
+
 static DEFINE_MUTEX(nf_nat_proto_mutex);
 static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
 						__read_mostly;
 static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
 						__read_mostly;
 
-struct nf_nat_conn_key {
-	const struct net *net;
-	const struct nf_conntrack_tuple *tuple;
-	const struct nf_conntrack_zone *zone;
-};
-
-static struct rhltable nf_nat_bysource_table;
+static struct hlist_head *nf_nat_bysource __read_mostly;
+static unsigned int nf_nat_htable_size __read_mostly;
+static unsigned int nf_nat_hash_rnd __read_mostly;
 
 inline const struct nf_nat_l3proto *
 __nf_nat_l3proto_find(u8 family)
@@ -121,17 +119,19 @@ int nf_xfrm_me_harder(struct net *net, s
 EXPORT_SYMBOL(nf_xfrm_me_harder);
 #endif /* CONFIG_XFRM */
 
-static u32 nf_nat_bysource_hash(const void *data, u32 len, u32 seed)
+/* We keep an extra hash for each conntrack, for fast searching. */
+static inline unsigned int
+hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
 {
-	const struct nf_conntrack_tuple *t;
-	const struct nf_conn *ct = data;
+	unsigned int hash;
+
+	get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
 
-	t = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple;
 	/* Original src, to ensure we map it consistently if poss. */
+	hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
+		      tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
 
-	seed ^= net_hash_mix(nf_ct_net(ct));
-	return jhash2((const u32 *)&t->src, sizeof(t->src) / sizeof(u32),
-		      t->dst.protonum ^ seed);
+	return reciprocal_scale(hash, nf_nat_htable_size);
 }
 
 /* Is this tuple already taken? (not by us) */
@@ -187,28 +187,6 @@ same_src(const struct nf_conn *ct,
 		t->src.u.all == tuple->src.u.all);
 }
 
-static int nf_nat_bysource_cmp(struct rhashtable_compare_arg *arg,
-			       const void *obj)
-{
-	const struct nf_nat_conn_key *key = arg->key;
-	const struct nf_conn *ct = obj;
-
-	if (!same_src(ct, key->tuple) ||
-	    !net_eq(nf_ct_net(ct), key->net) ||
-	    !nf_ct_zone_equal(ct, key->zone, IP_CT_DIR_ORIGINAL))
-		return 1;
-
-	return 0;
-}
-
-static struct rhashtable_params nf_nat_bysource_params = {
-	.head_offset = offsetof(struct nf_conn, nat_bysource),
-	.obj_hashfn = nf_nat_bysource_hash,
-	.obj_cmpfn = nf_nat_bysource_cmp,
-	.nelem_hint = 256,
-	.min_size = 1024,
-};
-
 /* Only called for SRC manip */
 static int
 find_appropriate_src(struct net *net,
@@ -219,26 +197,22 @@ find_appropriate_src(struct net *net,
 		     struct nf_conntrack_tuple *result,
 		     const struct nf_nat_range *range)
 {
+	unsigned int h = hash_by_src(net, tuple);
 	const struct nf_conn *ct;
-	struct nf_nat_conn_key key = {
-		.net = net,
-		.tuple = tuple,
-		.zone = zone
-	};
-	struct rhlist_head *hl, *h;
-
-	hl = rhltable_lookup(&nf_nat_bysource_table, &key,
-			     nf_nat_bysource_params);
 
-	rhl_for_each_entry_rcu(ct, h, hl, nat_bysource) {
-		nf_ct_invert_tuplepr(result,
-				     &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
-		result->dst = tuple->dst;
+	hlist_for_each_entry_rcu(ct, &nf_nat_bysource[h], nat_bysource) {
+		if (same_src(ct, tuple) &&
+		    net_eq(net, nf_ct_net(ct)) &&
+		    nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
+			/* Copy source part from reply tuple. */
+			nf_ct_invert_tuplepr(result,
+				       &ct->tuplehash[IP_CT_DIR_REPLY].tuple);
+			result->dst = tuple->dst;
 
-		if (in_range(l3proto, l4proto, result, range))
-			return 1;
+			if (in_range(l3proto, l4proto, result, range))
+				return 1;
+		}
 	}
-
 	return 0;
 }
 
@@ -411,6 +385,7 @@ nf_nat_setup_info(struct nf_conn *ct,
 		  const struct nf_nat_range *range,
 		  enum nf_nat_manip_type maniptype)
 {
+	struct net *net = nf_ct_net(ct);
 	struct nf_conntrack_tuple curr_tuple, new_tuple;
 	struct nf_conn_nat *nat;
 
@@ -452,19 +427,16 @@ nf_nat_setup_info(struct nf_conn *ct,
 	}
 
 	if (maniptype == NF_NAT_MANIP_SRC) {
-		struct nf_nat_conn_key key = {
-			.net = nf_ct_net(ct),
-			.tuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
-			.zone = nf_ct_zone(ct),
-		};
-		int err;
-
-		err = rhltable_insert_key(&nf_nat_bysource_table,
-					  &key,
-					  &ct->nat_bysource,
-					  nf_nat_bysource_params);
-		if (err)
-			return NF_DROP;
+		unsigned int srchash;
+
+		srchash = hash_by_src(net,
+				      &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
+		spin_lock_bh(&nf_nat_lock);
+		/* nf_conntrack_alter_reply might re-allocate extension aera */
+		nat = nfct_nat(ct);
+		hlist_add_head_rcu(&ct->nat_bysource,
+				   &nf_nat_bysource[srchash]);
+		spin_unlock_bh(&nf_nat_lock);
 	}
 
 	/* It's done. */
@@ -572,9 +544,10 @@ static int nf_nat_proto_clean(struct nf_
 	 * Else, when the conntrack is destoyed, nf_nat_cleanup_conntrack()
 	 * will delete entry from already-freed table.
 	 */
+	spin_lock_bh(&nf_nat_lock);
+	hlist_del_rcu(&ct->nat_bysource);
 	ct->status &= ~IPS_NAT_DONE_MASK;
-	rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
-			nf_nat_bysource_params);
+	spin_unlock_bh(&nf_nat_lock);
 
 	/* don't delete conntrack.  Although that would make things a lot
 	 * simpler, we'd end up flushing all conntracks on nat rmmod.
@@ -699,9 +672,11 @@ EXPORT_SYMBOL_GPL(nf_nat_l3proto_unregis
 /* No one using conntrack by the time this called. */
 static void nf_nat_cleanup_conntrack(struct nf_conn *ct)
 {
-	if (ct->status & IPS_SRC_NAT_DONE)
-		rhltable_remove(&nf_nat_bysource_table, &ct->nat_bysource,
-				nf_nat_bysource_params);
+	if (ct->status & IPS_SRC_NAT_DONE) {
+		spin_lock_bh(&nf_nat_lock);
+		hlist_del_rcu(&ct->nat_bysource);
+		spin_unlock_bh(&nf_nat_lock);
+	}
 }
 
 static struct nf_ct_ext_type nat_extend __read_mostly = {
@@ -836,13 +811,16 @@ static int __init nf_nat_init(void)
 {
 	int ret;
 
-	ret = rhltable_init(&nf_nat_bysource_table, &nf_nat_bysource_params);
-	if (ret)
-		return ret;
+	/* Leave them the same for the moment. */
+	nf_nat_htable_size = nf_conntrack_htable_size;
+
+	nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
+	if (!nf_nat_bysource)
+		return -ENOMEM;
 
 	ret = nf_ct_extend_register(&nat_extend);
 	if (ret < 0) {
-		rhltable_destroy(&nf_nat_bysource_table);
+		nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
 		printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
 		return ret;
 	}
@@ -866,7 +844,7 @@ static int __init nf_nat_init(void)
 	return 0;
 
  cleanup_extend:
-	rhltable_destroy(&nf_nat_bysource_table);
+	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
 	nf_ct_extend_unregister(&nat_extend);
 	return ret;
 }
@@ -886,8 +864,8 @@ static void __exit nf_nat_cleanup(void)
 
 	for (i = 0; i < NFPROTO_NUMPROTO; i++)
 		kfree(nf_nat_l4protos[i]);
-
-	rhltable_destroy(&nf_nat_bysource_table);
+	synchronize_net();
+	nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
 }
 
 MODULE_LICENSE("GPL");

  parent reply	other threads:[~2017-11-16 18:05 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-11-16 17:42 [PATCH 4.9 00/39] 4.9.63-stable review Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 01/39] gso: fix payload length when gso_size is zero Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 02/39] tun/tap: sanitize TUNSETSNDBUF input Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 03/39] ipv6: addrconf: increment ifp refcount before ipv6_del_addr() Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 04/39] netlink: do not set cb_running if dumps start() errs Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 05/39] net: call cgroup_sk_alloc() earlier in sk_clone_lock() Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 06/39] tcp: fix tcp_mtu_probe() vs highest_sack Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 07/39] l2tp: check ps->sock before running pppol2tp_session_ioctl() Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 08/39] tun: call dev_get_valid_name() before register_netdevice() Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 09/39] sctp: add the missing sock_owned_by_user check in sctp_icmp_redirect Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 10/39] tcp/dccp: fix ireq->opt races Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 11/39] packet: avoid panic in packet_getsockopt() Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 12/39] soreuseport: fix initialization race Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 13/39] ipv6: flowlabel: do not leave opt->tot_len with garbage Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 14/39] sctp: full support for ipv6 ip_nonlocal_bind & IP_FREEBIND Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 15/39] tcp/dccp: fix lockdep splat in inet_csk_route_req() Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 17/39] net/unix: dont show information about sockets from other namespaces Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 18/39] tap: double-free in error path in tap_open() Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 19/39] ipip: only increase err_count for some certain type icmp in ipip_err Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 20/39] ip6_gre: only increase err_count for some certain type icmpv6 in ip6gre_err Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 21/39] ip6_gre: update dst pmtu if dev mtu has been updated by toobig in __gre6_xmit Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 22/39] tun: allow positive return values on dev_get_valid_name() call Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 23/39] sctp: reset owner sk for data chunks on out queues when migrating a sock Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 24/39] net_sched: avoid matching qdisc with zero handle Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 25/39] ppp: fix race in ppp device destruction Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 26/39] mac80211: accept key reinstall without changing anything Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 27/39] mac80211: use constant time comparison with keys Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 28/39] mac80211: dont compare TKIP TX MIC key in reinstall prevention Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 29/39] usb: usbtest: fix NULL pointer dereference Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 30/39] Input: ims-psu - check if CDC union descriptor is sane Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 31/39] ALSA: seq: Cancel pending autoload work at unbinding device Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 33/39] netfilter: nat: avoid use of nf_conn_nat extension Greg Kroah-Hartman
2017-11-16 17:42 ` Greg Kroah-Hartman [this message]
2017-11-16 17:42 ` [PATCH 4.9 35/39] security/keys: add CONFIG_KEYS_COMPAT to Kconfig Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 36/39] brcmfmac: remove setting IBSS mode when stopping AP Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 37/39] target/iscsi: Fix iSCSI task reassignment handling Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 38/39] qla2xxx: Fix incorrect tcm_qla2xxx_free_cmd use during TMR ABORT (v2) Greg Kroah-Hartman
2017-11-16 17:42 ` [PATCH 4.9 39/39] misc: panel: properly restore atomic counter on error path Greg Kroah-Hartman
2017-11-16 22:44 ` [PATCH 4.9 00/39] 4.9.63-stable review Shuah Khan
2017-11-17  2:02 ` Guenter Roeck
2017-11-17 10:34 ` Naresh Kamboju

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20171116174214.695880582@linuxfoundation.org \
    --to=gregkh@linuxfoundation.org \
    --cc=fw@strlen.de \
    --cc=ibobrik@gmail.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=pablo@netfilter.org \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.