netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Pablo Neira Ayuso <pablo@netfilter.org>
To: netfilter-devel@vger.kernel.org
Cc: davem@davemloft.net, netdev@vger.kernel.org
Subject: [PATCH 38/38] netfilter: connlimit: use rbtree for per-host conntrack obj storage
Date: Mon, 17 Mar 2014 13:42:58 +0100	[thread overview]
Message-ID: <1395060178-11833-39-git-send-email-pablo@netfilter.org> (raw)
In-Reply-To: <1395060178-11833-1-git-send-email-pablo@netfilter.org>

From: Florian Westphal <fw@strlen.de>

With current match design every invocation of the connlimit_match
function means we have to perform (number_of_conntracks % 256) lookups
in the conntrack table [ to perform GC/delete stale entries ].
This is also the reason why ____nf_conntrack_find() in perf top has
> 20% cpu time per core.

This patch changes the storage to rbtree which cuts down the number of
ct objects that need testing.

When looking up a new tuple, we only test the connections of the host
objects we visit while searching for the wanted host/network (or
the leaf we need to insert at).

The slot count is reduced to 32.  Increasing slot count doesn't
speed up things much because of rbtree nature.

before patch (50kpps rx, 10kpps tx):
+  20.95%  ksoftirqd/0  [nf_conntrack] [k] ____nf_conntrack_find
+  20.50%  ksoftirqd/1  [nf_conntrack] [k] ____nf_conntrack_find
+  20.27%  ksoftirqd/2  [nf_conntrack] [k] ____nf_conntrack_find
+   5.76%  ksoftirqd/1  [nf_conntrack] [k] hash_conntrack_raw
+   5.39%  ksoftirqd/2  [nf_conntrack] [k] hash_conntrack_raw
+   5.35%  ksoftirqd/0  [nf_conntrack] [k] hash_conntrack_raw

after (90kpps, 51kpps tx):
+  17.24%       swapper  [nf_conntrack]    [k] ____nf_conntrack_find
+   6.60%   ksoftirqd/2  [nf_conntrack]    [k] ____nf_conntrack_find
+   2.73%       swapper  [nf_conntrack]    [k] hash_conntrack_raw
+   2.36%       swapper  [xt_connlimit]    [k] count_tree

Obvious disadvantages to previous version are the increase in code
complexity and the increased memory cost.

Partially based on Eric Dumazets fq scheduler.

Reviewed-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 net/netfilter/xt_connlimit.c |  224 +++++++++++++++++++++++++++++++++---------
 1 file changed, 177 insertions(+), 47 deletions(-)

diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c
index dc5207f..458464e 100644
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -19,6 +19,7 @@
 #include <linux/jhash.h>
 #include <linux/slab.h>
 #include <linux/list.h>
+#include <linux/rbtree.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/skbuff.h>
@@ -31,8 +32,9 @@
 #include <net/netfilter/nf_conntrack_tuple.h>
 #include <net/netfilter/nf_conntrack_zones.h>
 
-#define CONNLIMIT_SLOTS	256
+#define CONNLIMIT_SLOTS		32
 #define CONNLIMIT_LOCK_SLOTS	32
+#define CONNLIMIT_GC_MAX_NODES	8
 
 /* we will save the tuples of all connections we care about */
 struct xt_connlimit_conn {
@@ -41,12 +43,20 @@ struct xt_connlimit_conn {
 	union nf_inet_addr		addr;
 };
 
+struct xt_connlimit_rb {
+	struct rb_node node;
+	struct hlist_head hhead; /* connections/hosts in same subnet */
+	union nf_inet_addr addr; /* search key */
+};
+
 struct xt_connlimit_data {
-	struct hlist_head	iphash[CONNLIMIT_SLOTS];
+	struct rb_root climit_root4[CONNLIMIT_SLOTS];
+	struct rb_root climit_root6[CONNLIMIT_SLOTS];
 	spinlock_t		locks[CONNLIMIT_LOCK_SLOTS];
 };
 
 static u_int32_t connlimit_rnd __read_mostly;
+static struct kmem_cache *connlimit_rb_cachep __read_mostly;
 static struct kmem_cache *connlimit_conn_cachep __read_mostly;
 
 static inline unsigned int connlimit_iphash(__be32 addr)
@@ -99,19 +109,33 @@ same_source_net(const union nf_inet_addr *addr,
 	}
 }
 
-static int count_hlist(struct net *net,
-		       struct hlist_head *head,
-		       const struct nf_conntrack_tuple *tuple,
-		       const union nf_inet_addr *addr,
-		       const union nf_inet_addr *mask,
-		       u_int8_t family, bool *addit)
+static bool add_hlist(struct hlist_head *head,
+		      const struct nf_conntrack_tuple *tuple,
+		      const union nf_inet_addr *addr)
+{
+	struct xt_connlimit_conn *conn;
+
+	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
+	if (conn == NULL)
+		return false;
+	conn->tuple = *tuple;
+	conn->addr = *addr;
+	hlist_add_head(&conn->node, head);
+	return true;
+}
+
+static unsigned int check_hlist(struct net *net,
+				struct hlist_head *head,
+				const struct nf_conntrack_tuple *tuple,
+				bool *addit)
 {
 	const struct nf_conntrack_tuple_hash *found;
 	struct xt_connlimit_conn *conn;
 	struct hlist_node *n;
 	struct nf_conn *found_ct;
-	int matches = 0;
+	unsigned int length = 0;
 
+	*addit = true;
 	rcu_read_lock();
 
 	/* check the saved connections */
@@ -144,30 +168,114 @@ static int count_hlist(struct net *net,
 			continue;
 		}
 
-		if (same_source_net(addr, mask, &conn->addr, family) == 0)
-			/* same source network -> be counted! */
-			++matches;
 		nf_ct_put(found_ct);
+		length++;
 	}
 
 	rcu_read_unlock();
 
-	return matches;
+	return length;
 }
 
-static bool add_hlist(struct hlist_head *head,
-		      const struct nf_conntrack_tuple *tuple,
-		      const union nf_inet_addr *addr)
+static void tree_nodes_free(struct rb_root *root,
+			    struct xt_connlimit_rb *gc_nodes[],
+			    unsigned int gc_count)
+{
+	struct xt_connlimit_rb *rbconn;
+
+	while (gc_count) {
+		rbconn = gc_nodes[--gc_count];
+		rb_erase(&rbconn->node, root);
+		kmem_cache_free(connlimit_rb_cachep, rbconn);
+	}
+}
+
+static unsigned int
+count_tree(struct net *net, struct rb_root *root,
+	   const struct nf_conntrack_tuple *tuple,
+	   const union nf_inet_addr *addr, const union nf_inet_addr *mask,
+	   u8 family)
 {
+	struct xt_connlimit_rb *gc_nodes[CONNLIMIT_GC_MAX_NODES];
+	struct rb_node **rbnode, *parent;
+	struct xt_connlimit_rb *rbconn;
 	struct xt_connlimit_conn *conn;
+	unsigned int gc_count;
+	bool no_gc = false;
+
+ restart:
+	gc_count = 0;
+	parent = NULL;
+	rbnode = &(root->rb_node);
+	while (*rbnode) {
+		int diff;
+		bool addit;
+
+		rbconn = container_of(*rbnode, struct xt_connlimit_rb, node);
+
+		parent = *rbnode;
+		diff = same_source_net(addr, mask, &rbconn->addr, family);
+		if (diff < 0) {
+			rbnode = &((*rbnode)->rb_left);
+		} else if (diff > 0) {
+			rbnode = &((*rbnode)->rb_right);
+		} else {
+			/* same source network -> be counted! */
+			unsigned int count;
+			count = check_hlist(net, &rbconn->hhead, tuple, &addit);
+
+			tree_nodes_free(root, gc_nodes, gc_count);
+			if (!addit)
+				return count;
+
+			if (!add_hlist(&rbconn->hhead, tuple, addr))
+				return 0; /* hotdrop */
+
+			return count + 1;
+		}
+
+		if (no_gc || gc_count >= ARRAY_SIZE(gc_nodes))
+			continue;
+
+		/* only used for GC on hhead, retval and 'addit' ignored */
+		check_hlist(net, &rbconn->hhead, tuple, &addit);
+		if (hlist_empty(&rbconn->hhead))
+			gc_nodes[gc_count++] = rbconn;
+	}
+
+	if (gc_count) {
+		no_gc = true;
+		tree_nodes_free(root, gc_nodes, gc_count);
+		/* tree_node_free before new allocation permits
+		 * allocator to re-use newly free'd object.
+		 *
+		 * This is a rare event; in most cases we will find
+		 * existing node to re-use. (or gc_count is 0).
+		 */
+		goto restart;
+	}
+
+	/* no match, need to insert new node */
+	rbconn = kmem_cache_alloc(connlimit_rb_cachep, GFP_ATOMIC);
+	if (rbconn == NULL)
+		return 0;
 
 	conn = kmem_cache_alloc(connlimit_conn_cachep, GFP_ATOMIC);
-	if (conn == NULL)
-		return false;
+	if (conn == NULL) {
+		kmem_cache_free(connlimit_rb_cachep, rbconn);
+		return 0;
+	}
+
 	conn->tuple = *tuple;
 	conn->addr = *addr;
-	hlist_add_head(&conn->node, head);
-	return true;
+	rbconn->addr = *addr;
+
+	INIT_HLIST_HEAD(&rbconn->hhead);
+	hlist_add_head(&conn->node, &rbconn->hhead);
+
+	rb_link_node(&rbconn->node, parent, rbnode);
+	rb_insert_color(&rbconn->node, root);
+	return 1;
 }
 
 static int count_them(struct net *net,
@@ -177,26 +285,22 @@ static int count_them(struct net *net,
 		      const union nf_inet_addr *mask,
 		      u_int8_t family)
 {
-	struct hlist_head *hhead;
+	struct rb_root *root;
 	int count;
 	u32 hash;
-	bool addit = true;
 
-	if (family == NFPROTO_IPV6)
+	if (family == NFPROTO_IPV6) {
 		hash = connlimit_iphash6(addr, mask);
-	else
+		root = &data->climit_root6[hash];
+	} else {
 		hash = connlimit_iphash(addr->ip & mask->ip);
-
-	hhead = &data->iphash[hash];
+		root = &data->climit_root4[hash];
+	}
 
 	spin_lock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]);
-	count = count_hlist(net, hhead, tuple, addr, mask, family, &addit);
-	if (addit) {
-		if (add_hlist(hhead, tuple, addr))
-			count++;
-		else
-			count = -ENOMEM;
-	}
+
+	count = count_tree(net, root, tuple, addr, mask, family);
+
 	spin_unlock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]);
 
 	return count;
@@ -212,7 +316,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	const struct nf_conntrack_tuple *tuple_ptr = &tuple;
 	enum ip_conntrack_info ctinfo;
 	const struct nf_conn *ct;
-	int connections;
+	unsigned int connections;
 
 	ct = nf_ct_get(skb, &ctinfo);
 	if (ct != NULL)
@@ -233,7 +337,7 @@ connlimit_mt(const struct sk_buff *skb, struct xt_action_param *par)
 
 	connections = count_them(net, info->data, tuple_ptr, &addr,
 	                         &info->mask, par->family);
-	if (connections < 0)
+	if (connections == 0)
 		/* kmalloc failed, drop it entirely */
 		goto hotdrop;
 
@@ -276,28 +380,44 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
 	for (i = 0; i < ARRAY_SIZE(info->data->locks); ++i)
 		spin_lock_init(&info->data->locks[i]);
 
-	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i)
-		INIT_HLIST_HEAD(&info->data->iphash[i]);
+	for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
+		info->data->climit_root4[i] = RB_ROOT;
+	for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
+		info->data->climit_root6[i] = RB_ROOT;
 
 	return 0;
 }
 
-static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
+static void destroy_tree(struct rb_root *r)
 {
-	const struct xt_connlimit_info *info = par->matchinfo;
 	struct xt_connlimit_conn *conn;
+	struct xt_connlimit_rb *rbconn;
 	struct hlist_node *n;
-	struct hlist_head *hash = info->data->iphash;
-	unsigned int i;
+	struct rb_node *node;
 
-	nf_ct_l3proto_module_put(par->family);
+	while ((node = rb_first(r)) != NULL) {
+		rbconn = container_of(node, struct xt_connlimit_rb, node);
 
-	for (i = 0; i < ARRAY_SIZE(info->data->iphash); ++i) {
-		hlist_for_each_entry_safe(conn, n, &hash[i], node) {
-			hlist_del(&conn->node);
+		rb_erase(node, r);
+
+		hlist_for_each_entry_safe(conn, n, &rbconn->hhead, node)
 			kmem_cache_free(connlimit_conn_cachep, conn);
-		}
+
+		kmem_cache_free(connlimit_rb_cachep, rbconn);
 	}
+}
+
+static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
+{
+	const struct xt_connlimit_info *info = par->matchinfo;
+	unsigned int i;
+
+	nf_ct_l3proto_module_put(par->family);
+
+	for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i)
+		destroy_tree(&info->data->climit_root4[i]);
+	for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
+		destroy_tree(&info->data->climit_root6[i]);
 
 	kfree(info->data);
 }
@@ -326,9 +446,18 @@ static int __init connlimit_mt_init(void)
 	if (!connlimit_conn_cachep)
 		return -ENOMEM;
 
+	connlimit_rb_cachep = kmem_cache_create("xt_connlimit_rb",
+					   sizeof(struct xt_connlimit_rb),
+					   0, 0, NULL);
+	if (!connlimit_rb_cachep) {
+		kmem_cache_destroy(connlimit_conn_cachep);
+		return -ENOMEM;
+	}
 	ret = xt_register_match(&connlimit_mt_reg);
-	if (ret != 0)
+	if (ret != 0) {
 		kmem_cache_destroy(connlimit_conn_cachep);
+		kmem_cache_destroy(connlimit_rb_cachep);
+	}
 	return ret;
 }
 
@@ -336,6 +465,7 @@ static void __exit connlimit_mt_exit(void)
 {
 	xt_unregister_match(&connlimit_mt_reg);
 	kmem_cache_destroy(connlimit_conn_cachep);
+	kmem_cache_destroy(connlimit_rb_cachep);
 }
 
 module_init(connlimit_mt_init);
-- 
1.7.10.4


  parent reply	other threads:[~2014-03-17 12:42 UTC|newest]

Thread overview: 47+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2014-03-17 12:42 [PATCH 00/38] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 01/38] netfilter: remove double colon Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 02/38] netfilter: xt_ipcomp: Use ntohs to ease sparse warning Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 03/38] netfilter: nft_ct: labels get support Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 04/38] netfilter: ip_set: rename nfnl_dereference()/nfnl_set() Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 05/38] netfilter: nfnetlink: add rcu_dereference_protected() helpers Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 06/38] netfilter: nf_tables: add nft_dereference() macro Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 07/38] netfilter: nf_tables: accept QUEUE/DROP verdict parameters Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 08/38] netfilter: nfnetlink_log: remove unused code Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 09/38] netfilter: nf_tables: add optional user data area to rules Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 10/38] netfilter: ipset: Follow manual page behavior for SET target on list:set Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 11/38] netfilter: ipset: Add hash: fix coccinelle warnings Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 12/38] netfilter: ipset: add hash:ip,mark data type to ipset Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 13/38] netfilter: ipset: add markmask for hash:ip,mark data type Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 14/38] netfilter: ipset: Prepare the kernel for create option flags when no extension is needed Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 15/38] netfilter: ipset: kernel: uapi: fix MARKMASK attr ABI breakage Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 16/38] netfilter: ipset: move registration message to init from net_init Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 17/38] netfilter: ipset: add forceadd kernel support for hash set types Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 18/38] sections, ipvs: Remove useless __read_mostly for ipvs genl_ops Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 19/38] ipvs: Reduce checkpatch noise in ip_vs_lblc.c Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 20/38] netfilter: trivial code cleanup and doc changes Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 21/38] netfilter: conntrack: spinlock per cpu to protect special lists Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 22/38] netfilter: avoid race with exp->master ct Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 23/38] netfilter: conntrack: seperate expect locking from nf_conntrack_lock Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 24/38] netfilter: conntrack: remove central spinlock nf_conntrack_lock Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 25/38] netfilter: nft_hash: bug fixes and resizing Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 26/38] netfilter: nf_tables: clean up nf_tables_trans_add() argument order Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 27/38] netfilter: nf_tables: restore context for expression destructors Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 28/38] netfilter: nf_tables: restore notifications for anonymous set destruction Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 29/38] netfilter: nft_ct: remove family from struct nft_ct Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 30/38] netfilter: nft_nat: fix family validation Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 31/38] netfilter: connlimit: factor hlist search into new function Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 32/38] netfilter: connlimit: improve packet-to-closed-connection logic Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 33/38] netfilter: connlimit: move insertion of new element out of count function Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 34/38] netfilter: connlimit: use kmem_cache for conn objects Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 35/38] netfilter: Convert uses of __constant_<foo> to <foo> Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 36/38] netfilter: connlimit: use keyed locks Pablo Neira Ayuso
2014-03-17 12:54   ` David Laight
2014-03-17 14:26     ` Florian Westphal
2014-03-17 14:40       ` David Laight
2014-03-17 14:00   ` Eric Dumazet
2014-03-17 14:23     ` Florian Westphal
2014-03-18 13:46     ` Jesper Dangaard Brouer
2014-03-18 14:01       ` Eric Dumazet
2014-03-17 12:42 ` [PATCH 37/38] netfilter: connlimit: make same_source_net signed Pablo Neira Ayuso
2014-03-17 12:42 ` Pablo Neira Ayuso [this message]
2014-03-17 19:19 ` [PATCH 00/38] Netfilter/IPVS updates for net-next David Miller

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=1395060178-11833-39-git-send-email-pablo@netfilter.org \
    --to=pablo@netfilter.org \
    --cc=davem@davemloft.net \
    --cc=netdev@vger.kernel.org \
    --cc=netfilter-devel@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).