From: Pablo Neira Ayuso <pablo@netfilter.org>
To: netfilter-devel@vger.kernel.org
Cc: davem@davemloft.net, netdev@vger.kernel.org
Subject: [PATCH 25/38] netfilter: nft_hash: bug fixes and resizing
Date: Mon, 17 Mar 2014 13:42:45 +0100
Message-ID: <1395060178-11833-26-git-send-email-pablo@netfilter.org>
In-Reply-To: <1395060178-11833-1-git-send-email-pablo@netfilter.org>
From: Patrick McHardy <kaber@trash.net>
The hash set type is very broken and was never meant to be merged in this
state. Missing RCU synchronization on element removal, leaking chain
refcounts when used as a verdict map, races during lookups and a fixed table
size are probably just some of the problems. Luckily it is currently
never chosen by the kernel when the rbtree type is also available.
Rewrite it to be usable.
The new implementation supports automatic hash table resizing using RCU,
based on Paul McKenney's and Josh Triplett's algorithm "Optimized Resizing
For RCU-Protected Hash Tables" described in [1].
Resizing doesn't require a second list head in the elements; it works by
choosing a hash function that remaps elements to a predictable set of buckets,
only resizing by integral factors and
- during expansion: linking new buckets to the old bucket that contains
elements for any of the new buckets, thereby creating imprecise chains,
then incrementally separating the elements until the new buckets only
contain elements that hash directly to them.
- during shrinking: linking the hash chains of all old buckets that hash
to the same new bucket to form a single chain.
Expansion requires at most the number of elements in the longest hash chain
grace periods, shrinking requires a single grace period.
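The "predictable set of buckets" property is easy to check in isolation. The
following is a minimal userspace sketch, not the kernel code: toy_hash() is a
stand-in for jhash() and all names are made up. With power-of-two sizes and
mask-based bucket selection, an element in old bucket b can only land in
bucket b or b + old_size when the table doubles, and buckets b and
b + new_size fold back into bucket b when it halves.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uint32_t toy_hash(uint32_t key)
{
        key ^= key >> 16;
        key *= 0x45d9f3bu;
        key ^= key >> 16;
        return key;
}

static uint32_t bucket(uint32_t key, uint32_t size)
{
        return toy_hash(key) & (size - 1);      /* mirrors h & (hsize - 1) */
}

int main(void)
{
        uint32_t old_size = 8, new_size = 16, key;

        for (key = 0; key < 100000; key++) {
                uint32_t b_old = bucket(key, old_size);
                uint32_t b_new = bucket(key, new_size);

                /* expansion: element moves to b_old or b_old + old_size */
                assert(b_new == b_old || b_new == b_old + old_size);
                /* shrinking: buckets b_old and b_old + old_size merge */
                assert((b_new & (old_size - 1)) == b_old);
        }
        printf("remapping property holds\n");
        return 0;
}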
Due to the requirement of having hash chains/elements linked to multiple
buckets during resizing, homemade singly linked lists are used instead of
the existing list helpers, which don't support this in a clean fashion.
As a side effect, the amount of memory required per element is reduced by
one pointer.
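A rough userspace approximation of the two element layouts makes the saving
visible. The struct names below are illustrative, not the kernel's; on a
64-bit build this prints 24 vs. 16 bytes.

#include <stdio.h>

struct hlist_node_like {                /* two pointers, as in struct hlist_node */
        struct hlist_node_like *next, **pprev;
};

struct elem_hlist {                     /* old-style element linkage */
        struct hlist_node_like hnode;
        unsigned long key;
};

struct elem_single {                    /* new-style element linkage */
        struct elem_single *next;
        unsigned long key;
};

int main(void)
{
        printf("hlist-linked element:  %zu bytes\n", sizeof(struct elem_hlist));
        printf("singly linked element: %zu bytes\n", sizeof(struct elem_single));
        return 0;
}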
Expansion is triggered when the load factor exceeds 75%, shrinking when
the load factor goes below 30%. Both operations are allowed to fail and
will be retried on the next insertion or removal if their respective
conditions still hold.
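In integer arithmetic the two triggers come out as in the small sketch below.
The helper names are illustrative; the expressions mirror the insert and
remove paths of the patch.

#include <stdbool.h>
#include <stdio.h>

#define MIN_SIZE 4      /* mirrors NFT_HASH_MIN_SIZE */

static bool should_expand(unsigned int elements, unsigned int size)
{
        return elements > size / 4 * 3;         /* load > 75% */
}

static bool should_shrink(unsigned int elements, unsigned int size)
{
        return elements < size * 3 / 10 &&      /* load < 30% */
               size > MIN_SIZE;
}

int main(void)
{
        printf("%d\n", should_expand(7, 8));    /* 7 > 6 -> 1, table doubles */
        printf("%d\n", should_shrink(2, 8));    /* 2 < 2 is false -> 0 */
        printf("%d\n", should_shrink(1, 8));    /* 1 < 2 -> 1, table halves */
        return 0;
}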
[1] http://dl.acm.org/citation.cfm?id=2002181.2002192
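For readers who want to see the expansion step end to end, the following is a
single-threaded userspace model of it: a toy hash instead of jhash(), plain
pointers instead of __rcu ones, and a plain loop where the kernel waits for a
grace period before each unzip pass. All names are illustrative; it
demonstrates the algorithm from [1], not the kernel code itself.

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct elem {
        struct elem *next;
        uint32_t key;
};

static uint32_t bucket(uint32_t key, unsigned int size)
{
        return (key * 2654435761u) & (size - 1);   /* stand-in for jhash() */
}

/* One unzip pass over old bucket n: detach the leading run of elements that
 * share one new bucket and relink its tail past the following run that
 * belongs to the other new bucket. */
static void unzip(struct elem **old_b, unsigned int n, unsigned int new_size)
{
        struct elem *he = old_b[n], *last, *next = NULL;
        uint32_t h;

        if (he == NULL)
                return;
        h = bucket(he->key, new_size);

        for (last = he, he = he->next; he != NULL; he = he->next) {
                if (bucket(he->key, new_size) != h)
                        break;
                last = he;
        }
        old_b[n] = last->next;          /* old bucket skips the leading run */

        for (; he != NULL; he = he->next) {
                if (bucket(he->key, new_size) == h) {
                        next = he;
                        break;
                }
        }
        last->next = next;              /* run skips the other bucket's elements */
}

int main(void)
{
        unsigned int old_size = 4, new_size = 8, i, count = 0;
        struct elem **old_b = calloc(old_size, sizeof(*old_b));
        struct elem **new_b = calloc(new_size, sizeof(*new_b));
        struct elem *he;
        bool complete;

        /* fill the old table */
        for (i = 0; i < 32; i++) {
                he = malloc(sizeof(*he));
                he->key = i;
                he->next = old_b[bucket(i, old_size)];
                old_b[bucket(i, old_size)] = he;
        }

        /* link each new bucket to the first old element hashing to it */
        for (i = 0; i < new_size; i++)
                for (he = old_b[i & (old_size - 1)]; he != NULL; he = he->next)
                        if (bucket(he->key, new_size) == i) {
                                new_b[i] = he;
                                break;
                        }

        /* unzip the interleaved chains until the old buckets are empty;
         * the kernel waits for a grace period before each pass instead */
        do {
                complete = true;
                for (i = 0; i < old_size; i++) {
                        unzip(old_b, i, new_size);
                        if (old_b[i] != NULL)
                                complete = false;
                }
        } while (!complete);

        /* every element is now reachable from exactly its new bucket */
        for (i = 0; i < new_size; i++)
                for (he = new_b[i]; he != NULL; he = he->next) {
                        assert(bucket(he->key, new_size) == i);
                        count++;
                }
        assert(count == 32);
        printf("expansion ok: %u elements rehashed\n", count);
        return 0;
}

Shrinking needs no such iteration: each pair of old buckets is simply
concatenated into one new bucket, so a single grace period suffices.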
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
net/netfilter/nft_hash.c | 260 ++++++++++++++++++++++++++++++++++++++--------
1 file changed, 214 insertions(+), 46 deletions(-)
diff --git a/net/netfilter/nft_hash.c b/net/netfilter/nft_hash.c
index 3d3f8fc..6a1acde 100644
--- a/net/netfilter/nft_hash.c
+++ b/net/netfilter/nft_hash.c
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2008-2009 Patrick McHardy <kaber@trash.net>
+ * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -18,17 +18,29 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
+#define NFT_HASH_MIN_SIZE 4
+
struct nft_hash {
- struct hlist_head *hash;
- unsigned int hsize;
+ struct nft_hash_table __rcu *tbl;
+};
+
+struct nft_hash_table {
+ unsigned int size;
+ unsigned int elements;
+ struct nft_hash_elem __rcu *buckets[];
};
struct nft_hash_elem {
- struct hlist_node hnode;
- struct nft_data key;
- struct nft_data data[];
+ struct nft_hash_elem __rcu *next;
+ struct nft_data key;
+ struct nft_data data[];
};
+#define nft_hash_for_each_entry(i, head) \
+ for (i = nft_dereference(head); i != NULL; i = nft_dereference(i->next))
+#define nft_hash_for_each_entry_rcu(i, head) \
+ for (i = rcu_dereference(head); i != NULL; i = rcu_dereference(i->next))
+
static u32 nft_hash_rnd __read_mostly;
static bool nft_hash_rnd_initted __read_mostly;
@@ -38,7 +50,7 @@ static unsigned int nft_hash_data(const struct nft_data *data,
unsigned int h;
h = jhash(data->data, len, nft_hash_rnd);
- return ((u64)h * hsize) >> 32;
+ return h & (hsize - 1);
}
static bool nft_hash_lookup(const struct nft_set *set,
@@ -46,11 +58,12 @@ static bool nft_hash_lookup(const struct nft_set *set,
struct nft_data *data)
{
const struct nft_hash *priv = nft_set_priv(set);
+ const struct nft_hash_table *tbl = rcu_dereference(priv->tbl);
const struct nft_hash_elem *he;
unsigned int h;
- h = nft_hash_data(key, priv->hsize, set->klen);
- hlist_for_each_entry(he, &priv->hash[h], hnode) {
+ h = nft_hash_data(key, tbl->size, set->klen);
+ nft_hash_for_each_entry_rcu(he, tbl->buckets[h]) {
if (nft_data_cmp(&he->key, key, set->klen))
continue;
if (set->flags & NFT_SET_MAP)
@@ -60,19 +73,148 @@ static bool nft_hash_lookup(const struct nft_set *set,
return false;
}
-static void nft_hash_elem_destroy(const struct nft_set *set,
- struct nft_hash_elem *he)
+static void nft_hash_tbl_free(const struct nft_hash_table *tbl)
{
- nft_data_uninit(&he->key, NFT_DATA_VALUE);
- if (set->flags & NFT_SET_MAP)
- nft_data_uninit(he->data, set->dtype);
- kfree(he);
+ if (is_vmalloc_addr(tbl))
+ vfree(tbl);
+ else
+ kfree(tbl);
+}
+
+static struct nft_hash_table *nft_hash_tbl_alloc(unsigned int nbuckets)
+{
+ struct nft_hash_table *tbl;
+ size_t size;
+
+ size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
+ tbl = kzalloc(size, GFP_KERNEL | __GFP_REPEAT | __GFP_NOWARN);
+ if (tbl == NULL)
+ tbl = vzalloc(size);
+ if (tbl == NULL)
+ return NULL;
+ tbl->size = nbuckets;
+
+ return tbl;
+}
+
+static void nft_hash_chain_unzip(const struct nft_set *set,
+ const struct nft_hash_table *ntbl,
+ struct nft_hash_table *tbl, unsigned int n)
+{
+ struct nft_hash_elem *he, *last, *next;
+ unsigned int h;
+
+ he = nft_dereference(tbl->buckets[n]);
+ if (he == NULL)
+ return;
+ h = nft_hash_data(&he->key, ntbl->size, set->klen);
+
+ /* Find last element of first chain hashing to bucket h */
+ last = he;
+ nft_hash_for_each_entry(he, he->next) {
+ if (nft_hash_data(&he->key, ntbl->size, set->klen) != h)
+ break;
+ last = he;
+ }
+
+ /* Unlink first chain from the old table */
+ RCU_INIT_POINTER(tbl->buckets[n], last->next);
+
+ /* If end of chain reached, done */
+ if (he == NULL)
+ return;
+
+ /* Find first element of second chain hashing to bucket h */
+ next = NULL;
+ nft_hash_for_each_entry(he, he->next) {
+ if (nft_hash_data(&he->key, ntbl->size, set->klen) != h)
+ continue;
+ next = he;
+ break;
+ }
+
+ /* Link the two chains */
+ RCU_INIT_POINTER(last->next, next);
+}
+
+static int nft_hash_tbl_expand(const struct nft_set *set, struct nft_hash *priv)
+{
+ struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl;
+ struct nft_hash_elem *he;
+ unsigned int i, h;
+ bool complete;
+
+ ntbl = nft_hash_tbl_alloc(tbl->size * 2);
+ if (ntbl == NULL)
+ return -ENOMEM;
+
+ /* Link new table's buckets to first element in the old table
+ * hashing to the new bucket.
+ */
+ for (i = 0; i < ntbl->size; i++) {
+ h = i < tbl->size ? i : i - tbl->size;
+ nft_hash_for_each_entry(he, tbl->buckets[h]) {
+ if (nft_hash_data(&he->key, ntbl->size, set->klen) != i)
+ continue;
+ RCU_INIT_POINTER(ntbl->buckets[i], he);
+ break;
+ }
+ }
+ ntbl->elements = tbl->elements;
+
+ /* Publish new table */
+ rcu_assign_pointer(priv->tbl, ntbl);
+
+ /* Unzip interleaved hash chains */
+ do {
+ /* Wait for readers to use new table/unzipped chains */
+ synchronize_rcu();
+
+ complete = true;
+ for (i = 0; i < tbl->size; i++) {
+ nft_hash_chain_unzip(set, ntbl, tbl, i);
+ if (tbl->buckets[i] != NULL)
+ complete = false;
+ }
+ } while (!complete);
+
+ nft_hash_tbl_free(tbl);
+ return 0;
+}
+
+static int nft_hash_tbl_shrink(const struct nft_set *set, struct nft_hash *priv)
+{
+ struct nft_hash_table *tbl = nft_dereference(priv->tbl), *ntbl;
+ struct nft_hash_elem __rcu **pprev;
+ unsigned int i;
+
+ ntbl = nft_hash_tbl_alloc(tbl->size / 2);
+ if (ntbl == NULL)
+ return -ENOMEM;
+
+ for (i = 0; i < ntbl->size; i++) {
+ ntbl->buckets[i] = tbl->buckets[i];
+
+ for (pprev = &ntbl->buckets[i]; *pprev != NULL;
+ pprev = &nft_dereference(*pprev)->next)
+ ;
+ RCU_INIT_POINTER(*pprev, tbl->buckets[i + ntbl->size]);
+ }
+ ntbl->elements = tbl->elements;
+
+ /* Publish new table */
+ rcu_assign_pointer(priv->tbl, ntbl);
+ synchronize_rcu();
+
+ nft_hash_tbl_free(tbl);
+ return 0;
}
static int nft_hash_insert(const struct nft_set *set,
const struct nft_set_elem *elem)
{
struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_table *tbl = nft_dereference(priv->tbl);
struct nft_hash_elem *he;
unsigned int size, h;
@@ -91,33 +233,66 @@ static int nft_hash_insert(const struct nft_set *set,
if (set->flags & NFT_SET_MAP)
nft_data_copy(he->data, &elem->data);
- h = nft_hash_data(&he->key, priv->hsize, set->klen);
- hlist_add_head_rcu(&he->hnode, &priv->hash[h]);
+ h = nft_hash_data(&he->key, tbl->size, set->klen);
+ RCU_INIT_POINTER(he->next, tbl->buckets[h]);
+ rcu_assign_pointer(tbl->buckets[h], he);
+ tbl->elements++;
+
+ /* Expand table when exceeding 75% load */
+ if (tbl->elements > tbl->size / 4 * 3)
+ nft_hash_tbl_expand(set, priv);
+
return 0;
}
+static void nft_hash_elem_destroy(const struct nft_set *set,
+ struct nft_hash_elem *he)
+{
+ nft_data_uninit(&he->key, NFT_DATA_VALUE);
+ if (set->flags & NFT_SET_MAP)
+ nft_data_uninit(he->data, set->dtype);
+ kfree(he);
+}
+
static void nft_hash_remove(const struct nft_set *set,
const struct nft_set_elem *elem)
{
- struct nft_hash_elem *he = elem->cookie;
+ struct nft_hash *priv = nft_set_priv(set);
+ struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+ struct nft_hash_elem *he, __rcu **pprev;
- hlist_del_rcu(&he->hnode);
+ pprev = elem->cookie;
+ he = nft_dereference((*pprev));
+
+ RCU_INIT_POINTER(*pprev, he->next);
+ synchronize_rcu();
kfree(he);
+ tbl->elements--;
+
+ /* Shrink table beneath 30% load */
+ if (tbl->elements < tbl->size * 3 / 10 &&
+ tbl->size > NFT_HASH_MIN_SIZE)
+ nft_hash_tbl_shrink(set, priv);
}
static int nft_hash_get(const struct nft_set *set, struct nft_set_elem *elem)
{
const struct nft_hash *priv = nft_set_priv(set);
+ const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+ struct nft_hash_elem __rcu * const *pprev;
struct nft_hash_elem *he;
unsigned int h;
- h = nft_hash_data(&elem->key, priv->hsize, set->klen);
- hlist_for_each_entry(he, &priv->hash[h], hnode) {
- if (nft_data_cmp(&he->key, &elem->key, set->klen))
+ h = nft_hash_data(&elem->key, tbl->size, set->klen);
+ pprev = &tbl->buckets[h];
+ nft_hash_for_each_entry(he, tbl->buckets[h]) {
+ if (nft_data_cmp(&he->key, &elem->key, set->klen)) {
+ pprev = &he->next;
continue;
+ }
- elem->cookie = he;
- elem->flags = 0;
+ elem->cookie = (void *)pprev;
+ elem->flags = 0;
if (set->flags & NFT_SET_MAP)
nft_data_copy(&elem->data, he->data);
return 0;
@@ -129,12 +304,13 @@ static void nft_hash_walk(const struct nft_ctx *ctx, const struct nft_set *set,
struct nft_set_iter *iter)
{
const struct nft_hash *priv = nft_set_priv(set);
+ const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
const struct nft_hash_elem *he;
struct nft_set_elem elem;
unsigned int i;
- for (i = 0; i < priv->hsize; i++) {
- hlist_for_each_entry(he, &priv->hash[i], hnode) {
+ for (i = 0; i < tbl->size; i++) {
+ nft_hash_for_each_entry(he, tbl->buckets[i]) {
if (iter->count < iter->skip)
goto cont;
@@ -161,43 +337,35 @@ static int nft_hash_init(const struct nft_set *set,
const struct nlattr * const tb[])
{
struct nft_hash *priv = nft_set_priv(set);
- unsigned int cnt, i;
+ struct nft_hash_table *tbl;
if (unlikely(!nft_hash_rnd_initted)) {
get_random_bytes(&nft_hash_rnd, 4);
nft_hash_rnd_initted = true;
}
- /* Aim for a load factor of 0.75 */
- // FIXME: temporarily broken until we have set descriptions
- cnt = 100;
- cnt = cnt * 4 / 3;
-
- priv->hash = kcalloc(cnt, sizeof(struct hlist_head), GFP_KERNEL);
- if (priv->hash == NULL)
+ tbl = nft_hash_tbl_alloc(NFT_HASH_MIN_SIZE);
+ if (tbl == NULL)
return -ENOMEM;
- priv->hsize = cnt;
-
- for (i = 0; i < cnt; i++)
- INIT_HLIST_HEAD(&priv->hash[i]);
-
+ RCU_INIT_POINTER(priv->tbl, tbl);
return 0;
}
static void nft_hash_destroy(const struct nft_set *set)
{
const struct nft_hash *priv = nft_set_priv(set);
- const struct hlist_node *next;
- struct nft_hash_elem *elem;
+ const struct nft_hash_table *tbl = nft_dereference(priv->tbl);
+ struct nft_hash_elem *he, *next;
unsigned int i;
- for (i = 0; i < priv->hsize; i++) {
- hlist_for_each_entry_safe(elem, next, &priv->hash[i], hnode) {
- hlist_del(&elem->hnode);
- nft_hash_elem_destroy(set, elem);
+ for (i = 0; i < tbl->size; i++) {
+ for (he = nft_dereference(tbl->buckets[i]); he != NULL;
+ he = next) {
+ next = nft_dereference(he->next);
+ nft_hash_elem_destroy(set, he);
}
}
- kfree(priv->hash);
+ kfree(tbl);
}
static struct nft_set_ops nft_hash_ops __read_mostly = {
--
1.7.10.4
Thread overview: 47+ messages
2014-03-17 12:42 [PATCH 00/38] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 01/38] netfilter: remove double colon Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 02/38] netfilter: xt_ipcomp: Use ntohs to ease sparse warning Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 03/38] netfilter: nft_ct: labels get support Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 04/38] netfilter: ip_set: rename nfnl_dereference()/nfnl_set() Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 05/38] netfilter: nfnetlink: add rcu_dereference_protected() helpers Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 06/38] netfilter: nf_tables: add nft_dereference() macro Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 07/38] netfilter: nf_tables: accept QUEUE/DROP verdict parameters Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 08/38] netfilter: nfnetlink_log: remove unused code Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 09/38] netfilter: nf_tables: add optional user data area to rules Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 10/38] netfilter: ipset: Follow manual page behavior for SET target on list:set Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 11/38] netfilter: ipset: Add hash: fix coccinelle warnings Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 12/38] netfilter: ipset: add hash:ip,mark data type to ipset Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 13/38] netfilter: ipset: add markmask for hash:ip,mark data type Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 14/38] netfilter: ipset: Prepare the kernel for create option flags when no extension is needed Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 15/38] netfilter: ipset: kernel: uapi: fix MARKMASK attr ABI breakage Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 16/38] netfilter: ipset: move registration message to init from net_init Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 17/38] netfilter: ipset: add forceadd kernel support for hash set types Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 18/38] sections, ipvs: Remove useless __read_mostly for ipvs genl_ops Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 19/38] ipvs: Reduce checkpatch noise in ip_vs_lblc.c Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 20/38] netfilter: trivial code cleanup and doc changes Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 21/38] netfilter: conntrack: spinlock per cpu to protect special lists Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 22/38] netfilter: avoid race with exp->master ct Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 23/38] netfilter: conntrack: seperate expect locking from nf_conntrack_lock Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 24/38] netfilter: conntrack: remove central spinlock nf_conntrack_lock Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 25/38] netfilter: nft_hash: bug fixes and resizing Pablo Neira Ayuso [this message]
2014-03-17 12:42 ` [PATCH 26/38] netfilter: nf_tables: clean up nf_tables_trans_add() argument order Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 27/38] netfilter: nf_tables: restore context for expression destructors Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 28/38] netfilter: nf_tables: restore notifications for anonymous set destruction Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 29/38] netfilter: nft_ct: remove family from struct nft_ct Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 30/38] netfilter: nft_nat: fix family validation Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 31/38] netfilter: connlimit: factor hlist search into new function Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 32/38] netfilter: connlimit: improve packet-to-closed-connection logic Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 33/38] netfilter: connlimit: move insertion of new element out of count function Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 34/38] netfilter: connlimit: use kmem_cache for conn objects Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 35/38] netfilter: Convert uses of __constant_<foo> to <foo> Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 36/38] netfilter: connlimit: use keyed locks Pablo Neira Ayuso
2014-03-17 12:54 ` David Laight
2014-03-17 14:26 ` Florian Westphal
2014-03-17 14:40 ` David Laight
2014-03-17 14:00 ` Eric Dumazet
2014-03-17 14:23 ` Florian Westphal
2014-03-18 13:46 ` Jesper Dangaard Brouer
2014-03-18 14:01 ` Eric Dumazet
2014-03-17 12:42 ` [PATCH 37/38] netfilter: connlimit: make same_source_net signed Pablo Neira Ayuso
2014-03-17 12:42 ` [PATCH 38/38] netfilter: connlimit: use rbtree for per-host conntrack obj storage Pablo Neira Ayuso
2014-03-17 19:19 ` [PATCH 00/38] Netfilter/IPVS updates for net-next David Miller