From: Eric Dumazet <eric.dumazet@gmail.com>
To: Patrick McHardy <kaber@trash.net>
Cc: Jorrit Kronjee <j.kronjee@infopact.nl>,
netfilter-devel@vger.kernel.org, netdev <netdev@vger.kernel.org>
Subject: Re: [PATCH nf-next-2.6] xt_hashlimit: RCU conversion
Date: Thu, 01 Apr 2010 14:10:04 +0200 [thread overview]
Message-ID: <1270123804.2229.91.camel@edumazet-laptop> (raw)
In-Reply-To: <4BB47D81.3060400@trash.net>
Le jeudi 01 avril 2010 à 13:03 +0200, Patrick McHardy a écrit :
> Eric Dumazet wrote:
> > xt_hashlimit uses a central lock per hash table and suffers from
> > contention on some workloads. (Multiqueue NIC or if RPS is enabled)
> >
> > After RCU conversion, central lock is only used when a writer wants to
> > add or delete an entry.
> >
> > For 'readers', updating an existing entry, they use an individual lock
> > per entry.
>
> Looks good to me, thanks Eric.
>
> > -/* allocate dsthash_ent, initialize dst, put in htable and lock it */
> > -static struct dsthash_ent *
> > -dsthash_alloc_init(struct xt_hashlimit_htable *ht,
> > - const struct dsthash_dst *dst)
>
> Is there a reason for moving this function downwards in the file?
> That unnecessarily increases the diff and makes the patch harder to
> review. For review purposes I moved it back up, resulting in 42
> lines less diff.
> --
Well, this is because I had to move inside this function various
initializations and these inits use user2credits() which was defined
after dsthash_alloc_init().
But we can avoid this since we hold the entry spinlock, and before hash
insertion.
Only the lookup fields and the spinlock MUST be committed to memory
before the insert. Other fields can be initialized later by caller.
Here is V2 of patch, I added locking as well in dl_seq_real_show()
because we call rateinfo_recalc(). htable main lock doesnt anymore
protects each entry rateinfo.
Thanks
[PATCH nf-next-2.6] xt_hashlimit: RCU conversion
xt_hashlimit uses a central lock per hash table and suffers from
contention on some workloads. (Multiqueue NIC or if RPS is enabled)
After RCU conversion, central lock is only used when a writer wants to
add or delete an entry.
For 'readers', updating an existing entry, they use an individual lock
per entry.
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
---
diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c
index 5470bb0..453178d 100644
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -81,12 +81,14 @@ struct dsthash_ent {
struct dsthash_dst dst;
/* modified structure members in the end */
+ spinlock_t lock;
unsigned long expires; /* precalculated expiry time */
struct {
unsigned long prev; /* last modification */
u_int32_t credit;
u_int32_t credit_cap, cost;
} rateinfo;
+ struct rcu_head rcu;
};
struct xt_hashlimit_htable {
@@ -143,9 +145,11 @@ dsthash_find(const struct xt_hashlimit_htable *ht,
u_int32_t hash = hash_dst(ht, dst);
if (!hlist_empty(&ht->hash[hash])) {
- hlist_for_each_entry(ent, pos, &ht->hash[hash], node)
- if (dst_cmp(ent, dst))
+ hlist_for_each_entry_rcu(ent, pos, &ht->hash[hash], node)
+ if (dst_cmp(ent, dst)) {
+ spin_lock(&ent->lock);
return ent;
+ }
}
return NULL;
}
@@ -157,9 +161,10 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,
{
struct dsthash_ent *ent;
+ spin_lock(&ht->lock);
/* initialize hash with random val at the time we allocate
* the first hashtable entry */
- if (!ht->rnd_initialized) {
+ if (unlikely(!ht->rnd_initialized)) {
get_random_bytes(&ht->rnd, sizeof(ht->rnd));
ht->rnd_initialized = true;
}
@@ -168,27 +173,36 @@ dsthash_alloc_init(struct xt_hashlimit_htable *ht,
/* FIXME: do something. question is what.. */
if (net_ratelimit())
pr_err("max count of %u reached\n", ht->cfg.max);
- return NULL;
- }
-
- ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
+ ent = NULL;
+ } else
+ ent = kmem_cache_alloc(hashlimit_cachep, GFP_ATOMIC);
if (!ent) {
if (net_ratelimit())
pr_err("cannot allocate dsthash_ent\n");
- return NULL;
- }
- memcpy(&ent->dst, dst, sizeof(ent->dst));
+ } else {
+ memcpy(&ent->dst, dst, sizeof(ent->dst));
+ spin_lock_init(&ent->lock);
- hlist_add_head(&ent->node, &ht->hash[hash_dst(ht, dst)]);
- ht->count++;
+ spin_lock(&ent->lock);
+ hlist_add_head_rcu(&ent->node, &ht->hash[hash_dst(ht, dst)]);
+ ht->count++;
+ }
+ spin_unlock(&ht->lock);
return ent;
}
+static void dsthash_free_rcu(struct rcu_head *head)
+{
+ struct dsthash_ent *ent = container_of(head, struct dsthash_ent, rcu);
+
+ kmem_cache_free(hashlimit_cachep, ent);
+}
+
static inline void
dsthash_free(struct xt_hashlimit_htable *ht, struct dsthash_ent *ent)
{
- hlist_del(&ent->node);
- kmem_cache_free(hashlimit_cachep, ent);
+ hlist_del_rcu(&ent->node);
+ call_rcu_bh(&ent->rcu, dsthash_free_rcu);
ht->count--;
}
static void htable_gc(unsigned long htlong);
@@ -512,15 +526,14 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
if (hashlimit_init_dst(hinfo, &dst, skb, par->thoff) < 0)
goto hotdrop;
- spin_lock_bh(&hinfo->lock);
+ rcu_read_lock_bh();
dh = dsthash_find(hinfo, &dst);
if (dh == NULL) {
dh = dsthash_alloc_init(hinfo, &dst);
if (dh == NULL) {
- spin_unlock_bh(&hinfo->lock);
+ rcu_read_unlock_bh();
goto hotdrop;
}
-
dh->expires = jiffies + msecs_to_jiffies(hinfo->cfg.expire);
dh->rateinfo.prev = jiffies;
dh->rateinfo.credit = user2credits(hinfo->cfg.avg *
@@ -537,11 +550,13 @@ hashlimit_mt(const struct sk_buff *skb, const struct xt_match_param *par)
if (dh->rateinfo.credit >= dh->rateinfo.cost) {
/* below the limit */
dh->rateinfo.credit -= dh->rateinfo.cost;
- spin_unlock_bh(&hinfo->lock);
+ spin_unlock(&dh->lock);
+ rcu_read_unlock_bh();
return !(info->cfg.mode & XT_HASHLIMIT_INVERT);
}
- spin_unlock_bh(&hinfo->lock);
+ spin_unlock(&dh->lock);
+ rcu_read_unlock_bh();
/* default match is underlimit - so over the limit, we need to invert */
return info->cfg.mode & XT_HASHLIMIT_INVERT;
@@ -666,12 +681,15 @@ static void dl_seq_stop(struct seq_file *s, void *v)
static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
struct seq_file *s)
{
+ int res;
+
+ spin_lock(&ent->lock);
/* recalculate to show accurate numbers */
rateinfo_recalc(ent, jiffies);
switch (family) {
case NFPROTO_IPV4:
- return seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
+ res = seq_printf(s, "%ld %pI4:%u->%pI4:%u %u %u %u\n",
(long)(ent->expires - jiffies)/HZ,
&ent->dst.ip.src,
ntohs(ent->dst.src_port),
@@ -679,9 +697,10 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
ntohs(ent->dst.dst_port),
ent->rateinfo.credit, ent->rateinfo.credit_cap,
ent->rateinfo.cost);
+ break;
#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
case NFPROTO_IPV6:
- return seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
+ res = seq_printf(s, "%ld %pI6:%u->%pI6:%u %u %u %u\n",
(long)(ent->expires - jiffies)/HZ,
&ent->dst.ip6.src,
ntohs(ent->dst.src_port),
@@ -689,11 +708,14 @@ static int dl_seq_real_show(struct dsthash_ent *ent, u_int8_t family,
ntohs(ent->dst.dst_port),
ent->rateinfo.credit, ent->rateinfo.credit_cap,
ent->rateinfo.cost);
+ break;
#endif
default:
BUG();
- return 0;
+ res = 0;
}
+ spin_unlock(&ent->lock);
+ return res;
}
static int dl_seq_show(struct seq_file *s, void *v)
@@ -817,9 +839,11 @@ err1:
static void __exit hashlimit_mt_exit(void)
{
- kmem_cache_destroy(hashlimit_cachep);
xt_unregister_matches(hashlimit_mt_reg, ARRAY_SIZE(hashlimit_mt_reg));
unregister_pernet_subsys(&hashlimit_net_ops);
+
+ rcu_barrier_bh();
+ kmem_cache_destroy(hashlimit_cachep);
}
module_init(hashlimit_mt_init);
next prev parent reply other threads:[~2010-04-01 12:10 UTC|newest]
Thread overview: 10+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <4BA74950.6000305@infopact.nl>
[not found] ` <4BA7A5D8.5080101@trash.net>
[not found] ` <4BA8DAC5.6050002@infopact.nl>
[not found] ` <1269364893.2983.296.camel@edumazet-laptop>
[not found] ` <4BAA2DC5.7000409@infopact.nl>
[not found] ` <1269447674.3213.64.camel@edumazet-laptop>
2010-03-25 9:32 ` debugging kernel during packet drops Eric Dumazet
2010-03-25 10:35 ` Patrick McHardy
2010-03-25 11:02 ` Eric Dumazet
2010-03-31 12:23 ` [PATCH nf-next-2.6] xt_hashlimit: RCU conversion Eric Dumazet
2010-04-01 11:03 ` Patrick McHardy
2010-04-01 12:10 ` Eric Dumazet [this message]
2010-04-01 12:36 ` Patrick McHardy
2010-03-25 12:42 ` debugging kernel during packet drops Jan Engelhardt
2010-03-30 12:06 ` Jan Engelhardt
2010-03-30 14:12 ` Patrick McHardy
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1270123804.2229.91.camel@edumazet-laptop \
--to=eric.dumazet@gmail.com \
--cc=j.kronjee@infopact.nl \
--cc=kaber@trash.net \
--cc=netdev@vger.kernel.org \
--cc=netfilter-devel@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox