From: Thomas Gleixner <tglx@linutronix.de>
To: LKML <linux-kernel@vger.kernel.org>
Cc: Linus Torvalds <torvalds@linuxfoundation.org>,
x86@kernel.org, Wangyang Guo <wangyang.guo@intel.com>,
Arjan van De Ven <arjan@linux.intel.com>,
"David S. Miller" <davem@davemloft.net>,
Eric Dumazet <edumazet@google.com>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
netdev@vger.kernel.org, Will Deacon <will@kernel.org>,
Peter Zijlstra <peterz@infradead.org>,
Boqun Feng <boqun.feng@gmail.com>,
Mark Rutland <mark.rutland@arm.com>,
Marc Zyngier <maz@kernel.org>,
Arjan Van De Ven <arjan.van.de.ven@intel.com>
Subject: [patch V2 4/4] net: dst: Switch to rcuref_t reference counting
Date: Tue, 7 Mar 2023 13:57:46 +0100 (CET) [thread overview]
Message-ID: <20230307125538.989175656@linutronix.de> (raw)
In-Reply-To: 20230307125358.772287565@linutronix.de
Under high contention dst_entry::__refcnt becomes a significant bottleneck.
atomic_inc_not_zero() is implemented with a cmpxchg() loop, which goes into
high retry rates on contention.
Switch the reference count to rcuref_t which results in a significant
performance gain.
The gain depends on the micro-architecture and the number of concurrent
operations and has been measured in the range of +25% to +130% with a
localhost memtier/memcached benchmark which amplifies the problem
massively.
Running the memtier/memcached benchmark over a real (1Gb) network
connection the conversion on top of the false sharing fix for struct
dst_entry::__refcnt results in a total gain in the 2%-5% range over the
upstream baseline.
Reported-by: Wangyang Guo <wangyang.guo@intel.com>
Reported-by: Arjan Van De Ven <arjan.van.de.ven@intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Jakub Kicinski <kuba@kernel.org>
Cc: Paolo Abeni <pabeni@redhat.com>
Cc: netdev@vger.kernel.org
---
include/net/dst.h | 9 +++++----
include/net/sock.h | 2 +-
net/bridge/br_nf_core.c | 2 +-
net/core/dst.c | 26 +++++---------------------
net/core/rtnetlink.c | 2 +-
net/ipv6/route.c | 6 +++---
net/netfilter/ipvs/ip_vs_xmit.c | 4 ++--
7 files changed, 18 insertions(+), 33 deletions(-)
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -16,6 +16,7 @@
#include <linux/bug.h>
#include <linux/jiffies.h>
#include <linux/refcount.h>
+#include <linux/rcuref.h>
#include <net/neighbour.h>
#include <asm/processor.h>
#include <linux/indirect_call_wrapper.h>
@@ -65,7 +66,7 @@ struct dst_entry {
* input/output/ops or performance tanks badly
*/
#ifdef CONFIG_64BIT
- atomic_t __refcnt; /* 64-bit offset 64 */
+ rcuref_t __refcnt; /* 64-bit offset 64 */
#endif
int __use;
unsigned long lastuse;
@@ -75,7 +76,7 @@ struct dst_entry {
__u32 tclassid;
#ifndef CONFIG_64BIT
struct lwtunnel_state *lwtstate;
- atomic_t __refcnt; /* 32-bit offset 64 */
+ rcuref_t __refcnt; /* 32-bit offset 64 */
#endif
netdevice_tracker dev_tracker;
@@ -241,7 +242,7 @@ static inline void dst_hold(struct dst_e
* the placement of __refcnt in struct dst_entry
*/
BUILD_BUG_ON(offsetof(struct dst_entry, __refcnt) & 63);
- WARN_ON(atomic_inc_not_zero(&dst->__refcnt) == 0);
+ WARN_ON(!rcuref_get(&dst->__refcnt));
}
static inline void dst_use_noref(struct dst_entry *dst, unsigned long time)
@@ -305,7 +306,7 @@ static inline void skb_dst_copy(struct s
*/
static inline bool dst_hold_safe(struct dst_entry *dst)
{
- return atomic_inc_not_zero(&dst->__refcnt);
+ return rcuref_get(&dst->__refcnt);
}
/**
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2131,7 +2131,7 @@ sk_dst_get(struct sock *sk)
rcu_read_lock();
dst = rcu_dereference(sk->sk_dst_cache);
- if (dst && !atomic_inc_not_zero(&dst->__refcnt))
+ if (dst && !rcuref_get(&dst->__refcnt))
dst = NULL;
rcu_read_unlock();
return dst;
--- a/net/bridge/br_nf_core.c
+++ b/net/bridge/br_nf_core.c
@@ -73,7 +73,7 @@ void br_netfilter_rtable_init(struct net
{
struct rtable *rt = &br->fake_rtable;
- atomic_set(&rt->dst.__refcnt, 1);
+ rcuref_init(&rt->dst.__refcnt, 1);
rt->dst.dev = br->dev;
dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE;
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -66,7 +66,7 @@ void dst_init(struct dst_entry *dst, str
dst->tclassid = 0;
#endif
dst->lwtstate = NULL;
- atomic_set(&dst->__refcnt, initial_ref);
+ rcuref_init(&dst->__refcnt, initial_ref);
dst->__use = 0;
dst->lastuse = jiffies;
dst->flags = flags;
@@ -162,31 +162,15 @@ EXPORT_SYMBOL(dst_dev_put);
void dst_release(struct dst_entry *dst)
{
- if (dst) {
- int newrefcnt;
-
- newrefcnt = atomic_dec_return(&dst->__refcnt);
- if (WARN_ONCE(newrefcnt < 0, "dst_release underflow"))
- net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
- __func__, dst, newrefcnt);
- if (!newrefcnt)
- call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
- }
+ if (dst && rcuref_put(&dst->__refcnt))
+ call_rcu_hurry(&dst->rcu_head, dst_destroy_rcu);
}
EXPORT_SYMBOL(dst_release);
void dst_release_immediate(struct dst_entry *dst)
{
- if (dst) {
- int newrefcnt;
-
- newrefcnt = atomic_dec_return(&dst->__refcnt);
- if (WARN_ONCE(newrefcnt < 0, "dst_release_immediate underflow"))
- net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
- __func__, dst, newrefcnt);
- if (!newrefcnt)
- dst_destroy(dst);
- }
+ if (dst && rcuref_put(&dst->__refcnt))
+ dst_destroy(dst);
}
EXPORT_SYMBOL(dst_release_immediate);
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -840,7 +840,7 @@ int rtnl_put_cacheinfo(struct sk_buff *s
if (dst) {
ci.rta_lastuse = jiffies_delta_to_clock_t(jiffies - dst->lastuse);
ci.rta_used = dst->__use;
- ci.rta_clntref = atomic_read(&dst->__refcnt);
+ ci.rta_clntref = rcuref_read(&dst->__refcnt);
}
if (expires) {
unsigned long clock;
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -293,7 +293,7 @@ static const struct fib6_info fib6_null_
static const struct rt6_info ip6_null_entry_template = {
.dst = {
- .__refcnt = ATOMIC_INIT(1),
+ .__refcnt = RCUREF_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -ENETUNREACH,
@@ -307,7 +307,7 @@ static const struct rt6_info ip6_null_en
static const struct rt6_info ip6_prohibit_entry_template = {
.dst = {
- .__refcnt = ATOMIC_INIT(1),
+ .__refcnt = RCUREF_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -EACCES,
@@ -319,7 +319,7 @@ static const struct rt6_info ip6_prohibi
static const struct rt6_info ip6_blk_hole_entry_template = {
.dst = {
- .__refcnt = ATOMIC_INIT(1),
+ .__refcnt = RCUREF_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -EINVAL,
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -339,7 +339,7 @@ static int
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
&dest->addr.ip, &dest_dst->dst_saddr.ip,
- atomic_read(&rt->dst.__refcnt));
+ rcuref_read(&rt->dst.__refcnt));
}
if (ret_saddr)
*ret_saddr = dest_dst->dst_saddr.ip;
@@ -507,7 +507,7 @@ static int
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
&dest->addr.in6, &dest_dst->dst_saddr.in6,
- atomic_read(&rt->dst.__refcnt));
+ rcuref_read(&rt->dst.__refcnt));
}
if (ret_saddr)
*ret_saddr = dest_dst->dst_saddr.in6;
next prev parent reply other threads:[~2023-03-07 12:58 UTC|newest]
Thread overview: 13+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-03-07 12:57 [patch V2 0/4] net, refcount: Address dst_entry reference count scalability issues Thomas Gleixner
2023-03-07 12:57 ` [patch V2 1/4] net: dst: Prevent false sharing vs. dst_entry::__refcnt Thomas Gleixner
2023-03-15 20:36 ` Jakub Kicinski
2023-03-15 20:47 ` Eric Dumazet
2023-03-15 22:26 ` David Ahern
2023-03-07 12:57 ` [patch V2 2/4] atomics: Provide atomic_add_and_negative() variants Thomas Gleixner
2023-03-22 13:20 ` Mark Rutland
2023-03-07 12:57 ` [patch V2 3/4] atomics: Provide rcuref - scalable reference counting Thomas Gleixner
2023-03-09 8:35 ` Qiuxu Zhuo
2023-03-20 16:05 ` Thomas Gleixner
2023-03-07 12:57 ` Thomas Gleixner [this message]
2023-03-07 17:55 ` [patch V2 4/4] net: dst: Switch to rcuref_t " Linus Torvalds
2023-03-07 23:00 ` Thomas Gleixner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20230307125538.989175656@linutronix.de \
--to=tglx@linutronix.de \
--cc=arjan.van.de.ven@intel.com \
--cc=arjan@linux.intel.com \
--cc=boqun.feng@gmail.com \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=kuba@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mark.rutland@arm.com \
--cc=maz@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=peterz@infradead.org \
--cc=torvalds@linuxfoundation.org \
--cc=wangyang.guo@intel.com \
--cc=will@kernel.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).