From: Thomas Gleixner <tglx@linutronix.de>
To: Eric Dumazet <edumazet@google.com>
Cc: LKML <linux-kernel@vger.kernel.org>,
Linus Torvalds <torvalds@linuxfoundation.org>,
x86@kernel.org, Wangyang Guo <wangyang.guo@intel.com>,
Arjan van De Ven <arjan@linux.intel.com>,
"David S. Miller" <davem@davemloft.net>,
Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>,
netdev@vger.kernel.org, Will Deacon <will@kernel.org>,
Peter Zijlstra <peterz@infradead.org>,
Boqun Feng <boqun.feng@gmail.com>,
Mark Rutland <mark.rutland@arm.com>,
Marc Zyngier <maz@kernel.org>
Subject: Re: [patch 1/3] net: dst: Prevent false sharing vs. dst_entry::__refcnt
Date: Thu, 02 Mar 2023 00:12:23 +0100 [thread overview]
Message-ID: <87edq8kqw8.ffs@tglx> (raw)
In-Reply-To: <875yblmq83.ffs@tglx>
On Tue, Feb 28 2023 at 22:31, Thomas Gleixner wrote:
> On Tue, Feb 28 2023 at 16:17, Eric Dumazet wrote:
>> Instead of mere pads, add some unions, and let rt6i_uncached/rt6i_uncached_list
>> use them.
>
> If I understand correctly, you suggest to move
>
> rt6_info::rt6i_uncached[_list], rtable::rt_uncached[_list]
>
> into struct dst_entry and fixup the usage sites, right?
>
> I don't see why that would need a union. dst_entry::rt_uncached[_list]
> would work for both, no?
So I came up with the below.
rt6_info shrinks from 232 to 224 bytes. rtable size is unchanged.
Thanks,
tglx
---
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -69,15 +69,28 @@ struct dst_entry {
#endif
int __use;
unsigned long lastuse;
- struct lwtunnel_state *lwtstate;
struct rcu_head rcu_head;
short error;
short __pad;
__u32 tclassid;
#ifndef CONFIG_64BIT
+ struct lwtunnel_state *lwtstate;
atomic_t __refcnt; /* 32-bit offset 64 */
#endif
netdevice_tracker dev_tracker;
+
+ /*
+ * Used by rtable and rt6_info. Moves lwtstate into the next cache
+ * line on 64bit so that lwtstate does not cause false sharing with
+ * __refcnt under contention of __refcnt. This also puts the
+ * frequently accessed members of rtable and rt6_info out of the
+ * __refcnt cache line.
+ */
+ struct list_head rt_uncached;
+ struct uncached_list *rt_uncached_list;
+#ifdef CONFIG_64BIT
+ struct lwtunnel_state *lwtstate;
+#endif
};
struct dst_metrics {
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -217,9 +217,6 @@ struct rt6_info {
struct inet6_dev *rt6i_idev;
u32 rt6i_flags;
- struct list_head rt6i_uncached;
- struct uncached_list *rt6i_uncached_list;
-
/* more non-fragment space at head required */
unsigned short rt6i_nfheader_len;
};
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -104,7 +104,7 @@ static inline struct dst_entry *ip6_rout
static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags)
{
if (!(flags & RT6_LOOKUP_F_DST_NOREF) ||
- !list_empty(&rt->rt6i_uncached))
+ !list_empty(&rt->dst.rt_uncached))
ip6_rt_put(rt);
}
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -81,9 +81,6 @@ struct rtable {
/* Miscellaneous cached information */
u32 rt_mtu_locked:1,
rt_pmtu:31;
-
- struct list_head rt_uncached;
- struct uncached_list *rt_uncached_list;
};
static inline bool rt_is_input_route(const struct rtable *rt)
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1508,20 +1508,20 @@ void rt_add_uncached_list(struct rtable
{
struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
- rt->rt_uncached_list = ul;
+ rt->dst.rt_uncached_list = ul;
spin_lock_bh(&ul->lock);
- list_add_tail(&rt->rt_uncached, &ul->head);
+ list_add_tail(&rt->dst.rt_uncached, &ul->head);
spin_unlock_bh(&ul->lock);
}
void rt_del_uncached_list(struct rtable *rt)
{
- if (!list_empty(&rt->rt_uncached)) {
- struct uncached_list *ul = rt->rt_uncached_list;
+ if (!list_empty(&rt->dst.rt_uncached)) {
+ struct uncached_list *ul = rt->dst.rt_uncached_list;
spin_lock_bh(&ul->lock);
- list_del_init(&rt->rt_uncached);
+ list_del_init(&rt->dst.rt_uncached);
spin_unlock_bh(&ul->lock);
}
}
@@ -1546,13 +1546,13 @@ void rt_flush_dev(struct net_device *dev
continue;
spin_lock_bh(&ul->lock);
- list_for_each_entry_safe(rt, safe, &ul->head, rt_uncached) {
+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
if (rt->dst.dev != dev)
continue;
rt->dst.dev = blackhole_netdev;
netdev_ref_replace(dev, blackhole_netdev,
&rt->dst.dev_tracker, GFP_ATOMIC);
- list_move(&rt->rt_uncached, &ul->quarantine);
+ list_move(&rt->dst.rt_uncached, &ul->quarantine);
}
spin_unlock_bh(&ul->lock);
}
@@ -1644,7 +1644,7 @@ struct rtable *rt_dst_alloc(struct net_d
rt->rt_uses_gateway = 0;
rt->rt_gw_family = 0;
rt->rt_gw4 = 0;
- INIT_LIST_HEAD(&rt->rt_uncached);
+ INIT_LIST_HEAD(&rt->dst.rt_uncached);
rt->dst.output = ip_output;
if (flags & RTCF_LOCAL)
@@ -1675,7 +1675,7 @@ struct rtable *rt_dst_clone(struct net_d
new_rt->rt_gw4 = rt->rt_gw4;
else if (rt->rt_gw_family == AF_INET6)
new_rt->rt_gw6 = rt->rt_gw6;
- INIT_LIST_HEAD(&new_rt->rt_uncached);
+ INIT_LIST_HEAD(&new_rt->dst.rt_uncached);
new_rt->dst.input = rt->dst.input;
new_rt->dst.output = rt->dst.output;
@@ -2859,7 +2859,7 @@ struct dst_entry *ipv4_blackhole_route(s
else if (rt->rt_gw_family == AF_INET6)
rt->rt_gw6 = ort->rt_gw6;
- INIT_LIST_HEAD(&rt->rt_uncached);
+ INIT_LIST_HEAD(&rt->dst.rt_uncached);
}
dst_release(dst_orig);
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -91,7 +91,7 @@ static int xfrm4_fill_dst(struct xfrm_ds
xdst->u.rt.rt_gw6 = rt->rt_gw6;
xdst->u.rt.rt_pmtu = rt->rt_pmtu;
xdst->u.rt.rt_mtu_locked = rt->rt_mtu_locked;
- INIT_LIST_HEAD(&xdst->u.rt.rt_uncached);
+ INIT_LIST_HEAD(&xdst->u.rt.dst.rt_uncached);
rt_add_uncached_list(&xdst->u.rt);
return 0;
@@ -121,7 +121,7 @@ static void xfrm4_dst_destroy(struct dst
struct xfrm_dst *xdst = (struct xfrm_dst *)dst;
dst_destroy_metrics_generic(dst);
- if (xdst->u.rt.rt_uncached_list)
+ if (xdst->u.rt.dst.rt_uncached_list)
rt_del_uncached_list(&xdst->u.rt);
xfrm_dst_destroy(xdst);
}
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -139,20 +139,20 @@ void rt6_uncached_list_add(struct rt6_in
{
struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
- rt->rt6i_uncached_list = ul;
+ rt->dst.rt_uncached_list = ul;
spin_lock_bh(&ul->lock);
- list_add_tail(&rt->rt6i_uncached, &ul->head);
+ list_add_tail(&rt->dst.rt_uncached, &ul->head);
spin_unlock_bh(&ul->lock);
}
void rt6_uncached_list_del(struct rt6_info *rt)
{
- if (!list_empty(&rt->rt6i_uncached)) {
- struct uncached_list *ul = rt->rt6i_uncached_list;
+ if (!list_empty(&rt->dst.rt_uncached)) {
+ struct uncached_list *ul = rt->dst.rt_uncached_list;
spin_lock_bh(&ul->lock);
- list_del_init(&rt->rt6i_uncached);
+ list_del_init(&rt->dst.rt_uncached);
spin_unlock_bh(&ul->lock);
}
}
@@ -169,7 +169,7 @@ static void rt6_uncached_list_flush_dev(
continue;
spin_lock_bh(&ul->lock);
- list_for_each_entry_safe(rt, safe, &ul->head, rt6i_uncached) {
+ list_for_each_entry_safe(rt, safe, &ul->head, dst.rt_uncached) {
struct inet6_dev *rt_idev = rt->rt6i_idev;
struct net_device *rt_dev = rt->dst.dev;
bool handled = false;
@@ -188,7 +188,7 @@ static void rt6_uncached_list_flush_dev(
handled = true;
}
if (handled)
- list_move(&rt->rt6i_uncached,
+ list_move(&rt->dst.rt_uncached,
&ul->quarantine);
}
spin_unlock_bh(&ul->lock);
@@ -334,7 +334,7 @@ static const struct rt6_info ip6_blk_hol
static void rt6_info_init(struct rt6_info *rt)
{
memset_after(rt, 0, dst);
- INIT_LIST_HEAD(&rt->rt6i_uncached);
+ INIT_LIST_HEAD(&rt->dst.rt_uncached);
}
/* allocate dst with ip6_dst_ops */
@@ -2638,7 +2638,7 @@ struct dst_entry *ip6_route_output_flags
dst = ip6_route_output_flags_noref(net, sk, fl6, flags);
rt6 = (struct rt6_info *)dst;
/* For dst cached in uncached_list, refcnt is already taken. */
- if (list_empty(&rt6->rt6i_uncached) && !dst_hold_safe(dst)) {
+ if (list_empty(&rt6->dst.rt_uncached) && !dst_hold_safe(dst)) {
dst = &net->ipv6.ip6_null_entry->dst;
dst_hold(dst);
}
@@ -2748,7 +2748,7 @@ INDIRECT_CALLABLE_SCOPE struct dst_entry
from = rcu_dereference(rt->from);
if (from && (rt->rt6i_flags & RTF_PCPU ||
- unlikely(!list_empty(&rt->rt6i_uncached))))
+ unlikely(!list_empty(&rt->dst.rt_uncached))))
dst_ret = rt6_dst_from_check(rt, from, cookie);
else
dst_ret = rt6_check(rt, from, cookie);
@@ -6483,7 +6483,7 @@ static int __net_init ip6_route_net_init
net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
ip6_template_metrics, true);
- INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->rt6i_uncached);
+ INIT_LIST_HEAD(&net->ipv6.ip6_null_entry->dst.rt_uncached);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
net->ipv6.fib6_has_custom_rules = false;
@@ -6495,7 +6495,7 @@ static int __net_init ip6_route_net_init
net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
ip6_template_metrics, true);
- INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->rt6i_uncached);
+ INIT_LIST_HEAD(&net->ipv6.ip6_prohibit_entry->dst.rt_uncached);
net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
sizeof(*net->ipv6.ip6_blk_hole_entry),
@@ -6505,7 +6505,7 @@ static int __net_init ip6_route_net_init
net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
ip6_template_metrics, true);
- INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->rt6i_uncached);
+ INIT_LIST_HEAD(&net->ipv6.ip6_blk_hole_entry->dst.rt_uncached);
#ifdef CONFIG_IPV6_SUBTREES
net->ipv6.fib6_routes_require_src = 0;
#endif
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -89,7 +89,7 @@ static int xfrm6_fill_dst(struct xfrm_ds
xdst->u.rt6.rt6i_gateway = rt->rt6i_gateway;
xdst->u.rt6.rt6i_dst = rt->rt6i_dst;
xdst->u.rt6.rt6i_src = rt->rt6i_src;
- INIT_LIST_HEAD(&xdst->u.rt6.rt6i_uncached);
+ INIT_LIST_HEAD(&xdst->u.rt6.dst.rt_uncached);
rt6_uncached_list_add(&xdst->u.rt6);
return 0;
@@ -121,7 +121,7 @@ static void xfrm6_dst_destroy(struct dst
if (likely(xdst->u.rt6.rt6i_idev))
in6_dev_put(xdst->u.rt6.rt6i_idev);
dst_destroy_metrics_generic(dst);
- if (xdst->u.rt6.rt6i_uncached_list)
+ if (xdst->u.rt6.dst.rt_uncached_list)
rt6_uncached_list_del(&xdst->u.rt6);
xfrm_dst_destroy(xdst);
}
next prev parent reply other threads:[~2023-03-01 23:12 UTC|newest]
Thread overview: 19+ messages / expand[flat|nested] mbox.gz Atom feed top
2023-02-28 14:33 [patch 0/3] net, refcount: Address dst_entry reference count scalability issues Thomas Gleixner
2023-02-28 14:33 ` [patch 1/3] net: dst: Prevent false sharing vs. dst_entry::__refcnt Thomas Gleixner
2023-02-28 15:17 ` Eric Dumazet
2023-02-28 21:31 ` Thomas Gleixner
2023-03-01 23:12 ` Thomas Gleixner [this message]
2023-02-28 14:33 ` [patch 2/3] atomics: Provide rcuref - scalable reference counting Thomas Gleixner
2023-03-01 0:42 ` Linus Torvalds
2023-03-01 1:07 ` Linus Torvalds
2023-03-01 11:09 ` Thomas Gleixner
2023-03-02 1:05 ` Thomas Gleixner
2023-03-02 1:29 ` Randy Dunlap
2023-03-02 19:36 ` Linus Torvalds
2023-02-28 14:33 ` [patch 3/3] net: dst: Switch to rcuref_t " Thomas Gleixner
2023-02-28 15:07 ` [patch 0/3] net, refcount: Address dst_entry reference count scalability issues Eric Dumazet
2023-02-28 16:38 ` Thomas Gleixner
2023-02-28 16:59 ` Eric Dumazet
2023-03-01 1:00 ` Thomas Gleixner
2023-03-01 3:17 ` Jakub Kicinski
2023-03-01 10:40 ` Thomas Gleixner
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=87edq8kqw8.ffs@tglx \
--to=tglx@linutronix.de \
--cc=arjan@linux.intel.com \
--cc=boqun.feng@gmail.com \
--cc=davem@davemloft.net \
--cc=edumazet@google.com \
--cc=kuba@kernel.org \
--cc=linux-kernel@vger.kernel.org \
--cc=mark.rutland@arm.com \
--cc=maz@kernel.org \
--cc=netdev@vger.kernel.org \
--cc=pabeni@redhat.com \
--cc=peterz@infradead.org \
--cc=torvalds@linuxfoundation.org \
--cc=wangyang.guo@intel.com \
--cc=will@kernel.org \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox