From: Julian Anastasov <ja@ssi.bg>
To: Simon Horman <horms@verge.net.au>
Cc: lvs-devel@vger.kernel.org, netfilter-devel@vger.kernel.org,
Dust Li <dust.li@linux.alibaba.com>,
Jiejian Wu <jiejian@linux.alibaba.com>,
rcu@vger.kernel.org
Subject: [PATCHv6 net-next 05/14] ipvs: do not keep dest_dst after dest is removed
Date: Sun, 19 Oct 2025 18:57:02 +0300
Message-ID: <20251019155711.67609-6-ja@ssi.bg>
In-Reply-To: <20251019155711.67609-1-ja@ssi.bg>
Until now, dest->dest_dst was not released when a server was moved
into the dest_trash list after removal. As a result, we could keep
dst/dev references for a long time without actively using them.

It is better to avoid walking the dest_trash list when
ip_vs_dst_event() receives dev events, so make sure we do not hold
dev references in the dest_trash list. As packets can be in flight
while a server is being removed, check the IP_VS_DEST_F_AVAILABLE
flag in the slow path to ensure we do not save new dev references
to removed servers.
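
To make the interaction explicit, below is a minimal userspace sketch
of just this locking pattern (stripped-down types, a pthread mutex
standing in for the dst_lock spinlock, dst refcounting and the noref
handling elided). It is an illustration of the idea, not the kernel
code:

    /* Minimal userspace model of the pattern described above;
     * pthread mutex stands in for dest->dst_lock, structs are
     * stripped down.  Not the actual kernel code.
     */
    #include <pthread.h>
    #include <stdlib.h>

    #define F_AVAILABLE 0x1

    struct dest {
        unsigned int flags;
        void *dest_dst;          /* cached route, if any */
        pthread_mutex_t dst_lock;
    };

    /* Removal path: clear the flag, then drop the cached route
     * under dst_lock, so the trashed server holds no dst/dev refs.
     */
    static void unlink_dest(struct dest *d)
    {
        d->flags &= ~F_AVAILABLE;
        pthread_mutex_lock(&d->dst_lock);
        free(d->dest_dst);       /* __ip_vs_dst_cache_reset() */
        d->dest_dst = NULL;
        pthread_mutex_unlock(&d->dst_lock);
    }

    /* Packet slow path: attach the new cache entry only while the
     * server is still available.  Returns 1 if attached, 0 if the
     * caller must free new_dst after using it for this packet.
     */
    static int slow_path_attach(struct dest *d, void *new_dst)
    {
        int attached = 0;

        pthread_mutex_lock(&d->dst_lock);
        if (d->flags & F_AVAILABLE) {
            d->dest_dst = new_dst;   /* __ip_vs_dst_set() */
            attached = 1;
        }
        pthread_mutex_unlock(&d->dst_lock);
        return attached;
    }

    int main(void)
    {
        struct dest d = { .flags = F_AVAILABLE,
                          .dst_lock = PTHREAD_MUTEX_INITIALIZER };
        void *dst = malloc(16);

        if (!slow_path_attach(&d, dst))
            free(dst);           /* one-shot use, then freed */
        unlink_dest(&d);         /* frees attached entry, if any */
        return 0;
    }

The second check matters because the flag can be observed clear only
for a brief window, but taking dst_lock orders the check against the
final cache reset, so nothing can be attached after the server is
unlinked.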
Signed-off-by: Julian Anastasov <ja@ssi.bg>
---
net/netfilter/ipvs/ip_vs_ctl.c | 20 +++++++----------
net/netfilter/ipvs/ip_vs_xmit.c | 39 ++++++++++++++++++++++++---------
2 files changed, 37 insertions(+), 22 deletions(-)
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 6c04920f9c87..db9f3565051c 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -810,7 +810,6 @@ static void ip_vs_dest_free(struct ip_vs_dest *dest)
{
struct ip_vs_service *svc = rcu_dereference_protected(dest->svc, 1);
- __ip_vs_dst_cache_reset(dest);
__ip_vs_svc_put(svc);
call_rcu(&dest->rcu_head, ip_vs_dest_rcu_free);
}
@@ -1013,10 +1012,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
dest->af = udest->af;
- spin_lock_bh(&dest->dst_lock);
- __ip_vs_dst_cache_reset(dest);
- spin_unlock_bh(&dest->dst_lock);
-
if (add) {
list_add_rcu(&dest->n_list, &svc->destinations);
svc->num_dests++;
@@ -1024,6 +1019,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest,
if (sched && sched->add_dest)
sched->add_dest(svc, dest);
} else {
+ spin_lock_bh(&dest->dst_lock);
+ __ip_vs_dst_cache_reset(dest);
+ spin_unlock_bh(&dest->dst_lock);
+
sched = rcu_dereference_protected(svc->scheduler, 1);
if (sched && sched->upd_dest)
sched->upd_dest(svc, dest);
@@ -1258,6 +1257,10 @@ static void __ip_vs_unlink_dest(struct ip_vs_service *svc,
{
dest->flags &= ~IP_VS_DEST_F_AVAILABLE;
+ spin_lock_bh(&dest->dst_lock);
+ __ip_vs_dst_cache_reset(dest);
+ spin_unlock_bh(&dest->dst_lock);
+
/*
* Remove it from the d-linked destination list.
*/
@@ -1748,13 +1751,6 @@ static int ip_vs_dst_event(struct notifier_block *this, unsigned long event,
}
rcu_read_unlock();
- mutex_lock(&ipvs->service_mutex);
- spin_lock_bh(&ipvs->dest_trash_lock);
- list_for_each_entry(dest, &ipvs->dest_trash, t_list) {
- ip_vs_forget_dev(dest, dev);
- }
- spin_unlock_bh(&ipvs->dest_trash_lock);
- mutex_unlock(&ipvs->service_mutex);
return NOTIFY_DONE;
}
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 95af252b2939..d57af95f1ebd 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -310,9 +310,11 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
if (dest) {
dest_dst = __ip_vs_dst_check(dest);
- if (likely(dest_dst))
+ if (likely(dest_dst)) {
rt = dst_rtable(dest_dst->dst_cache);
- else {
+ if (ret_saddr)
+ *ret_saddr = dest_dst->dst_saddr.ip;
+ } else {
dest_dst = ip_vs_dest_dst_alloc();
spin_lock_bh(&dest->dst_lock);
if (!dest_dst) {
@@ -328,14 +330,24 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
ip_vs_dest_dst_free(dest_dst);
goto err_unreach;
}
- __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
+ /* It is forbidden to attach dest->dest_dst if the
+ * server is deleted. The flag can be seen clear only
+ * for a very short period, so it must be checked
+ * under dst_lock.
+ */
+ if (dest->flags & IP_VS_DEST_F_AVAILABLE)
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, 0);
+ else
+ noref = 0;
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI4, src %pI4, refcnt=%d\n",
&dest->addr.ip, &dest_dst->dst_saddr.ip,
rcuref_read(&rt->dst.__rcuref));
+ if (ret_saddr)
+ *ret_saddr = dest_dst->dst_saddr.ip;
+ if (!noref)
+ ip_vs_dest_dst_free(dest_dst);
}
- if (ret_saddr)
- *ret_saddr = dest_dst->dst_saddr.ip;
} else {
noref = 0;
@@ -469,9 +481,11 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
if (dest) {
dest_dst = __ip_vs_dst_check(dest);
- if (likely(dest_dst))
+ if (likely(dest_dst)) {
rt = dst_rt6_info(dest_dst->dst_cache);
- else {
+ if (ret_saddr)
+ *ret_saddr = dest_dst->dst_saddr.in6;
+ } else {
u32 cookie;
dest_dst = ip_vs_dest_dst_alloc();
@@ -492,14 +506,19 @@ __ip_vs_get_out_rt_v6(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb,
}
rt = dst_rt6_info(dst);
cookie = rt6_get_cookie(rt);
- __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
+ if (dest->flags & IP_VS_DEST_F_AVAILABLE)
+ __ip_vs_dst_set(dest, dest_dst, &rt->dst, cookie);
+ else
+ noref = 0;
spin_unlock_bh(&dest->dst_lock);
IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n",
&dest->addr.in6, &dest_dst->dst_saddr.in6,
rcuref_read(&rt->dst.__rcuref));
+ if (ret_saddr)
+ *ret_saddr = dest_dst->dst_saddr.in6;
+ if (!noref)
+ ip_vs_dest_dst_free(dest_dst);
}
- if (ret_saddr)
- *ret_saddr = dest_dst->dst_saddr.in6;
} else {
noref = 0;
dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm,
--
2.51.0