Netdev List
 help / color / mirror / Atom feed
From: Pablo Neira Ayuso <pablo@netfilter.org>
To: netfilter-devel@vger.kernel.org
Cc: davem@davemloft.net, netdev@vger.kernel.org, kuba@kernel.org,
	pabeni@redhat.com, edumazet@google.com, fw@strlen.de,
	horms@kernel.org
Subject: [PATCH net-next 01/15] ipvs: add conn_max sysctl to limit connections
Date: Sun,  7 Jun 2026 11:49:40 +0200	[thread overview]
Message-ID: <20260607094954.48892-2-pablo@netfilter.org> (raw)
In-Reply-To: <20260607094954.48892-1-pablo@netfilter.org>

From: Julian Anastasov <ja@ssi.bg>

Currently, we are using atomic_t to track the number of
connections. On 64-bit setups with large memory there is
a risk of this counter overflowing. Also, setups with many
containers may need to tune the limit for connections.

Add sysctl control to limit the number of connections to
1,073,741,824 (64-bit) and 16,777,216 (32-bit).
Depending on the admin's privilege, the value written
changes either only the soft limit or both the soft and
hard limits, allowing unprivileged admins to change the
soft limit within a range determined by privileged admins.

Link: https://sashiko.dev/#/patchset/20260523172715.94795-1-ja%40ssi.bg
Link: https://sashiko.dev/#/patchset/20260430074420.26697-7-ja%40ssi.bg
Link: https://sashiko.dev/#/patchset/20260522105546.13732-1-ja%40ssi.bg
Signed-off-by: Julian Anastasov <ja@ssi.bg>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
---
 Documentation/networking/ipvs-sysctl.rst | 35 ++++++++++++++++
 include/net/ip_vs.h                      | 22 ++++++++++
 net/netfilter/ipvs/ip_vs_conn.c          | 10 ++++-
 net/netfilter/ipvs/ip_vs_ctl.c           | 53 ++++++++++++++++++++++++
 4 files changed, 119 insertions(+), 1 deletion(-)

diff --git a/Documentation/networking/ipvs-sysctl.rst b/Documentation/networking/ipvs-sysctl.rst
index a556439f8be7..b6bac2612420 100644
--- a/Documentation/networking/ipvs-sysctl.rst
+++ b/Documentation/networking/ipvs-sysctl.rst
@@ -56,6 +56,41 @@ conn_lfactor - INTEGER
 	-4: grow if load goes above 6% (buckets = nodes * 16)
 	2: grow if load goes above 400% (buckets = nodes / 4)
 
+conn_max - INTEGER
+	Limit for number of connections, per netns.
+
+	Controls the soft and hard limit for number of connections.
+	Initially, the platform specific limit is assigned for init_net.
+	The value can be changed later; the init_net soft limit is then
+	inherited by newly created networking namespaces.
+
+	A privileged admin can change both limits up to the value of the
+	platform limit, while an unprivileged admin can change only the
+	soft limit, up to the value of the hard limit.
+
+	For setups using conntrack=1 (CONFIG_IP_VS_NFCT for
+	Netfilter connection tracking), the connections can
+	also be limited by nf_conntrack_max.
+
+				soft limit	hard limit
+	=====================================================
+	init_net:
+	create netns		platform	platform
+	priv admin		0 .. platform	0 .. platform
+	=====================================================
+	new netns:
+	create netns		init_net:soft	init_net:soft
+	priv admin		0 .. platform	0 .. platform
+	unpriv admin		0 .. hard	N/A
+
+	Limits per platform:
+	1,073,741,824 (2^30 for 64-bit)
+	   16,777,216 (2^24 for 32-bit)
+
+	Possible values: 0 .. platform limit
+
+	Default: platform limit
+
 conn_reuse_mode - INTEGER
 	1 - default
 
diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h
index e517eaaa177b..49297fec448a 100644
--- a/include/net/ip_vs.h
+++ b/include/net/ip_vs.h
@@ -44,6 +44,14 @@
 #define IP_VS_CONN_TAB_MAX_BITS	20
 #endif
 
+/* conn_max limits */
+#if BITS_PER_LONG > 32
+/* Limit of atomic_t but restricted by roundup_pow_of_two() in ip_vs_core.c */
+#define IP_VS_CONN_MAX	(1 << 30)
+#else
+#define IP_VS_CONN_MAX	(1 << 24)
+#endif
+
 /* svc_table limits */
 #define IP_VS_SVC_TAB_MIN_BITS	4
 #define IP_VS_SVC_TAB_MAX_BITS	20
@@ -1220,6 +1228,10 @@ struct netns_ipvs {
 	/* sysctl variables */
 	int			sysctl_amemthresh;
 	int			sysctl_am_droprate;
+#ifdef CONFIG_SYSCTL
+	int			sysctl_conn_max;/* soft limit for conns */
+	int			conn_max_limit;	/* hard limit for conn_max */
+#endif
 	int			sysctl_drop_entry;
 	int			sysctl_drop_packet;
 	int			sysctl_secure_tcp;
@@ -1317,6 +1329,11 @@ struct netns_ipvs {
 
 #ifdef CONFIG_SYSCTL
 
+static inline int sysctl_conn_max(struct netns_ipvs *ipvs)
+{
+	return READ_ONCE(ipvs->sysctl_conn_max);
+}
+
 static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
 {
 	return ipvs->sysctl_sync_threshold[0];
@@ -1436,6 +1453,11 @@ static inline int sysctl_est_nice(struct netns_ipvs *ipvs)
 
 #else
 
+static inline int sysctl_conn_max(struct netns_ipvs *ipvs)
+{
+	return IP_VS_CONN_MAX;
+}
+
 static inline int sysctl_sync_threshold(struct netns_ipvs *ipvs)
 {
 	return DEFAULT_SYNC_THRESHOLD;
diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c
index 9ea6b4fa78bf..e76a73d183d5 100644
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1358,9 +1358,18 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
 	struct netns_ipvs *ipvs = p->ipvs;
 	struct ip_vs_proto_data *pd = ip_vs_proto_data_get(p->ipvs,
 							   p->protocol);
+	/* Increment conn_count up to conn_max */
+	int count = atomic_read(&ipvs->conn_count);
+	int max = sysctl_conn_max(ipvs);
+
+	do {
+		if (count >= max)
+			return NULL;
+	} while (!atomic_try_cmpxchg(&ipvs->conn_count, &count, count + 1));
 
 	cp = kmem_cache_alloc(ip_vs_conn_cachep, GFP_ATOMIC);
 	if (cp == NULL) {
+		atomic_dec(&ipvs->conn_count);
 		IP_VS_ERR_RL("%s(): no memory\n", __func__);
 		return NULL;
 	}
@@ -1414,7 +1423,6 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, int dest_af,
 	cp->in_seq.delta = 0;
 	cp->out_seq.delta = 0;
 
-	atomic_inc(&ipvs->conn_count);
 	if (unlikely(flags & IP_VS_CONN_F_NO_CPORT)) {
 		int af_id = ip_vs_af_index(cp->af);
 
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index 16daba8cac83..f765d1506839 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -2322,6 +2322,45 @@ static int ip_vs_zero_all(struct netns_ipvs *ipvs)
 
 #ifdef CONFIG_SYSCTL
 
+static int
+proc_do_conn_max(const struct ctl_table *table, int write,
+		 void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int *valp = table->data;
+	/* We can not use *valp to check if new value is provided, use INT_MIN
+	 * for this because different admins change different limits.
+	 */
+	int unset = INT_MIN;
+	int val = write ? unset : READ_ONCE(*valp);
+	int rc;
+
+	const struct ctl_table tmp = {
+		.data = &val,
+		.maxlen = sizeof(int),
+	};
+
+	rc = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+	if (write && !rc && val != unset) {
+		struct netns_ipvs *ipvs = table->extra2;
+		bool priv = capable(CAP_NET_ADMIN);
+		int max;
+
+		mutex_lock(&ipvs->service_mutex);
+		/* Unprivileged admins can not go above the hard limit */
+		max = priv ? IP_VS_CONN_MAX : ipvs->conn_max_limit;
+		if (val < 0 || val > max) {
+			rc = -EINVAL;
+		} else {
+			/* Privileged admin changes both limits */
+			if (priv)
+				ipvs->conn_max_limit = val;
+			WRITE_ONCE(*valp, val);
+		}
+		mutex_unlock(&ipvs->service_mutex);
+	}
+	return rc;
+}
+
 static int
 proc_do_defense_mode(const struct ctl_table *table, int write,
 		     void *buffer, size_t *lenp, loff_t *ppos)
@@ -2626,6 +2665,12 @@ static struct ctl_table vs_vars[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "conn_max",
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_do_conn_max,
+	},
 	{
 		.procname	= "drop_entry",
 		.maxlen		= sizeof(int),
@@ -4980,6 +5025,14 @@ static int __net_init ip_vs_control_net_init_sysctl(struct netns_ipvs *ipvs)
 	tbl[idx++].data = &ipvs->sysctl_amemthresh;
 	ipvs->sysctl_am_droprate = 10;
 	tbl[idx++].data = &ipvs->sysctl_am_droprate;
+
+	/* Inherit both limits from init_net:conn_max */
+	ipvs->conn_max_limit = net_eq(net, &init_net) ? IP_VS_CONN_MAX :
+			       READ_ONCE(*(int *)vs_vars[idx].data);
+	ipvs->sysctl_conn_max = ipvs->conn_max_limit;
+	tbl[idx].extra2 = ipvs;
+	tbl[idx++].data = &ipvs->sysctl_conn_max;
+
 	tbl[idx++].data = &ipvs->sysctl_drop_entry;
 	tbl[idx++].data = &ipvs->sysctl_drop_packet;
 #ifdef CONFIG_IP_VS_NFCT
-- 
2.47.3


  reply	other threads:[~2026-06-07  9:50 UTC|newest]

Thread overview: 16+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-07  9:49 [PATCH net-next 00/15] Netfilter/IPVS updates for net-next Pablo Neira Ayuso
2026-06-07  9:49 ` Pablo Neira Ayuso [this message]
2026-06-07  9:49 ` [PATCH net-next 02/15] netfilter: nfnetlink_osf: fix mss parsing on big-endian architectures Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 03/15] netfilter: nfnetlink_cthelper: use {READ,WRITE}_ONCE for accessing helper flags Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 04/15] netfilter: synproxy: drop packets if timestamp adjustment fails Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 05/15] netfilter: synproxy: adjust duplicate timestamp options Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 06/15] netfilter: synproxy: fix unaligned memory access in timestamp adjustment Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 07/15] netfilter: synproxy: protect nf_ct_seqadj_init() with conntrack lock Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 08/15] netfilter: cttimeout: detach dataplane timeout policy and repurpose refcount Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 09/15] netfilter: nf_conntrack_helper: dynamically allocate struct nf_conntrack_helper Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 10/15] netfilter: nf_conntrack_pptp: move GRE specific cleanup to GRE tracker Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 11/15] netfilter: nf_conntrack_helper: add refcounting from datapath Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 12/15] netfilter: conntrack: revert ct extension genid infrastructure Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 13/15] netfilter: conntrack: call nf_ct_gre_keymap_destroy() if master helper is pptp Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 14/15] netfilter: flowtable: avoid num_encaps underflow on bridge VLAN untag Pablo Neira Ayuso
2026-06-07  9:49 ` [PATCH net-next 15/15] netfilter: nf_conntrack: use get_unaligned_be32() in tcp_sack() Pablo Neira Ayuso

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260607094954.48892-2-pablo@netfilter.org \
    --to=pablo@netfilter.org \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=fw@strlen.de \
    --cc=horms@kernel.org \
    --cc=kuba@kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=pabeni@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox