linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
@ 2025-04-07  9:50 lvxiafei
  2025-04-07 10:13 ` Florian Westphal
                   ` (5 more replies)
  0 siblings, 6 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-07  9:50 UTC (permalink / raw)
  To: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Pablo Neira Ayuso, Jozsef Kadlecsik
  Cc: lvxiafei, netdev, linux-kernel, netfilter-devel, coreteam

From: lvxiafei <lvxiafei@sensetime.com>

The modification of nf_conntrack_max in one netns
should not affect the value in another one.

Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
 include/net/netns/conntrack.h           |  1 +
 net/netfilter/nf_conntrack_core.c       | 12 +++++++-----
 net/netfilter/nf_conntrack_standalone.c |  6 ++++--
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..dd31ba205419 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
 	u8			sysctl_acct;
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
+	u8			sysctl_max;
 
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..5f0dbd358d66 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1498,7 +1498,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned int i, hashsz, nf_conntrack_max95 = 0;
+	unsigned int i, hashsz;
 	u32 end_time, start_time = nfct_time_stamp;
 	struct conntrack_gc_work *gc_work;
 	unsigned int expired_count = 0;
@@ -1509,8 +1509,6 @@ static void gc_worker(struct work_struct *work)
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
-	if (gc_work->early_drop)
-		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
 	if (i == 0) {
 		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1536,7 @@ static void gc_worker(struct work_struct *work)
 		}
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+			unsigned int nf_conntrack_max95 = 0;
 			struct nf_conntrack_net *cnet;
 			struct net *net;
 			long expires;
@@ -1567,11 +1566,14 @@ static void gc_worker(struct work_struct *work)
 			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
 			expires = (expires - (long)next_run) / ++count;
 			next_run += expires;
+			net = nf_ct_net(tmp);
+
+			if (gc_work->early_drop)
+				nf_conntrack_max95 = net->ct.sysctl_max / 100u * 95u;
 
 			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
 				continue;
 
-			net = nf_ct_net(tmp);
 			cnet = nf_ct_pernet(net);
 			if (atomic_read(&cnet->count) < nf_conntrack_max95)
 				continue;
@@ -1654,7 +1656,7 @@ __nf_conntrack_alloc(struct net *net,
 	/* We don't want any race condition at early drop stage */
 	ct_count = atomic_inc_return(&cnet->count);
 
-	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+	if (net->ct.sysctl_max && unlikely(ct_count > net->ct.sysctl_max)) {
 		if (!early_drop(net, hash)) {
 			if (!conntrack_gc_work.early_drop)
 				conntrack_gc_work.early_drop = true;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..5ac1893e9045 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 static struct ctl_table nf_ct_netfilter_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
 	table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+	table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
 	table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
 	table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1139,6 +1140,7 @@ static int nf_conntrack_pernet_init(struct net *net)
 	int ret;
 
 	net->ct.sysctl_checksum = 1;
+	net->ct.sysctl_max = nf_conntrack_max;
 
 	ret = nf_conntrack_standalone_init_sysctl(net);
 	if (ret < 0)
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-07  9:50 [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl lvxiafei
@ 2025-04-07 10:13 ` Florian Westphal
  2025-04-07 10:56   ` Jan Engelhardt
  2025-04-08  8:17   ` lvxiafei
  2025-04-08  9:03 ` [PATCH V2] " lvxiafei
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 35+ messages in thread
From: Florian Westphal @ 2025-04-07 10:13 UTC (permalink / raw)
  To: lvxiafei
  Cc: David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Pablo Neira Ayuso, Jozsef Kadlecsik, lvxiafei,
	netdev, linux-kernel, netfilter-devel, coreteam

lvxiafei <xiafei_xupt@163.com> wrote:
> The modification of nf_conntrack_max in one netns
> should not affect the value in another one.

nf_conntrack_max can only be changed in init_net.

Given the check isn't removed:
   /* Don't allow non-init_net ns to alter global sysctls */
   if (!net_eq(&init_net, net)) {
       table[NF_SYSCTL_CT_MAX].mode = 0444;

... this patch seems untested?

But, removing this check would allow any netns to consume
arbitrary amount of kernel memory.

How do you prevent this?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-07 10:13 ` Florian Westphal
@ 2025-04-07 10:56   ` Jan Engelhardt
  2025-04-08  8:27     ` lvxiafei
  2025-04-08  8:38     ` lvxiafei
  2025-04-08  8:17   ` lvxiafei
  1 sibling, 2 replies; 35+ messages in thread
From: Jan Engelhardt @ 2025-04-07 10:56 UTC (permalink / raw)
  To: Florian Westphal
  Cc: lvxiafei, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Simon Horman, Pablo Neira Ayuso, Jozsef Kadlecsik,
	lvxiafei, netdev, linux-kernel, netfilter-devel, coreteam


On Monday 2025-04-07 12:13, Florian Westphal wrote:
>lvxiafei <xiafei_xupt@163.com> wrote:
>> The modification of nf_conntrack_max in one netns
>> should not affect the value in another one.
>
>nf_conntrack_max can only be changed in init_net.
>
>Given the check isn't removed:
>   /* Don't allow non-init_net ns to alter global sysctls */
>   if (!net_eq(&init_net, net)) {
>       table[NF_SYSCTL_CT_MAX].mode = 0444;
>
>... this patch seems untested?
>
>But, removing this check would allow any netns to consume
>arbitrary amount of kernel memory.
>
>How do you prevent this?

By inheriting an implicit limit from the parent namespace somehow.
For example, even if you set the kernel.pid_max sysctl in the initial
namespace to something like 9999, subordinate namespace have
kernel.pid_max=4million again, but nevertheless are unable to use
more than 9999 PIDs. Or so documentation the documentation
from commit d385c8bceb14665e935419334aa3d3fac2f10456 tells me
(I did not try to create so many processes by myself).

A similar logic would have to be applied for netfilter sysctls
if they are made modifiable in subordinate namespaces.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-07 10:13 ` Florian Westphal
  2025-04-07 10:56   ` Jan Engelhardt
@ 2025-04-08  8:17   ` lvxiafei
  1 sibling, 0 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-08  8:17 UTC (permalink / raw)
  To: fw
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo, xiafei_xupt

On Monday 2025-04-07 12:13, Florian Westphal wrote:
>lvxiafei <xiafei_xupt@163.com> wrote:
>> The modification of nf_conntrack_max in one netns
>> should not affect the value in another one.
>
>nf_conntrack_max can only be changed in init_net.
>
>Given the check isn't removed:
>   /* Don't allow non-init_net ns to alter global sysctls */
>   if (!net_eq(&init_net, net)) {
>       table[NF_SYSCTL_CT_MAX].mode = 0444;
>
>... this patch seems untested?
>
>But, removing this check would allow any netns to consume
>arbitrary amount of kernel memory.
>
>How do you prevent this?

Yes, this check needs to be deleted
All netns share the original nf_conntrack_max. The nf_conntrack_max
limit does not limit the total ct_count of all netns. When it is changed
to a netns-level parameter, the default value is the same as the original
default value (=max_factor*nf_conntrack_htable_size), which is a global
(ancestral) limit, and kernel memory consumption is not affected.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-07 10:56   ` Jan Engelhardt
@ 2025-04-08  8:27     ` lvxiafei
  2025-04-08  8:38     ` lvxiafei
  1 sibling, 0 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-08  8:27 UTC (permalink / raw)
  To: ej
  Cc: coreteam, davem, edumazet, fw, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo, xiafei_xupt

On Mon, 7 Apr 2025 12:56:33
Jan Engelhardt <ej@inai.de> wrote:
> By inheriting an implicit limit from the parent namespace somehow.
> For example, even if you set the kernel.pid_max sysctl in the initial
> namespace to something like 9999, subordinate namespace have
> kernel.pid_max=4million again, but nevertheless are unable to use
> more than 9999 PIDs. Or so documentation the documentation
> from commit d385c8bceb14665e935419334aa3d3fac2f10456 tells me
> (I did not try to create so many processes by myself).
>
> A similar logic would have to be applied for netfilter sysctls
> if they are made modifiable in subordinate namespaces.

The patch is to use nf_conntrack_max to more flexibly limit the
ct_count in different netns, which may be greater than the parent
namespace, belonging to the global (ancestral) limit, and there
is no implicit limit inherited from the parent namespace


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-07 10:56   ` Jan Engelhardt
  2025-04-08  8:27     ` lvxiafei
@ 2025-04-08  8:38     ` lvxiafei
  1 sibling, 0 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-08  8:38 UTC (permalink / raw)
  To: ej
  Cc: coreteam, davem, edumazet, fw, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo, xiafei_xupt

On Mon, 7 Apr 2025 12:56:33 Jan Engelhardt <ej@inai.de> wrote:
> By inheriting an implicit limit from the parent namespace somehow.
> For example, even if you set the kernel.pid_max sysctl in the initial
> namespace to something like 9999, subordinate namespace have
> kernel.pid_max=4million again, but nevertheless are unable to use
> more than 9999 PIDs. Or so documentation the documentation
> from commit d385c8bceb14665e935419334aa3d3fac2f10456 tells me
> (I did not try to create so many processes by myself).
>
> A similar logic would have to be applied for netfilter sysctls
> if they are made modifiable in subordinate namespaces.

The patch is to use nf_conntrack_max to more flexibly limit the
ct_count in different netns, which may be greater than the parent
namespace, belonging to the global (ancestral) limit, and there
is no implicit limit inherited from the parent namespace


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH V2] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-07  9:50 [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl lvxiafei
  2025-04-07 10:13 ` Florian Westphal
@ 2025-04-08  9:03 ` lvxiafei
  2025-04-08  9:58   ` Florian Westphal
  2025-04-09  4:25 ` [PATCH V3] " lvxiafei
                   ` (3 subsequent siblings)
  5 siblings, 1 reply; 35+ messages in thread
From: lvxiafei @ 2025-04-08  9:03 UTC (permalink / raw)
  To: xiafei_xupt
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

From: lvxiafei <lvxiafei@sensetime.com>

Support nf_conntrack_max settings in different netns,
nf_conntrack_max is used to more flexibly limit the
ct_count in different netns, which may be greater than
the value in the parent namespace. The default value
belongs to the global (ancestral) limit and no implicit
limit is inherited from the parent namespace.

Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
 include/net/netns/conntrack.h           |  1 +
 net/netfilter/nf_conntrack_core.c       | 12 +++++++-----
 net/netfilter/nf_conntrack_standalone.c |  7 ++++---
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..dd31ba205419 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
 	u8			sysctl_acct;
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
+	u8			sysctl_max;
 
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..5f0dbd358d66 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1498,7 +1498,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned int i, hashsz, nf_conntrack_max95 = 0;
+	unsigned int i, hashsz;
 	u32 end_time, start_time = nfct_time_stamp;
 	struct conntrack_gc_work *gc_work;
 	unsigned int expired_count = 0;
@@ -1509,8 +1509,6 @@ static void gc_worker(struct work_struct *work)
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
-	if (gc_work->early_drop)
-		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
 	if (i == 0) {
 		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1536,7 @@ static void gc_worker(struct work_struct *work)
 		}
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+			unsigned int nf_conntrack_max95 = 0;
 			struct nf_conntrack_net *cnet;
 			struct net *net;
 			long expires;
@@ -1567,11 +1566,14 @@ static void gc_worker(struct work_struct *work)
 			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
 			expires = (expires - (long)next_run) / ++count;
 			next_run += expires;
+			net = nf_ct_net(tmp);
+
+			if (gc_work->early_drop)
+				nf_conntrack_max95 = net->ct.sysctl_max / 100u * 95u;
 
 			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
 				continue;
 
-			net = nf_ct_net(tmp);
 			cnet = nf_ct_pernet(net);
 			if (atomic_read(&cnet->count) < nf_conntrack_max95)
 				continue;
@@ -1654,7 +1656,7 @@ __nf_conntrack_alloc(struct net *net,
 	/* We don't want any race condition at early drop stage */
 	ct_count = atomic_inc_return(&cnet->count);
 
-	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+	if (net->ct.sysctl_max && unlikely(ct_count > net->ct.sysctl_max)) {
 		if (!early_drop(net, hash)) {
 			if (!conntrack_gc_work.early_drop)
 				conntrack_gc_work.early_drop = true;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..77c9c01c7278 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 static struct ctl_table nf_ct_netfilter_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
 	table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+	table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
 	table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
 	table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	/* Don't allow non-init_net ns to alter global sysctls */
 	if (!net_eq(&init_net, net)) {
-		table[NF_SYSCTL_CT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
 	}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
 	int ret;
 
 	net->ct.sysctl_checksum = 1;
+	net->ct.sysctl_max = nf_conntrack_max;
 
 	ret = nf_conntrack_standalone_init_sysctl(net);
 	if (ret < 0)
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH V2] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-08  9:03 ` [PATCH V2] " lvxiafei
@ 2025-04-08  9:58   ` Florian Westphal
  2025-04-08 12:39     ` lvxiafei
  0 siblings, 1 reply; 35+ messages in thread
From: Florian Westphal @ 2025-04-08  9:58 UTC (permalink / raw)
  To: lvxiafei
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

lvxiafei <xiafei_xupt@163.com> wrote:
> From: lvxiafei <lvxiafei@sensetime.com>
> 
> Support nf_conntrack_max settings in different netns,
> nf_conntrack_max is used to more flexibly limit the
> ct_count in different netns, which may be greater than
> the value in the parent namespace. The default value
> belongs to the global (ancestral) limit and no implicit
> limit is inherited from the parent namespace.

That seems the wrong thing to do.
There must be some way to limit the netns conntrack usage.

Whats the actual intent here?

You could apply max = min(init_net->max, net->max)
Or, you could relax it as long as netns are owned
by initial user ns, I guess.

Or perhaps its possible to make a guesstimate of
the maximum memory needed by the new limit, then
account that to memcg (at sysctl change time), and
reject if memcg is exhausted.

No other ideas at the moment, but I do not like the
"no limits" approach.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V2] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-08  9:58   ` Florian Westphal
@ 2025-04-08 12:39     ` lvxiafei
  2025-04-08 13:28       ` Florian Westphal
  0 siblings, 1 reply; 35+ messages in thread
From: lvxiafei @ 2025-04-08 12:39 UTC (permalink / raw)
  To: fw
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo, xiafei_xupt


On Tue, 8 Apr 2025 11:58:54 Florian Westphal <fw@strlen.de> wrote:
> That seems the wrong thing to do.
> There must be some way to limit the netns conntrack usage.
>
> Whats the actual intent here?
>
> You could apply max = min(init_net->max, net->max)
> Or, you could relax it as long as netns are owned
> by initial user ns, I guess.
>
> Or perhaps its possible to make a guesstimate of
> the maximum memory needed by the new limit, then
> account that to memcg (at sysctl change time), and
> reject if memcg is exhausted.
>
> No other ideas at the moment, but I do not like the
> "no limits" approach.

The original nf_conntrack_max is a global variable.
Modification will affect the connection tracking
limit in other netns, and the maximum memory
consumption = number of netns * nf_conntrack_max

This modification can make nf_conntrack_max support
the netns level to set the size of the connection
tracking table, and more flexibly limit the connection
tracking of each netns. For example, the initial user ns
has a default value (=max_factor*nf_conntrack_htable_size).
The nf_conntrack_max when netns 1 and netns 2 are created
is the same as the nf_conntrack_max in the initial user ns.
You can set it to netns 1 1k and netns 2 2k without
affecting each other.

If you are worried that different netns may exceed the
initial user limit and memory limit when setting,
apply max = min(init_net->max, net->max), the value in
netns is not greater than init_net->max, and the new
maximum memory consumption <= the original maximum memory
consumption, which limits memory consumption to a certain
extent. However, this will bring several problems:

1. Do not allow nf_conntrack_max in other netns to be greater
than nf_conntrack_max of the initial user. For example, when
other netns carry north-south traffic, the actual number of
connection tracking is greater than that of the initial user.

2. If nf_conntrack_max of the initial user is increased, the
maximum memory consumption will inevitably increase by n copies

3. If nf_conntrack_max of the initial user is reduced, will
the existing connections in other netns be affected?


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V2] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-08 12:39     ` lvxiafei
@ 2025-04-08 13:28       ` Florian Westphal
  2025-04-09  4:14         ` lvxiafei
  0 siblings, 1 reply; 35+ messages in thread
From: Florian Westphal @ 2025-04-08 13:28 UTC (permalink / raw)
  To: lvxiafei
  Cc: fw, coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

lvxiafei <xiafei_xupt@163.com> wrote:
> This modification can make nf_conntrack_max support
> the netns level to set the size of the connection
> tracking table, and more flexibly limit the connection
> tracking of each netns. For example, the initial user ns
> has a default value (=max_factor*nf_conntrack_htable_size).
> The nf_conntrack_max when netns 1 and netns 2 are created
> is the same as the nf_conntrack_max in the initial user ns.
> You can set it to netns 1 1k and netns 2 2k without
> affecting each other.

Netns 2 can also set it to 2**31 and cause machine go OOM.

> If you are worried that different netns may exceed the
> initial user limit and memory limit when setting,
> apply max = min(init_net->max, net->max), the value in
> netns is not greater than init_net->max, and the new
> maximum memory consumption <= the original maximum memory
> consumption, which limits memory consumption to a certain
> extent.

That was one of the suggestions that I see how one could have
tunable pernet variable without allowing netns2 go haywire.

> However, this will bring several problems:
> 
> 1. Do not allow nf_conntrack_max in other netns to be greater
> than nf_conntrack_max of the initial user. For example, when
> other netns carry north-south traffic, the actual number of
> connection tracking is greater than that of the initial user.

Sure.

> 2. If nf_conntrack_max of the initial user is increased, the
> maximum memory consumption will inevitably increase by n copies

How is that different to current state of affairs?

> 3. If nf_conntrack_max of the initial user is reduced, will
> the existing connections in other netns be affected?

No, but new ones will be blocked until its below init_net limit again.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V2] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-08 13:28       ` Florian Westphal
@ 2025-04-09  4:14         ` lvxiafei
  0 siblings, 0 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-09  4:14 UTC (permalink / raw)
  To: fw
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo, xiafei_xupt

On 2025-04-08 13:28 Florian Westphal <fw@strlen.de> wrote:
> That was one of the suggestions that I see how one could have
> tunable pernet variable without allowing netns2 go haywire.

Yes, After net.netfilter.nf_conntrack_max is set in different
netns, it should be designed to not be allowed to be larger
than the global (ancestor) limit when working.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-07  9:50 [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl lvxiafei
  2025-04-07 10:13 ` Florian Westphal
  2025-04-08  9:03 ` [PATCH V2] " lvxiafei
@ 2025-04-09  4:25 ` lvxiafei
  2025-04-09  7:20   ` Florian Westphal
  2025-04-12 14:37 ` [PATCH V4] " lvxiafei
                   ` (2 subsequent siblings)
  5 siblings, 1 reply; 35+ messages in thread
From: lvxiafei @ 2025-04-09  4:25 UTC (permalink / raw)
  To: xiafei_xupt
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

From: lvxiafei <lvxiafei@sensetime.com>

Support net.netfilter.nf_conntrack_max settings in
different netns, net.netfilter.nf_conntrack_max is
used to more flexibly limit the ct_count in different
netns. The default value belongs to the global (ancestral)
limit and no implicit limit is inherited from the parent
namespace.

After net.netfilter.nf_conntrack_max is set in different
netns, it is not allowed to be greater than the global
(ancestral) limit net.nf_conntrack_max when working.

Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
 include/net/netns/conntrack.h           |  1 +
 net/netfilter/nf_conntrack_core.c       | 12 +++++++-----
 net/netfilter/nf_conntrack_standalone.c |  5 +++--
 3 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..dd31ba205419 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
 	u8			sysctl_acct;
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
+	u8			sysctl_max;
 
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..4116c2f2b57f 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1498,7 +1498,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned int i, hashsz, nf_conntrack_max95 = 0;
+	unsigned int i, hashsz;
 	u32 end_time, start_time = nfct_time_stamp;
 	struct conntrack_gc_work *gc_work;
 	unsigned int expired_count = 0;
@@ -1509,8 +1509,6 @@ static void gc_worker(struct work_struct *work)
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
-	if (gc_work->early_drop)
-		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
 	if (i == 0) {
 		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1536,7 @@ static void gc_worker(struct work_struct *work)
 		}
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+			unsigned int nf_conntrack_max95 = 0;
 			struct nf_conntrack_net *cnet;
 			struct net *net;
 			long expires;
@@ -1567,11 +1566,14 @@ static void gc_worker(struct work_struct *work)
 			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
 			expires = (expires - (long)next_run) / ++count;
 			next_run += expires;
+			net = nf_ct_net(tmp);
+
+			if (gc_work->early_drop)
+				nf_conntrack_max95 = min(nf_conntrack_max, net->ct.sysctl_max) / 100u * 95u;
 
 			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
 				continue;
 
-			net = nf_ct_net(tmp);
 			cnet = nf_ct_pernet(net);
 			if (atomic_read(&cnet->count) < nf_conntrack_max95)
 				continue;
@@ -1654,7 +1656,7 @@ __nf_conntrack_alloc(struct net *net,
 	/* We don't want any race condition at early drop stage */
 	ct_count = atomic_inc_return(&cnet->count);
 
-	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+	if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
 		if (!early_drop(net, hash)) {
 			if (!conntrack_gc_work.early_drop)
 				conntrack_gc_work.early_drop = true;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..4a073c4de1b7 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
 	table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+	table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
 	table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
 	table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	/* Don't allow non-init_net ns to alter global sysctls */
 	if (!net_eq(&init_net, net)) {
-		table[NF_SYSCTL_CT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
 	}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
 	int ret;
 
 	net->ct.sysctl_checksum = 1;
+	net->ct.sysctl_max = nf_conntrack_max;
 
 	ret = nf_conntrack_standalone_init_sysctl(net);
 	if (ret < 0)
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-09  4:25 ` [PATCH V3] " lvxiafei
@ 2025-04-09  7:20   ` Florian Westphal
  2025-04-09  9:13     ` lvxiafei
  0 siblings, 1 reply; 35+ messages in thread
From: Florian Westphal @ 2025-04-09  7:20 UTC (permalink / raw)
  To: lvxiafei
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

lvxiafei <xiafei_xupt@163.com> wrote:
> -	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> +	if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
>  		if (!early_drop(net, hash)) {
>  			if (!conntrack_gc_work.early_drop)
>  				conntrack_gc_work.early_drop = true;
> diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
> index 2f666751c7e7..4a073c4de1b7 100644
> --- a/net/netfilter/nf_conntrack_standalone.c
> +++ b/net/netfilter/nf_conntrack_standalone.c
> @@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
>  static struct ctl_table nf_ct_sysctl_table[] = {
>  	[NF_SYSCTL_CT_MAX] = {
>  		.procname	= "nf_conntrack_max",
> -		.data		= &nf_conntrack_max,
> +		.data		= &init_net.ct.sysctl_max,

Whats the function of nf_conntrack_max?
After this change its always 0?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-09  7:20   ` Florian Westphal
@ 2025-04-09  9:13     ` lvxiafei
  2025-04-09  9:42       ` Florian Westphal
  0 siblings, 1 reply; 35+ messages in thread
From: lvxiafei @ 2025-04-09  9:13 UTC (permalink / raw)
  To: fw
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo, xiafei_xupt

Florian Westphal <fw@strlen.de> wrote:
> Whats the function of nf_conntrack_max?
> After this change its always 0?

nf_conntrack_max is a global (ancestor) limit, by default
nf_conntrack_max = max_factor * nf_conntrack_htable_size.

init_net.ct.sysctl_max is a parameter for each netns, and
setting it will not affect the value of nf_conntrack_max.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-09  9:13     ` lvxiafei
@ 2025-04-09  9:42       ` Florian Westphal
  2025-04-10 10:02         ` lvxiafei
  2025-04-10 13:05         ` lvxiafei
  0 siblings, 2 replies; 35+ messages in thread
From: Florian Westphal @ 2025-04-09  9:42 UTC (permalink / raw)
  To: lvxiafei
  Cc: fw, coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

lvxiafei <xiafei_xupt@163.com> wrote:
> Florian Westphal <fw@strlen.de> wrote:
> > Whats the function of nf_conntrack_max?
> > After this change its always 0?
> 
> nf_conntrack_max is a global (ancestor) limit, by default
> nf_conntrack_max = max_factor * nf_conntrack_htable_size.

Argh.

net.netfilter.nf_conntrack_max
is replaced by init_net.nf_conntrack_max in your patch.

But not net.nf_conntrack_max, so they are now different and not
related at all anymore except that the latter overrides the former
even in init_net.

I'm not sure this is sane.  And it needs an update to
Documentation/networking/nf_conntrack-sysctl.rst

in any case.

Also:

-       if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+       if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {


... can't be right, this allows a 0 setting in the netns.
So, setting 0 in non-init-net must be disallowed.

I suggest to remove nf_conntrack_max as a global variable,
make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
so in the init_net both sysctls remain the same.

Then, change __nf_conntrack_alloc() to do:

unsigned int nf_conntrack_max = min(net->ct.sysctl_max, &init_net.ct.sysctl_max);

and leave the if-condition as is, i.e.:

if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { ...

It means:
each netns can pick an arbitrary value (but not 0, this ability needs to
be removed).

When a new conntrack is allocated, then:

If the limit in the init_net is lower than the netns, then
that limit applies, so it provides upper cap.

If the limit in the init_net is higher, the lower pernet limit
is applied.

If the init_net has 0 setting, no limit is applied.

This also needs an update to Documentation/networking/nf_conntrack-sysctl.rst
to explain the restrictions.

Or, alternative, try the other suggestion I made
(memcg charge at sysctl change time,
 https://lore.kernel.org/netfilter-devel/20250408095854.GB536@breakpoint.cc/).

Or come up with a better proposal.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-09  9:42       ` Florian Westphal
@ 2025-04-10 10:02         ` lvxiafei
  2025-04-10 10:53           ` Florian Westphal
  2025-04-10 13:05         ` lvxiafei
  1 sibling, 1 reply; 35+ messages in thread
From: lvxiafei @ 2025-04-10 10:02 UTC (permalink / raw)
  To: fw
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo, xiafei_xupt

Florian Westphal <fw@strlen.de> wrote:
> net.netfilter.nf_conntrack_max
> is replaced by init_net.nf_conntrack_max in your patch.
>
> But not net.nf_conntrack_max, so they are now different and not
> related at all anymore except that the latter overrides the former
> even in init_net.
>
> I'm not sure this is sane.  And it needs an update to
> Documentation/networking/nf_conntrack-sysctl.rst

Yes, it needs an update to
Documentation/networking/nf_conntrack-sysctl.rst.

in different netns,
net.netfilter.nf_conntrack_max = init_net.ct.sysctl_max;
the global (ancestral) limit,
net.nf_conntrack_max = nf_conntrack_max = max_factor * nf_conntrack_htable_size;

> in any case.
>
> Also:
>
> -       if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> +       if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
>
>
> ... can't be right, this allows a 0 setting in the netns.
> So, setting 0 in non-init-net must be disallowed.

Yes, setting 0 in non-init-net must be disallowed.

Should be used:
unsigned int net_ct_sysctl_max = max(min(nf_conntrack_max, net->ct.sysctl_max), 0);
if (nf_conntrack_max && unlikely(ct_count > net_ct_sysctl_max)) {

min(nf_conntrack_max, net->ct.sysctl_max) is the upper limit of ct_count
At the same time, when net->ct.sysctl_max == 0, the original intention is no limit,
but it can be limited by nf_conntrack_max in different netns.

> I suggest to remove nf_conntrack_max as a global variable,
> make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> so in the init_net both sysctls remain the same.
>
> Then, change __nf_conntrack_alloc() to do:
>
> unsigned int nf_conntrack_max = min(net->ct.sysctl_max, &init_net.ct.sysctl_max);
>
> and leave the if-condition as is, i.e.:
>
> if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) { ...

Yes, each netns can pick an arbitrary value (but not 0, this ability needs to
be removed).

Should be used:
unsigned int nf_conntrack_max = max(min(net->ct.sysctl_max, init_net.ct.sysctl_max, 0);

This also needs an update to Documentation/networking/nf_conntrack-sysctl.rst
to explain the restrictions.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-10 10:02         ` lvxiafei
@ 2025-04-10 10:53           ` Florian Westphal
  0 siblings, 0 replies; 35+ messages in thread
From: Florian Westphal @ 2025-04-10 10:53 UTC (permalink / raw)
  To: lvxiafei
  Cc: fw, coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

lvxiafei <xiafei_xupt@163.com> wrote:
> > in any case.
> >
> > Also:
> >
> > -       if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> > +       if (net->ct.sysctl_max && unlikely(ct_count > min(nf_conntrack_max, net->ct.sysctl_max))) {
> >
> >
> > ... can't be right, this allows a 0 setting in the netns.
> > So, setting 0 in non-init-net must be disallowed.
> 
> Yes, setting 0 in non-init-net must be disallowed.
> 
> Should be used:
> unsigned int net_ct_sysctl_max = max(min(nf_conntrack_max, net->ct.sysctl_max), 0);
> if (nf_conntrack_max && unlikely(ct_count > net_ct_sysctl_max)) {

That would work.  Alternative, probably preferrable, is to do
something like this:

@@ -615,10 +615,10 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
-               .proc_handler   = proc_dointvec,
+               .proc_handler   = proc_douintvec_minmax,
+               .extra1         = SYSCTL_ZERO, /* 0 == no limit */
        },
        [NF_SYSCTL_CT_COUNT] = {
                .procname       = "nf_conntrack_count",
@@ -1081,9 +1082,11 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)

        /* Don't allow non-init_net ns to alter global sysctls */
        if (!net_eq(&init_net, net)) {
                table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
                table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
+
+               /* 0 means no limit, only allowed in init_net */
+               table[NF_SYSCTL_CT_MAX].extra1 = SYSCTL_ONE;
        }

That will make setting a 0 value illegal for non-init net case:

sysctl net.netfilter.nf_conntrack_max=0
sysctl: setting key "net.netfilter.nf_conntrack_max": Invalid argument

> min(nf_conntrack_max, net->ct.sysctl_max) is the upper limit of ct_count
> At the same time, when net->ct.sysctl_max == 0, the original intention is no limit,
> but it can be limited by nf_conntrack_max in different netns.

Sounds good to me.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-09  9:42       ` Florian Westphal
  2025-04-10 10:02         ` lvxiafei
@ 2025-04-10 13:05         ` lvxiafei
  2025-04-10 13:17           ` Florian Westphal
  1 sibling, 1 reply; 35+ messages in thread
From: lvxiafei @ 2025-04-10 13:05 UTC (permalink / raw)
  To: fw
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo, xiafei_xupt

Florian Westphal <fw@strlen.de> wrote:
> I suggest to remove nf_conntrack_max as a global variable,
> make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> so in the init_net both sysctls remain the same.

The nf_conntrack_max global variable is a system calculated
value and should not be removed.
nf_conntrack_max = max_factor * nf_conntrack_htable_size;

> When a new conntrack is allocated, then:
>
> If the limit in the init_net is lower than the netns, then
> that limit applies, so it provides upper cap.
>
> If the limit in the init_net is higher, the lower pernet limit
> is applied.
>
> If the init_net has 0 setting, no limit is applied.

If the init_net has 0 setting, it should depend on the
limit of other netns.

The netns Limit Behavior:
+------------------------+--------------------+-----------------------+
| init_net.ct.sysctl_max | net->ct.sysctl_max | netns Limit Behavior  |
+------------------------+--------------------+-----------------------+
| 0                      | 0                  | No limit              |
+------------------------+--------------------+-----------------------+
| 0                      | Non-zero           | net->ct.sysctl_max    |
+------------------------+--------------------+-----------------------+
| Non-zero               | 0                  | init_net.ct.sysctl_max|
+------------------------+--------------------+-----------------------+
| Non-zero               | Non-zero           | min                   |
+------------------------+--------------------+-----------------------+

net_ct_sysctl_max = likely(a && b) ? min(a, b) : max(a, b);
or
net_ct_sysctl_max = unlikely(a == 0 || b == 0) ? max(a, b) : min(a, b);

if (net_ct_sysctl_max && unlikely(ct_count > net_ct_sysctl_max)) { ...


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-10 13:05         ` lvxiafei
@ 2025-04-10 13:17           ` Florian Westphal
  2025-04-10 14:16             ` Florian Westphal
  0 siblings, 1 reply; 35+ messages in thread
From: Florian Westphal @ 2025-04-10 13:17 UTC (permalink / raw)
  To: lvxiafei
  Cc: fw, coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

lvxiafei <xiafei_xupt@163.com> wrote:
> Florian Westphal <fw@strlen.de> wrote:
> > I suggest to remove nf_conntrack_max as a global variable,
> > make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> > so in the init_net both sysctls remain the same.
> 
> The nf_conntrack_max global variable is a system calculated
> value and should not be removed.
> nf_conntrack_max = max_factor * nf_conntrack_htable_size;

Thats the default calculation for the initial sysctl value:

net/netfilter/nf_conntrack_standalone.c:                .data           = &nf_conntrack_max,
net/netfilter/nf_conntrack_standalone.c:                .data           = &nf_conntrack_max,

You can make an initial patch that replaces all occurences of
nf_conntrack_max with cnet->sysctl_conntrack_max

(adding a 'unsigned int sysctl_conntrack_max' to struct
 nf_conntrack_net).

Then, in a second patch, remove the '0444' readonly and redirect
the child netns to use the copy in its own pernet area rather than the
init_net one.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-10 13:17           ` Florian Westphal
@ 2025-04-10 14:16             ` Florian Westphal
  2025-04-11  4:09               ` lvxiafei
  0 siblings, 1 reply; 35+ messages in thread
From: Florian Westphal @ 2025-04-10 14:16 UTC (permalink / raw)
  To: Florian Westphal
  Cc: lvxiafei, coreteam, davem, edumazet, horms, kadlec, kuba,
	linux-kernel, lvxiafei, netdev, netfilter-devel, pabeni, pablo

Florian Westphal <fw@strlen.de> wrote:
> lvxiafei <xiafei_xupt@163.com> wrote:
> > Florian Westphal <fw@strlen.de> wrote:
> > > I suggest to remove nf_conntrack_max as a global variable,
> > > make net.nf_conntrack_max use init_net.nf_conntrack_max too internally,
> > > so in the init_net both sysctls remain the same.
> > 
> > The nf_conntrack_max global variable is a system calculated
> > value and should not be removed.
> > nf_conntrack_max = max_factor * nf_conntrack_htable_size;
> 
> Thats the default calculation for the initial sysctl value:
> 
> net/netfilter/nf_conntrack_standalone.c:                .data           = &nf_conntrack_max,
> net/netfilter/nf_conntrack_standalone.c:                .data           = &nf_conntrack_max,
> 
> You can make an initial patch that replaces all occurences of
> nf_conntrack_max with cnet->sysctl_conntrack_max

Something like this:

diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
 extern struct hlist_nulls_head *nf_conntrack_hash;
 extern unsigned int nf_conntrack_htable_size;
 extern seqcount_spinlock_t nf_conntrack_generation;
-extern unsigned int nf_conntrack_max;
 
 /* must be called with rcu read lock held */
 static inline void
@@ -360,6 +359,11 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
 	return net_generic(net, nf_conntrack_net_id);
 }
 
+static inline unsigned int nf_conntrack_max(const struct net *net)
+{
+	return net->ct.sysctl_conntrack_max;
+}
+
 int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
 int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 			   u16 zone, u8 family, u8 *proto, u16 *mru);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
 	u8			sysctl_acct;
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
+	unsigned int		sysctl_conntrack_max;
 
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..8ae9c22cfcb3 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
 seqcount_spinlock_t nf_conntrack_generation __read_mostly;
 static siphash_aligned_key_t nf_conntrack_hash_rnd;
 
@@ -1509,8 +1507,7 @@ static void gc_worker(struct work_struct *work)
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
-	if (gc_work->early_drop)
-		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
+		nf_conntrack_max95 = nf_conntrack_max(&init_net) / 100u * 95u;
 
 	if (i == 0) {
 		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1648,13 +1645,14 @@ __nf_conntrack_alloc(struct net *net,
 		     gfp_t gfp, u32 hash)
 {
 	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-	unsigned int ct_count;
+	unsigned int ct_max, ct_count;
 	struct nf_conn *ct;
 
 	/* We don't want any race condition at early drop stage */
 	ct_count = atomic_inc_return(&cnet->count);
+	ct_max = nf_conntrack_max(&init_net);
 
-	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+	if (ct_max && unlikely(ct_count > ct_max)) {
 		if (!early_drop(net, hash)) {
 			if (!conntrack_gc_work.early_drop)
 				conntrack_gc_work.early_drop = true;
@@ -2650,7 +2648,7 @@ int nf_conntrack_init_start(void)
 	if (!nf_conntrack_hash)
 		return -ENOMEM;
 
-	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+	init_net.ct.sysctl_conntrack_max = max_factor * nf_conntrack_htable_size;
 
 	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
 						sizeof(struct nf_conn),
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index db23876a6016..f1938204b827 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
 		goto nla_put_failure;
 
-	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 502cf10aab41..8a185dfd3261 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_conntrack_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -944,7 +944,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 static struct ctl_table nf_ct_netfilter_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_conntrack_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH V3] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-10 14:16             ` Florian Westphal
@ 2025-04-11  4:09               ` lvxiafei
  0 siblings, 0 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-11  4:09 UTC (permalink / raw)
  To: fw
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo, xiafei_xupt

Florian Westphal <fw@strlen.de> wrote:

> > You can make an initial patch that replaces all occurences of
> > nf_conntrack_max with cnet->sysctl_conntrack_max
>
> Something like this:
> ...

Agreed, I can submit the changes later.
First of all, a patch should do one thing clearly,
which is convenient for maintainers to review.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH V4] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-07  9:50 [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl lvxiafei
                   ` (2 preceding siblings ...)
  2025-04-09  4:25 ` [PATCH V3] " lvxiafei
@ 2025-04-12 14:37 ` lvxiafei
  2025-04-12 17:26 ` [PATCH V5] " lvxiafei
  2025-04-15  9:08 ` [PATCH V6] " lvxiafei
  5 siblings, 0 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-12 14:37 UTC (permalink / raw)
  To: xiafei_xupt
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

From: lvxiafei <lvxiafei@sensetime.com>

Support net.netfilter.nf_conntrack_max settings per
netns, net.netfilter.nf_conntrack_max is used to more
flexibly limit the ct_count in different netns. The
default value belongs to the init_net limit.

After net.netfilter.nf_conntrack_max is set in different
netns, it is not allowed to be greater than the init_net
limit when working.

Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
 .../networking/nf_conntrack-sysctl.rst        | 29 +++++++++++++++----
 include/net/netfilter/nf_conntrack.h          |  8 ++++-
 include/net/netns/conntrack.h                 |  1 +
 net/netfilter/nf_conntrack_core.c             | 19 ++++++------
 net/netfilter/nf_conntrack_netlink.c          |  2 +-
 net/netfilter/nf_conntrack_standalone.c       |  5 ++--
 6 files changed, 44 insertions(+), 20 deletions(-)

diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 238b66d0e059..58aad49ea179 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -93,12 +93,29 @@ nf_conntrack_log_invalid - INTEGER
 	Log invalid packets of a type specified by value.
 
 nf_conntrack_max - INTEGER
-        Maximum number of allowed connection tracking entries. This value is set
-        to nf_conntrack_buckets by default.
-        Note that connection tracking entries are added to the table twice -- once
-        for the original direction and once for the reply direction (i.e., with
-        the reversed address). This means that with default settings a maxed-out
-        table will have a average hash chain length of 2, not 1.
+    - 0 - disabled (unlimited)
+	- not 0 - enabled
+
+    Maximum number of allowed connection tracking entries per netns. This value
+    is set to nf_conntrack_buckets by default.
+
+    Note that connection tracking entries are added to the table twice -- once
+    for the original direction and once for the reply direction (i.e., with
+    the reversed address). This means that with default settings a maxed-out
+    table will have a average hash chain length of 2, not 1.
+
+    The limit of other netns cannot be greater than init_net netns.
+    +----------------+-------------+----------------+
+    | init_net netns | other netns | limit behavior |
+    +----------------+-------------+----------------+
+    | 0              | 0           | unlimited      |
+    +----------------+-------------+----------------+
+    | 0              | not 0       | other          |
+    +----------------+-------------+----------------+
+    | not 0          | 0           | init_net       |
+    +----------------+-------------+----------------+
+    | not 0          | not 0       | min            |
+    +----------------+-------------+----------------+
 
 nf_conntrack_tcp_be_liberal - BOOLEAN
 	- 0 - disabled (default)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 3f02a45773e8..062e67b9a5d7 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
 extern struct hlist_nulls_head *nf_conntrack_hash;
 extern unsigned int nf_conntrack_htable_size;
 extern seqcount_spinlock_t nf_conntrack_generation;
-extern unsigned int nf_conntrack_max;
 
 /* must be called with rcu read lock held */
 static inline void
@@ -360,6 +359,13 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
 	return net_generic(net, nf_conntrack_net_id);
 }
 
+static inline unsigned int nf_conntrack_max(const struct net *net)
+{
+	return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
+	    min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
+	    max(init_net.ct.sysctl_max, net->ct.sysctl_max);
+}
+
 int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
 int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 			   u16 zone, u8 family, u8 *proto, u16 *mru);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..2f343842c2d9 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
 	u8			sysctl_acct;
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
+	unsigned int sysctl_max;
 
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..a738564923ec 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
 seqcount_spinlock_t nf_conntrack_generation __read_mostly;
 static siphash_aligned_key_t nf_conntrack_hash_rnd;
 
@@ -1498,7 +1496,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned int i, hashsz, nf_conntrack_max95 = 0;
+	unsigned int i, hashsz;
 	u32 end_time, start_time = nfct_time_stamp;
 	struct conntrack_gc_work *gc_work;
 	unsigned int expired_count = 0;
@@ -1509,8 +1507,6 @@ static void gc_worker(struct work_struct *work)
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
-	if (gc_work->early_drop)
-		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
 	if (i == 0) {
 		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1534,7 @@ static void gc_worker(struct work_struct *work)
 		}
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+			unsigned int nf_conntrack_max95 = 0;
 			struct nf_conntrack_net *cnet;
 			struct net *net;
 			long expires;
@@ -1567,11 +1564,14 @@ static void gc_worker(struct work_struct *work)
 			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
 			expires = (expires - (long)next_run) / ++count;
 			next_run += expires;
+			net = nf_ct_net(tmp);
+
+			if (gc_work->early_drop)
+				nf_conntrack_max95 = nf_conntrack_max(net) / 100u * 95u;
 
 			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
 				continue;
 
-			net = nf_ct_net(tmp);
 			cnet = nf_ct_pernet(net);
 			if (atomic_read(&cnet->count) < nf_conntrack_max95)
 				continue;
@@ -1648,13 +1648,14 @@ __nf_conntrack_alloc(struct net *net,
 		     gfp_t gfp, u32 hash)
 {
 	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-	unsigned int ct_count;
+	unsigned int ct_max, ct_count;
 	struct nf_conn *ct;
 
 	/* We don't want any race condition at early drop stage */
 	ct_count = atomic_inc_return(&cnet->count);
+	ct_max = nf_conntrack_max(net);
 
-	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+	if (ct_max && unlikely(ct_count > ct_max)) {
 		if (!early_drop(net, hash)) {
 			if (!conntrack_gc_work.early_drop)
 				conntrack_gc_work.early_drop = true;
@@ -2650,7 +2651,7 @@ int nf_conntrack_init_start(void)
 	if (!nf_conntrack_hash)
 		return -ENOMEM;
 
-	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+	init_net.ct.sysctl_max = max_factor * nf_conntrack_htable_size;
 
 	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
 						sizeof(struct nf_conn),
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2cc0fde23344..73e6bb1e939b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
 		goto nla_put_failure;
 
-	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..8608aec88758 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 static struct ctl_table nf_ct_netfilter_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -1087,7 +1087,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	/* Don't allow non-init_net ns to alter global sysctls */
 	if (!net_eq(&init_net, net)) {
-		table[NF_SYSCTL_CT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
 	}
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH V5] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-07  9:50 [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl lvxiafei
                   ` (3 preceding siblings ...)
  2025-04-12 14:37 ` [PATCH V4] " lvxiafei
@ 2025-04-12 17:26 ` lvxiafei
  2025-04-12 17:30   ` lvxiafei
                     ` (2 more replies)
  2025-04-15  9:08 ` [PATCH V6] " lvxiafei
  5 siblings, 3 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-12 17:26 UTC (permalink / raw)
  To: xiafei_xupt
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

From: lvxiafei <lvxiafei@sensetime.com>

Support net.netfilter.nf_conntrack_max settings per
netns, net.netfilter.nf_conntrack_max is used to more
flexibly limit the ct_count in different netns. The
default value belongs to the init_net limit.

After net.netfilter.nf_conntrack_max is set in different
netns, it is not allowed to be greater than the init_net
limit when working.

Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
 .../networking/nf_conntrack-sysctl.rst        | 29 +++++++++++++++----
 include/net/netfilter/nf_conntrack.h          |  8 ++++-
 include/net/netns/conntrack.h                 |  1 +
 net/netfilter/nf_conntrack_core.c             | 19 ++++++------
 net/netfilter/nf_conntrack_netlink.c          |  2 +-
 net/netfilter/nf_conntrack_standalone.c       |  7 +++--
 6 files changed, 46 insertions(+), 20 deletions(-)

diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 238b66d0e059..6e7f17f5959a 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -93,12 +93,29 @@ nf_conntrack_log_invalid - INTEGER
 	Log invalid packets of a type specified by value.
 
 nf_conntrack_max - INTEGER
-        Maximum number of allowed connection tracking entries. This value is set
-        to nf_conntrack_buckets by default.
-        Note that connection tracking entries are added to the table twice -- once
-        for the original direction and once for the reply direction (i.e., with
-        the reversed address). This means that with default settings a maxed-out
-        table will have a average hash chain length of 2, not 1.
+    - 0 - disabled (unlimited)
+    - not 0 - enabled
+
+    Maximum number of allowed connection tracking entries per netns. This value
+    is set to nf_conntrack_buckets by default.
+
+    Note that connection tracking entries are added to the table twice -- once
+    for the original direction and once for the reply direction (i.e., with
+    the reversed address). This means that with default settings a maxed-out
+    table will have a average hash chain length of 2, not 1.
+
+    The limit of other netns cannot be greater than init_net netns.
+    +----------------+-------------+----------------+
+    | init_net netns | other netns | limit behavior |
+    +----------------+-------------+----------------+
+    | 0              | 0           | unlimited      |
+    +----------------+-------------+----------------+
+    | 0              | not 0       | other          |
+    +----------------+-------------+----------------+
+    | not 0          | 0           | init_net       |
+    +----------------+-------------+----------------+
+    | not 0          | not 0       | min            |
+    +----------------+-------------+----------------+
 
 nf_conntrack_tcp_be_liberal - BOOLEAN
 	- 0 - disabled (default)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 3f02a45773e8..062e67b9a5d7 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
 extern struct hlist_nulls_head *nf_conntrack_hash;
 extern unsigned int nf_conntrack_htable_size;
 extern seqcount_spinlock_t nf_conntrack_generation;
-extern unsigned int nf_conntrack_max;
 
 /* must be called with rcu read lock held */
 static inline void
@@ -360,6 +359,13 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
 	return net_generic(net, nf_conntrack_net_id);
 }
 
+static inline unsigned int nf_conntrack_max(const struct net *net)
+{
+	return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
+	    min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
+	    max(init_net.ct.sysctl_max, net->ct.sysctl_max);
+}
+
 int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
 int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 			   u16 zone, u8 family, u8 *proto, u16 *mru);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..d3fcd0b92b2d 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
 	u8			sysctl_acct;
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
+	unsigned int		sysctl_max;
 
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..a738564923ec 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
 seqcount_spinlock_t nf_conntrack_generation __read_mostly;
 static siphash_aligned_key_t nf_conntrack_hash_rnd;
 
@@ -1498,7 +1496,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned int i, hashsz, nf_conntrack_max95 = 0;
+	unsigned int i, hashsz;
 	u32 end_time, start_time = nfct_time_stamp;
 	struct conntrack_gc_work *gc_work;
 	unsigned int expired_count = 0;
@@ -1509,8 +1507,6 @@ static void gc_worker(struct work_struct *work)
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
-	if (gc_work->early_drop)
-		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
 	if (i == 0) {
 		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1534,7 @@ static void gc_worker(struct work_struct *work)
 		}
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+			unsigned int nf_conntrack_max95 = 0;
 			struct nf_conntrack_net *cnet;
 			struct net *net;
 			long expires;
@@ -1567,11 +1564,14 @@ static void gc_worker(struct work_struct *work)
 			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
 			expires = (expires - (long)next_run) / ++count;
 			next_run += expires;
+			net = nf_ct_net(tmp);
+
+			if (gc_work->early_drop)
+				nf_conntrack_max95 = nf_conntrack_max(net) / 100u * 95u;
 
 			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
 				continue;
 
-			net = nf_ct_net(tmp);
 			cnet = nf_ct_pernet(net);
 			if (atomic_read(&cnet->count) < nf_conntrack_max95)
 				continue;
@@ -1648,13 +1648,14 @@ __nf_conntrack_alloc(struct net *net,
 		     gfp_t gfp, u32 hash)
 {
 	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-	unsigned int ct_count;
+	unsigned int ct_max, ct_count;
 	struct nf_conn *ct;
 
 	/* We don't want any race condition at early drop stage */
 	ct_count = atomic_inc_return(&cnet->count);
+	ct_max = nf_conntrack_max(net);
 
-	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+	if (ct_max && unlikely(ct_count > ct_max)) {
 		if (!early_drop(net, hash)) {
 			if (!conntrack_gc_work.early_drop)
 				conntrack_gc_work.early_drop = true;
@@ -2650,7 +2651,7 @@ int nf_conntrack_init_start(void)
 	if (!nf_conntrack_hash)
 		return -ENOMEM;
 
-	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+	init_net.ct.sysctl_max = max_factor * nf_conntrack_htable_size;
 
 	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
 						sizeof(struct nf_conn),
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2cc0fde23344..73e6bb1e939b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
 		goto nla_put_failure;
 
-	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..5db6df0e4eb3 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 static struct ctl_table nf_ct_netfilter_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
 	table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+	table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
 	table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
 	table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	/* Don't allow non-init_net ns to alter global sysctls */
 	if (!net_eq(&init_net, net)) {
-		table[NF_SYSCTL_CT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
 	}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
 	int ret;
 
 	net->ct.sysctl_checksum = 1;
+	net->ct.sysctl_max = init_net.ct.sysctl_max;
 
 	ret = nf_conntrack_standalone_init_sysctl(net);
 	if (ret < 0)
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH V5] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-12 17:26 ` [PATCH V5] " lvxiafei
@ 2025-04-12 17:30   ` lvxiafei
  2025-04-12 21:16   ` Jakub Kicinski
  2025-04-13  9:07   ` Florian Westphal
  2 siblings, 0 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-12 17:30 UTC (permalink / raw)
  To: xiafei_xupt
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

The limit of other netns cannot be greater than init_net netns.
    +----------------+-------------+----------------+
    | init_net netns | other netns | limit behavior |
    +----------------+-------------+----------------+
    | 0              | 0           | unlimited      |
    +----------------+-------------+----------------+
    | 0              | not 0       | other          |
    +----------------+-------------+----------------+
    | not 0          | 0           | init_net       |
    +----------------+-------------+----------------+
    | not 0          | not 0       | min            |
    +----------------+-------------+----------------+

0,0
[Apr12 17:16] init_net.ct.sysctl_max 0, net->ct.sysctl_max 0, ct_max 0
0,1
[Apr12 17:17] init_net.ct.sysctl_max 0, net->ct.sysctl_max 1, ct_max 1
[  +0.000004] nf_conntrack: nf_conntrack: table full, dropping packet
1,0
Apr12 17:18] init_net.ct.sysctl_max 1, net->ct.sysctl_max 0, ct_max 1
[  +0.000006] nf_conntrack: nf_conntrack: table full, dropping packet
1,2
[Apr12 17:19] init_net.ct.sysctl_max 1, net->ct.sysctl_max 2, ct_max 1
[  +0.000004] nf_conntrack: nf_conntrack: table full, dropping packet
2,1
[Apr12 17:21] init_net.ct.sysctl_max 2, net->ct.sysctl_max 1, ct_max 1
[  +0.000004] nf_conntrack: nf_conntrack: table full, dropping packet


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V5] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-12 17:26 ` [PATCH V5] " lvxiafei
  2025-04-12 17:30   ` lvxiafei
@ 2025-04-12 21:16   ` Jakub Kicinski
  2025-04-13  1:14     ` lvxiafei
  2025-04-13  9:07   ` Florian Westphal
  2 siblings, 1 reply; 35+ messages in thread
From: Jakub Kicinski @ 2025-04-12 21:16 UTC (permalink / raw)
  To: lvxiafei
  Cc: coreteam, davem, edumazet, horms, kadlec, linux-kernel, lvxiafei,
	netdev, netfilter-devel, pabeni, pablo

On Sun, 13 Apr 2025 01:26:10 +0800 lvxiafei wrote:
> +static inline unsigned int nf_conntrack_max(const struct net *net)
> +{
> +	return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
> +	    min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
> +	    max(init_net.ct.sysctl_max, net->ct.sysctl_max);

If you CC netdev@ please do not post multiple versions a day.
Please wait with posting v6 until you get some feedback (and
this email does not count).

You need to be careful with the Kconfig, this file may be included 
when contrack is not built:

In file included from ./include/linux/kernel.h:28,
                 from ./include/linux/cpumask.h:11,
                 from ./arch/x86/include/asm/cpumask.h:5,
                 from ./arch/x86/include/asm/msr.h:11,
                 from ./arch/x86/include/asm/tsc.h:10,
                 from ./arch/x86/include/asm/timex.h:6,
                 from ./include/linux/timex.h:67,
                 from ./include/linux/time32.h:13,
                 from ./include/linux/time.h:60,
                 from ./include/linux/compat.h:10,
                 from ./include/linux/ethtool.h:17,
                 from drivers/net/vrf.c:12:
include/net/netfilter/nf_conntrack.h:365:25: error: ‘struct net’ has no member named ‘ct’
  365 |             min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
      |                         ^

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V5] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-12 21:16   ` Jakub Kicinski
@ 2025-04-13  1:14     ` lvxiafei
  0 siblings, 0 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-13  1:14 UTC (permalink / raw)
  To: kuba
  Cc: coreteam, davem, edumazet, horms, kadlec, linux-kernel, lvxiafei,
	netdev, netfilter-devel, pabeni, pablo, xiafei_xupt

> If you CC netdev@ please do not post multiple versions a day.
> Please wait with posting v6 until you get some feedback (and
> this email does not count).

Thanks for the reminder and the review.

I’ll hold off on posting v6 until I receive proper feedback.
Also, I’ll double-check the Kconfig dependencies and ensure the
file doesn’t break builds when conntrack is not enabled.

Appreciate your time!

Sincerely.

> You need to be careful with the Kconfig, this file may be included
> when contrack is not built:
>
> In file included from ./include/linux/kernel.h:28,
>                  from ./include/linux/cpumask.h:11,
>                  from ./arch/x86/include/asm/cpumask.h:5,
>                  from ./arch/x86/include/asm/msr.h:11,
>                  from ./arch/x86/include/asm/tsc.h:10,
>                  from ./arch/x86/include/asm/timex.h:6,
>                  from ./include/linux/timex.h:67,
>                  from ./include/linux/time32.h:13,
>                  from ./include/linux/time.h:60,
>                  from ./include/linux/compat.h:10,
>                  from ./include/linux/ethtool.h:17,
>                  from drivers/net/vrf.c:12:
> include/net/netfilter/nf_conntrack.h:365:25: error: ‘struct net’ has no member named ‘ct’
>   365 |             min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
>       |                         ^

Add conditional compilation protection:

+static inline unsigned int nf_conntrack_max(const struct net *net)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
+	    min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
+	    max(init_net.ct.sysctl_max, net->ct.sysctl_max);
+#else
+	return 0;
+#endif
+}


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V5] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-12 17:26 ` [PATCH V5] " lvxiafei
  2025-04-12 17:30   ` lvxiafei
  2025-04-12 21:16   ` Jakub Kicinski
@ 2025-04-13  9:07   ` Florian Westphal
  2025-04-14  3:04     ` lvxiafei
  2 siblings, 1 reply; 35+ messages in thread
From: Florian Westphal @ 2025-04-13  9:07 UTC (permalink / raw)
  To: lvxiafei
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

lvxiafei <xiafei_xupt@163.com> wrote:
> +    Maximum number of allowed connection tracking entries per netns. This value
> +    is set to nf_conntrack_buckets by default.
> +
> +    Note that connection tracking entries are added to the table twice -- once
> +    for the original direction and once for the reply direction (i.e., with
> +    the reversed address). This means that with default settings a maxed-out
> +    table will have a average hash chain length of 2, not 1.
> +
> +    The limit of other netns cannot be greater than init_net netns.
> +    +----------------+-------------+----------------+
> +    | init_net netns | other netns | limit behavior |
> +    +----------------+-------------+----------------+
> +    | 0              | 0           | unlimited      |
> +    +----------------+-------------+----------------+
> +    | 0              | not 0       | other          |
> +    +----------------+-------------+----------------+
> +    | not 0          | 0           | init_net       |
> +    +----------------+-------------+----------------+
> +    | not 0          | not 0       | min            |
> +    +----------------+-------------+----------------+
>  
>  nf_conntrack_tcp_be_liberal - BOOLEAN
>  	- 0 - disabled (default)
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index 3f02a45773e8..062e67b9a5d7 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
>  extern struct hlist_nulls_head *nf_conntrack_hash;
>  extern unsigned int nf_conntrack_htable_size;
>  extern seqcount_spinlock_t nf_conntrack_generation;
> -extern unsigned int nf_conntrack_max;
>  
>  /* must be called with rcu read lock held */
>  static inline void
> @@ -360,6 +359,13 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
>  	return net_generic(net, nf_conntrack_net_id);
>  }
>  
> +static inline unsigned int nf_conntrack_max(const struct net *net)
> +{
> +	return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
> +	    min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
> +	    max(init_net.ct.sysctl_max, net->ct.sysctl_max);
> +}

Is there a reason you did not follow my suggstion in
https://lore.kernel.org/netdev/20250410105352.GB6272@breakpoint.cc/

to disable net->ct.sysctl_max == 0 for non init netns?

You could then make this

   if (likely(init_net.ct.sysctl_max))
	return min(init_net.ct.sysctl_max, net->ct.sysctl_max);

   return net->ct.sysctl_max;

... or am i missing something?

Aside from that (and the needed #IS_ENABLED() guard) this change looks
good to me.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V5] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-13  9:07   ` Florian Westphal
@ 2025-04-14  3:04     ` lvxiafei
  0 siblings, 0 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-14  3:04 UTC (permalink / raw)
  To: fw
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo, xiafei_xupt

On Sun, 13 Apr 2025 11:07:55 +0200 Florian Westphal <fw@strlen.de> wrote:
> Is there a reason you did not follow my suggstion in
> https://lore.kernel.org/netdev/20250410105352.GB6272@breakpoint.cc/
>
> to disable net->ct.sysctl_max == 0 for non init netns?

in https://lore.kernel.org/netdev/20250410105352.GB6272@breakpoint.cc/
> > min(nf_conntrack_max, net->ct.sysctl_max) is the upper limit of ct_count
> > At the same time, when net->ct.sysctl_max == 0, the original intention is no limit,
> > but it can be limited by nf_conntrack_max in different netns.
>
> Sounds good to me.

It seems that there are two ways to change it:

1. Disallow net->ct.sysctl_max == 0

2. Allow net->ct.sysctl_max == 0
the original intention is no limit, but it can be limited by init_net netns
in different netns. When init_net is modified, it will change dynamically.
    +----------------+-------------+----------------+
    | init_net netns | other netns | limit behavior |
    +----------------+-------------+----------------+
    | not 0          | 0           | init_net       |
    +----------------+-------------+----------------+


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH V6] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-07  9:50 [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl lvxiafei
                   ` (4 preceding siblings ...)
  2025-04-12 17:26 ` [PATCH V5] " lvxiafei
@ 2025-04-15  9:08 ` lvxiafei
  2025-04-27  8:14   ` lvxiafei
  2025-05-22 19:24   ` Pablo Neira Ayuso
  5 siblings, 2 replies; 35+ messages in thread
From: lvxiafei @ 2025-04-15  9:08 UTC (permalink / raw)
  To: xiafei_xupt
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

From: lvxiafei <lvxiafei@sensetime.com>

Support net.netfilter.nf_conntrack_max settings per
netns, net.netfilter.nf_conntrack_max is used to more
flexibly limit the ct_count in different netns. The
default value belongs to the init_net limit.

After net.netfilter.nf_conntrack_max is set in different
netns, it is not allowed to be greater than the init_net
limit when working.

Signed-off-by: lvxiafei <lvxiafei@sensetime.com>
---
 .../networking/nf_conntrack-sysctl.rst        | 29 +++++++++++++++----
 include/net/netfilter/nf_conntrack.h          | 12 +++++++-
 include/net/netns/conntrack.h                 |  1 +
 net/netfilter/nf_conntrack_core.c             | 19 ++++++------
 net/netfilter/nf_conntrack_netlink.c          |  2 +-
 net/netfilter/nf_conntrack_standalone.c       |  7 +++--
 6 files changed, 50 insertions(+), 20 deletions(-)

diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
index 238b66d0e059..6e7f17f5959a 100644
--- a/Documentation/networking/nf_conntrack-sysctl.rst
+++ b/Documentation/networking/nf_conntrack-sysctl.rst
@@ -93,12 +93,29 @@ nf_conntrack_log_invalid - INTEGER
 	Log invalid packets of a type specified by value.
 
 nf_conntrack_max - INTEGER
-        Maximum number of allowed connection tracking entries. This value is set
-        to nf_conntrack_buckets by default.
-        Note that connection tracking entries are added to the table twice -- once
-        for the original direction and once for the reply direction (i.e., with
-        the reversed address). This means that with default settings a maxed-out
-        table will have a average hash chain length of 2, not 1.
+    - 0 - disabled (unlimited)
+    - not 0 - enabled
+
+    Maximum number of allowed connection tracking entries per netns. This value
+    is set to nf_conntrack_buckets by default.
+
+    Note that connection tracking entries are added to the table twice -- once
+    for the original direction and once for the reply direction (i.e., with
+    the reversed address). This means that with default settings a maxed-out
+    table will have a average hash chain length of 2, not 1.
+
+    The limit of other netns cannot be greater than init_net netns.
+    +----------------+-------------+----------------+
+    | init_net netns | other netns | limit behavior |
+    +----------------+-------------+----------------+
+    | 0              | 0           | unlimited      |
+    +----------------+-------------+----------------+
+    | 0              | not 0       | other          |
+    +----------------+-------------+----------------+
+    | not 0          | 0           | init_net       |
+    +----------------+-------------+----------------+
+    | not 0          | not 0       | min            |
+    +----------------+-------------+----------------+
 
 nf_conntrack_tcp_be_liberal - BOOLEAN
 	- 0 - disabled (default)
diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
index 3f02a45773e8..594439b2f5a1 100644
--- a/include/net/netfilter/nf_conntrack.h
+++ b/include/net/netfilter/nf_conntrack.h
@@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
 extern struct hlist_nulls_head *nf_conntrack_hash;
 extern unsigned int nf_conntrack_htable_size;
 extern seqcount_spinlock_t nf_conntrack_generation;
-extern unsigned int nf_conntrack_max;
 
 /* must be called with rcu read lock held */
 static inline void
@@ -360,6 +359,17 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
 	return net_generic(net, nf_conntrack_net_id);
 }
 
+static inline unsigned int nf_conntrack_max(const struct net *net)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+	return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
+	    min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
+	    max(init_net.ct.sysctl_max, net->ct.sysctl_max);
+#else
+	return 0;
+#endif
+}
+
 int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
 int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
 			   u16 zone, u8 family, u8 *proto, u16 *mru);
diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
index bae914815aa3..d3fcd0b92b2d 100644
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -102,6 +102,7 @@ struct netns_ct {
 	u8			sysctl_acct;
 	u8			sysctl_tstamp;
 	u8			sysctl_checksum;
+	unsigned int		sysctl_max;
 
 	struct ip_conntrack_stat __percpu *stat;
 	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f8b245e287a..a738564923ec 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
 seqcount_spinlock_t nf_conntrack_generation __read_mostly;
 static siphash_aligned_key_t nf_conntrack_hash_rnd;
 
@@ -1498,7 +1496,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned int i, hashsz, nf_conntrack_max95 = 0;
+	unsigned int i, hashsz;
 	u32 end_time, start_time = nfct_time_stamp;
 	struct conntrack_gc_work *gc_work;
 	unsigned int expired_count = 0;
@@ -1509,8 +1507,6 @@ static void gc_worker(struct work_struct *work)
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
-	if (gc_work->early_drop)
-		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
 	if (i == 0) {
 		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
@@ -1538,6 +1534,7 @@ static void gc_worker(struct work_struct *work)
 		}
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
+			unsigned int nf_conntrack_max95 = 0;
 			struct nf_conntrack_net *cnet;
 			struct net *net;
 			long expires;
@@ -1567,11 +1564,14 @@ static void gc_worker(struct work_struct *work)
 			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
 			expires = (expires - (long)next_run) / ++count;
 			next_run += expires;
+			net = nf_ct_net(tmp);
+
+			if (gc_work->early_drop)
+				nf_conntrack_max95 = nf_conntrack_max(net) / 100u * 95u;
 
 			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
 				continue;
 
-			net = nf_ct_net(tmp);
 			cnet = nf_ct_pernet(net);
 			if (atomic_read(&cnet->count) < nf_conntrack_max95)
 				continue;
@@ -1648,13 +1648,14 @@ __nf_conntrack_alloc(struct net *net,
 		     gfp_t gfp, u32 hash)
 {
 	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
-	unsigned int ct_count;
+	unsigned int ct_max, ct_count;
 	struct nf_conn *ct;
 
 	/* We don't want any race condition at early drop stage */
 	ct_count = atomic_inc_return(&cnet->count);
+	ct_max = nf_conntrack_max(net);
 
-	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
+	if (ct_max && unlikely(ct_count > ct_max)) {
 		if (!early_drop(net, hash)) {
 			if (!conntrack_gc_work.early_drop)
 				conntrack_gc_work.early_drop = true;
@@ -2650,7 +2651,7 @@ int nf_conntrack_init_start(void)
 	if (!nf_conntrack_hash)
 		return -ENOMEM;
 
-	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+	init_net.ct.sysctl_max = max_factor * nf_conntrack_htable_size;
 
 	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
 						sizeof(struct nf_conn),
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 2cc0fde23344..73e6bb1e939b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
 	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
 		goto nla_put_failure;
 
-	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
+	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
 		goto nla_put_failure;
 
 	nlmsg_end(skb, nlh);
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 2f666751c7e7..5db6df0e4eb3 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
 static struct ctl_table nf_ct_sysctl_table[] = {
 	[NF_SYSCTL_CT_MAX] = {
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 static struct ctl_table nf_ct_netfilter_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.sysctl_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec_minmax,
@@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
 	table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
+	table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
 	table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
 	table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
 #ifdef CONFIG_NF_CONNTRACK_EVENTS
@@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 
 	/* Don't allow non-init_net ns to alter global sysctls */
 	if (!net_eq(&init_net, net)) {
-		table[NF_SYSCTL_CT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
 		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
 	}
@@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
 	int ret;
 
 	net->ct.sysctl_checksum = 1;
+	net->ct.sysctl_max = init_net.ct.sysctl_max;
 
 	ret = nf_conntrack_standalone_init_sysctl(net);
 	if (ret < 0)
-- 
2.40.1


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH V6] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-15  9:08 ` [PATCH V6] " lvxiafei
@ 2025-04-27  8:14   ` lvxiafei
  2025-04-28  9:40     ` Pablo Neira Ayuso
  2025-05-22 19:24   ` Pablo Neira Ayuso
  1 sibling, 1 reply; 35+ messages in thread
From: lvxiafei @ 2025-04-27  8:14 UTC (permalink / raw)
  To: fw, xiafei_xupt
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, pablo

Hello!

My name is Xiafei Lv, and I sent patch v6 on April 15, 2025. As of today, it
has been nearly two weeks, and I have not received any feedback. However, the
patch has been tested successfully, and you can view the details at this link:
https://patchwork.kernel.org/project/netdevbpf/patch/20250415090834.24882-1-xiafei_xupt@163.com/

I understand that kernel development is busy, and maintainers may have many
things to deal with and may not have time to review my patch. With this in mind,
I would like to politely ask what steps I should take next? If you have any
questions about my patch or need additional information from me, please feel
free to let me know and I will cooperate as soon as possible.

Thank you very much for taking the time to deal with my request, and I look
forward to your response.

Best wishes!

Xiafei Lv


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V6] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-27  8:14   ` lvxiafei
@ 2025-04-28  9:40     ` Pablo Neira Ayuso
  0 siblings, 0 replies; 35+ messages in thread
From: Pablo Neira Ayuso @ 2025-04-28  9:40 UTC (permalink / raw)
  To: lvxiafei
  Cc: fw, coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni

Hi!

On Sun, Apr 27, 2025 at 04:14:20PM +0800, lvxiafei wrote:
> Hello!
> 
> My name is Xiafei Lv, and I sent patch v6 on April 15, 2025. As of today, it
> has been nearly two weeks, and I have not received any feedback. However, the
> patch has been tested successfully, and you can view the details at this link:
> https://patchwork.kernel.org/project/netdevbpf/patch/20250415090834.24882-1-xiafei_xupt@163.com/
>
> I understand that kernel development is busy, and maintainers may have many
> things to deal with and may not have time to review my patch. With this in mind,
> I would like to politely ask what steps I should take next? If you have any
> questions about my patch or need additional information from me, please feel
> free to let me know and I will cooperate as soon as possible.

You just have to wait. We will get back to you with more questions if
needed.

> Thank you very much for taking the time to deal with my request, and I look
> forward to your response.

Thanks.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V6] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-04-15  9:08 ` [PATCH V6] " lvxiafei
  2025-04-27  8:14   ` lvxiafei
@ 2025-05-22 19:24   ` Pablo Neira Ayuso
  2025-05-22 19:32     ` Florian Westphal
  1 sibling, 1 reply; 35+ messages in thread
From: Pablo Neira Ayuso @ 2025-05-22 19:24 UTC (permalink / raw)
  To: lvxiafei
  Cc: coreteam, davem, edumazet, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni

On Tue, Apr 15, 2025 at 05:08:34PM +0800, lvxiafei wrote:
> diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
> index 238b66d0e059..6e7f17f5959a 100644
> --- a/Documentation/networking/nf_conntrack-sysctl.rst
> +++ b/Documentation/networking/nf_conntrack-sysctl.rst
> @@ -93,12 +93,29 @@ nf_conntrack_log_invalid - INTEGER
>  	Log invalid packets of a type specified by value.
>  
>  nf_conntrack_max - INTEGER
> -        Maximum number of allowed connection tracking entries. This value is set
> -        to nf_conntrack_buckets by default.
> -        Note that connection tracking entries are added to the table twice -- once
> -        for the original direction and once for the reply direction (i.e., with
> -        the reversed address). This means that with default settings a maxed-out
> -        table will have a average hash chain length of 2, not 1.
> +    - 0 - disabled (unlimited)

unlimited is too much, and the number of buckets is also global, how
does this work?

Is your goal to allow a netns to have larger table than netns? There
should be a cap for this.

> +    - not 0 - enabled
> +
> +    Maximum number of allowed connection tracking entries per netns. This value
> +    is set to nf_conntrack_buckets by default.
> +
> +    Note that connection tracking entries are added to the table twice -- once
> +    for the original direction and once for the reply direction (i.e., with
> +    the reversed address). This means that with default settings a maxed-out
> +    table will have a average hash chain length of 2, not 1.
> +
> +    The limit of other netns cannot be greater than init_net netns.
> +    +----------------+-------------+----------------+
> +    | init_net netns | other netns | limit behavior |
> +    +----------------+-------------+----------------+
> +    | 0              | 0           | unlimited      |
> +    +----------------+-------------+----------------+
> +    | 0              | not 0       | other          |
> +    +----------------+-------------+----------------+
> +    | not 0          | 0           | init_net       |
> +    +----------------+-------------+----------------+
> +    | not 0          | not 0       | min            |
> +    +----------------+-------------+----------------+
>  
>  nf_conntrack_tcp_be_liberal - BOOLEAN
>  	- 0 - disabled (default)
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index 3f02a45773e8..594439b2f5a1 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
>  extern struct hlist_nulls_head *nf_conntrack_hash;
>  extern unsigned int nf_conntrack_htable_size;
>  extern seqcount_spinlock_t nf_conntrack_generation;
> -extern unsigned int nf_conntrack_max;
>  
>  /* must be called with rcu read lock held */
>  static inline void
> @@ -360,6 +359,17 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
>  	return net_generic(net, nf_conntrack_net_id);
>  }
>  
> +static inline unsigned int nf_conntrack_max(const struct net *net)
> +{
> +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> +	return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
> +	    min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
> +	    max(init_net.ct.sysctl_max, net->ct.sysctl_max);
> +#else
> +	return 0;
> +#endif
> +}
> +
>  int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
>  int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
>  			   u16 zone, u8 family, u8 *proto, u16 *mru);
> diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
> index bae914815aa3..d3fcd0b92b2d 100644
> --- a/include/net/netns/conntrack.h
> +++ b/include/net/netns/conntrack.h
> @@ -102,6 +102,7 @@ struct netns_ct {
>  	u8			sysctl_acct;
>  	u8			sysctl_tstamp;
>  	u8			sysctl_checksum;
> +	unsigned int		sysctl_max;
>  
>  	struct ip_conntrack_stat __percpu *stat;
>  	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 7f8b245e287a..a738564923ec 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
>  unsigned int nf_conntrack_htable_size __read_mostly;
>  EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
>  
> -unsigned int nf_conntrack_max __read_mostly;
> -EXPORT_SYMBOL_GPL(nf_conntrack_max);
>  seqcount_spinlock_t nf_conntrack_generation __read_mostly;
>  static siphash_aligned_key_t nf_conntrack_hash_rnd;
>  
> @@ -1498,7 +1496,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
>  
>  static void gc_worker(struct work_struct *work)
>  {
> -	unsigned int i, hashsz, nf_conntrack_max95 = 0;
> +	unsigned int i, hashsz;
>  	u32 end_time, start_time = nfct_time_stamp;
>  	struct conntrack_gc_work *gc_work;
>  	unsigned int expired_count = 0;
> @@ -1509,8 +1507,6 @@ static void gc_worker(struct work_struct *work)
>  	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
>  
>  	i = gc_work->next_bucket;
> -	if (gc_work->early_drop)
> -		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
>  
>  	if (i == 0) {
>  		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
> @@ -1538,6 +1534,7 @@ static void gc_worker(struct work_struct *work)
>  		}
>  
>  		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
> +			unsigned int nf_conntrack_max95 = 0;
>  			struct nf_conntrack_net *cnet;
>  			struct net *net;
>  			long expires;
> @@ -1567,11 +1564,14 @@ static void gc_worker(struct work_struct *work)
>  			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
>  			expires = (expires - (long)next_run) / ++count;
>  			next_run += expires;
> +			net = nf_ct_net(tmp);
> +
> +			if (gc_work->early_drop)
> +				nf_conntrack_max95 = nf_conntrack_max(net) / 100u * 95u;
>  
>  			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
>  				continue;
>  
> -			net = nf_ct_net(tmp);
>  			cnet = nf_ct_pernet(net);
>  			if (atomic_read(&cnet->count) < nf_conntrack_max95)
>  				continue;
> @@ -1648,13 +1648,14 @@ __nf_conntrack_alloc(struct net *net,
>  		     gfp_t gfp, u32 hash)
>  {
>  	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
> -	unsigned int ct_count;
> +	unsigned int ct_max, ct_count;
>  	struct nf_conn *ct;
>  
>  	/* We don't want any race condition at early drop stage */
>  	ct_count = atomic_inc_return(&cnet->count);
> +	ct_max = nf_conntrack_max(net);
>  
> -	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> +	if (ct_max && unlikely(ct_count > ct_max)) {
>  		if (!early_drop(net, hash)) {
>  			if (!conntrack_gc_work.early_drop)
>  				conntrack_gc_work.early_drop = true;
> @@ -2650,7 +2651,7 @@ int nf_conntrack_init_start(void)
>  	if (!nf_conntrack_hash)
>  		return -ENOMEM;
>  
> -	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
> +	init_net.ct.sysctl_max = max_factor * nf_conntrack_htable_size;
>  
>  	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
>  						sizeof(struct nf_conn),
> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
> index 2cc0fde23344..73e6bb1e939b 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
>  	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
>  		goto nla_put_failure;
>  
> -	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
> +	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
>  		goto nla_put_failure;
>  
>  	nlmsg_end(skb, nlh);
> diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
> index 2f666751c7e7..5db6df0e4eb3 100644
> --- a/net/netfilter/nf_conntrack_standalone.c
> +++ b/net/netfilter/nf_conntrack_standalone.c
> @@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
>  static struct ctl_table nf_ct_sysctl_table[] = {
>  	[NF_SYSCTL_CT_MAX] = {
>  		.procname	= "nf_conntrack_max",
> -		.data		= &nf_conntrack_max,
> +		.data		= &init_net.ct.sysctl_max,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
>  		.proc_handler	= proc_dointvec_minmax,
> @@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
>  static struct ctl_table nf_ct_netfilter_table[] = {
>  	{
>  		.procname	= "nf_conntrack_max",
> -		.data		= &nf_conntrack_max,
> +		.data		= &init_net.ct.sysctl_max,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
>  		.proc_handler	= proc_dointvec_minmax,
> @@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
>  
>  	table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
>  	table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
> +	table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
>  	table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
>  	table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
>  #ifdef CONFIG_NF_CONNTRACK_EVENTS
> @@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
>  
>  	/* Don't allow non-init_net ns to alter global sysctls */
>  	if (!net_eq(&init_net, net)) {
> -		table[NF_SYSCTL_CT_MAX].mode = 0444;
>  		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
>  		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
>  	}
> @@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
>  	int ret;
>  
>  	net->ct.sysctl_checksum = 1;
> +	net->ct.sysctl_max = init_net.ct.sysctl_max;
>  
>  	ret = nf_conntrack_standalone_init_sysctl(net);
>  	if (ret < 0)
> -- 
> 2.40.1
> 

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V6] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-05-22 19:24   ` Pablo Neira Ayuso
@ 2025-05-22 19:32     ` Florian Westphal
  2025-05-22 19:58       ` Pablo Neira Ayuso
  0 siblings, 1 reply; 35+ messages in thread
From: Florian Westphal @ 2025-05-22 19:32 UTC (permalink / raw)
  To: Pablo Neira Ayuso
  Cc: lvxiafei, coreteam, davem, edumazet, horms, kadlec, kuba,
	linux-kernel, lvxiafei, netdev, netfilter-devel, pabeni

Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > -        to nf_conntrack_buckets by default.
> > -        Note that connection tracking entries are added to the table twice -- once
> > -        for the original direction and once for the reply direction (i.e., with
> > -        the reversed address). This means that with default settings a maxed-out
> > -        table will have a average hash chain length of 2, not 1.
> > +    - 0 - disabled (unlimited)
> 
> unlimited is too much, and the number of buckets is also global, how
> does this work?

Its an historic wart going back to ip_conntrack - it was never the
default but you could disable any and all limits even in the original
version.

Wether its time to disallow 0 is a different topic and not related to this patch.

I would argue: "yes", disallow 0 -- users can still set INT_MAX if they
 want and that should provide enough rope to strangle yourself.

> > +    The limit of other netns cannot be greater than init_net netns.
> > +    +----------------+-------------+----------------+
> > +    | init_net netns | other netns | limit behavior |
> > +    +----------------+-------------+----------------+
> > +    | 0              | 0           | unlimited      |
> > +    +----------------+-------------+----------------+
> > +    | 0              | not 0       | other          |
> > +    +----------------+-------------+----------------+
> > +    | not 0          | 0           | init_net       |
> > +    +----------------+-------------+----------------+
> > +    | not 0          | not 0       | min            |
> > +    +----------------+-------------+----------------+

I think this is fine, it doesn't really change things from init_net
point of view.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V6] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-05-22 19:32     ` Florian Westphal
@ 2025-05-22 19:58       ` Pablo Neira Ayuso
  2025-05-23  9:21         ` lvxiafei
  0 siblings, 1 reply; 35+ messages in thread
From: Pablo Neira Ayuso @ 2025-05-22 19:58 UTC (permalink / raw)
  To: Florian Westphal
  Cc: lvxiafei, coreteam, davem, edumazet, horms, kadlec, kuba,
	linux-kernel, lvxiafei, netdev, netfilter-devel, pabeni

On Thu, May 22, 2025 at 09:32:23PM +0200, Florian Westphal wrote:
> Pablo Neira Ayuso <pablo@netfilter.org> wrote:
> > > -        to nf_conntrack_buckets by default.
> > > -        Note that connection tracking entries are added to the table twice -- once
> > > -        for the original direction and once for the reply direction (i.e., with
> > > -        the reversed address). This means that with default settings a maxed-out
> > > -        table will have a average hash chain length of 2, not 1.
> > > +    - 0 - disabled (unlimited)
> > 
> > unlimited is too much, and the number of buckets is also global, how
> > does this work?
> 
> Its an historic wart going back to ip_conntrack - it was never the
> default but you could disable any and all limits even in the original
> version.

Thanks, I was just sitting here clueless.

> Wether its time to disallow 0 is a different topic and not related to this patch.
>
> I would argue: "yes", disallow 0 -- users can still set INT_MAX if they
>  want and that should provide enough rope to strangle yourself.

The question is how to make it without breaking crazy people.

> > > +    The limit of other netns cannot be greater than init_net netns.
> > > +    +----------------+-------------+----------------+
> > > +    | init_net netns | other netns | limit behavior |
> > > +    +----------------+-------------+----------------+
> > > +    | 0              | 0           | unlimited      |
> > > +    +----------------+-------------+----------------+
> > > +    | 0              | not 0       | other          |
> > > +    +----------------+-------------+----------------+
> > > +    | not 0          | 0           | init_net       |

in this case above...

> > > +    +----------------+-------------+----------------+
> > > +    | not 0          | not 0       | min            |

... and this case, init_net value is used as a cap for other netns.
Then, this is basically allowing to specify a maximum that is smaller
than init_netns.

IIUC, that sounds reasonable.

As for how to discontinue the unlimited in other netns, let me know if
you have any suggestions.

> > > +    +----------------+-------------+----------------+
> 
> I think this is fine, it doesn't really change things from init_net
> point of view.

Thanks for explaning.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH V6] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
  2025-05-22 19:58       ` Pablo Neira Ayuso
@ 2025-05-23  9:21         ` lvxiafei
  0 siblings, 0 replies; 35+ messages in thread
From: lvxiafei @ 2025-05-23  9:21 UTC (permalink / raw)
  To: pablo
  Cc: coreteam, davem, edumazet, fw, horms, kadlec, kuba, linux-kernel,
	lvxiafei, netdev, netfilter-devel, pabeni, xiafei_xupt

On Thu, 22 May 2025 21:58:13 +0200, Pablo Neira Ayuso wrote:

> > Wether its time to disallow 0 is a different topic and not related to this patch.
> >
> > I would argue: "yes", disallow 0 -- users can still set INT_MAX if they
> >  want and that should provide enough rope to strangle yourself.

> The question is how to make it without breaking crazy people.

It seems that we need a new topic to discuss the maximum value that the system can
tolerate to ensure safety:

1. This value is a system limitation, not a user setting

2. This value should be calculated based on system resources

3. This value takes precedence over 0 and other larger values that the user sets

4. This value does not affect the value of the user setting, and 0 in the user
setting can still indicate that the user setting is unlimited, maintaining
compatibility with historical usage.


^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2025-05-23  9:22 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-04-07  9:50 [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl lvxiafei
2025-04-07 10:13 ` Florian Westphal
2025-04-07 10:56   ` Jan Engelhardt
2025-04-08  8:27     ` lvxiafei
2025-04-08  8:38     ` lvxiafei
2025-04-08  8:17   ` lvxiafei
2025-04-08  9:03 ` [PATCH V2] " lvxiafei
2025-04-08  9:58   ` Florian Westphal
2025-04-08 12:39     ` lvxiafei
2025-04-08 13:28       ` Florian Westphal
2025-04-09  4:14         ` lvxiafei
2025-04-09  4:25 ` [PATCH V3] " lvxiafei
2025-04-09  7:20   ` Florian Westphal
2025-04-09  9:13     ` lvxiafei
2025-04-09  9:42       ` Florian Westphal
2025-04-10 10:02         ` lvxiafei
2025-04-10 10:53           ` Florian Westphal
2025-04-10 13:05         ` lvxiafei
2025-04-10 13:17           ` Florian Westphal
2025-04-10 14:16             ` Florian Westphal
2025-04-11  4:09               ` lvxiafei
2025-04-12 14:37 ` [PATCH V4] " lvxiafei
2025-04-12 17:26 ` [PATCH V5] " lvxiafei
2025-04-12 17:30   ` lvxiafei
2025-04-12 21:16   ` Jakub Kicinski
2025-04-13  1:14     ` lvxiafei
2025-04-13  9:07   ` Florian Westphal
2025-04-14  3:04     ` lvxiafei
2025-04-15  9:08 ` [PATCH V6] " lvxiafei
2025-04-27  8:14   ` lvxiafei
2025-04-28  9:40     ` Pablo Neira Ayuso
2025-05-22 19:24   ` Pablo Neira Ayuso
2025-05-22 19:32     ` Florian Westphal
2025-05-22 19:58       ` Pablo Neira Ayuso
2025-05-23  9:21         ` lvxiafei

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).