All of lore.kernel.org
 help / color / mirror / Atom feed
From: Pablo Neira Ayuso <pablo@netfilter.org>
To: lvxiafei <xiafei_xupt@163.com>
Cc: coreteam@netfilter.org, davem@davemloft.net, edumazet@google.com,
	horms@kernel.org, kadlec@netfilter.org, kuba@kernel.org,
	linux-kernel@vger.kernel.org, lvxiafei@sensetime.com,
	netdev@vger.kernel.org, netfilter-devel@vger.kernel.org,
	pabeni@redhat.com
Subject: Re: [PATCH V6] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl
Date: Thu, 22 May 2025 21:24:48 +0200	[thread overview]
Message-ID: <aC96AHaQX9WVtln5@calendula> (raw)
In-Reply-To: <20250415090834.24882-1-xiafei_xupt@163.com>

On Tue, Apr 15, 2025 at 05:08:34PM +0800, lvxiafei wrote:
> diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst
> index 238b66d0e059..6e7f17f5959a 100644
> --- a/Documentation/networking/nf_conntrack-sysctl.rst
> +++ b/Documentation/networking/nf_conntrack-sysctl.rst
> @@ -93,12 +93,29 @@ nf_conntrack_log_invalid - INTEGER
>  	Log invalid packets of a type specified by value.
>  
>  nf_conntrack_max - INTEGER
> -        Maximum number of allowed connection tracking entries. This value is set
> -        to nf_conntrack_buckets by default.
> -        Note that connection tracking entries are added to the table twice -- once
> -        for the original direction and once for the reply direction (i.e., with
> -        the reversed address). This means that with default settings a maxed-out
> -        table will have a average hash chain length of 2, not 1.
> +    - 0 - disabled (unlimited)

unlimited is too much, and the number of buckets is also global, how
does this work?

Is your goal to allow a netns to have larger table than netns? There
should be a cap for this.

> +    - not 0 - enabled
> +
> +    Maximum number of allowed connection tracking entries per netns. This value
> +    is set to nf_conntrack_buckets by default.
> +
> +    Note that connection tracking entries are added to the table twice -- once
> +    for the original direction and once for the reply direction (i.e., with
> +    the reversed address). This means that with default settings a maxed-out
> +    table will have a average hash chain length of 2, not 1.
> +
> +    The limit of other netns cannot be greater than init_net netns.
> +    +----------------+-------------+----------------+
> +    | init_net netns | other netns | limit behavior |
> +    +----------------+-------------+----------------+
> +    | 0              | 0           | unlimited      |
> +    +----------------+-------------+----------------+
> +    | 0              | not 0       | other          |
> +    +----------------+-------------+----------------+
> +    | not 0          | 0           | init_net       |
> +    +----------------+-------------+----------------+
> +    | not 0          | not 0       | min            |
> +    +----------------+-------------+----------------+
>  
>  nf_conntrack_tcp_be_liberal - BOOLEAN
>  	- 0 - disabled (default)
> diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
> index 3f02a45773e8..594439b2f5a1 100644
> --- a/include/net/netfilter/nf_conntrack.h
> +++ b/include/net/netfilter/nf_conntrack.h
> @@ -320,7 +320,6 @@ int nf_conntrack_hash_resize(unsigned int hashsize);
>  extern struct hlist_nulls_head *nf_conntrack_hash;
>  extern unsigned int nf_conntrack_htable_size;
>  extern seqcount_spinlock_t nf_conntrack_generation;
> -extern unsigned int nf_conntrack_max;
>  
>  /* must be called with rcu read lock held */
>  static inline void
> @@ -360,6 +359,17 @@ static inline struct nf_conntrack_net *nf_ct_pernet(const struct net *net)
>  	return net_generic(net, nf_conntrack_net_id);
>  }
>  
> +static inline unsigned int nf_conntrack_max(const struct net *net)
> +{
> +#if IS_ENABLED(CONFIG_NF_CONNTRACK)
> +	return likely(init_net.ct.sysctl_max && net->ct.sysctl_max) ?
> +	    min(init_net.ct.sysctl_max, net->ct.sysctl_max) :
> +	    max(init_net.ct.sysctl_max, net->ct.sysctl_max);
> +#else
> +	return 0;
> +#endif
> +}
> +
>  int nf_ct_skb_network_trim(struct sk_buff *skb, int family);
>  int nf_ct_handle_fragments(struct net *net, struct sk_buff *skb,
>  			   u16 zone, u8 family, u8 *proto, u16 *mru);
> diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h
> index bae914815aa3..d3fcd0b92b2d 100644
> --- a/include/net/netns/conntrack.h
> +++ b/include/net/netns/conntrack.h
> @@ -102,6 +102,7 @@ struct netns_ct {
>  	u8			sysctl_acct;
>  	u8			sysctl_tstamp;
>  	u8			sysctl_checksum;
> +	unsigned int		sysctl_max;
>  
>  	struct ip_conntrack_stat __percpu *stat;
>  	struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
> diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
> index 7f8b245e287a..a738564923ec 100644
> --- a/net/netfilter/nf_conntrack_core.c
> +++ b/net/netfilter/nf_conntrack_core.c
> @@ -202,8 +202,6 @@ static void nf_conntrack_all_unlock(void)
>  unsigned int nf_conntrack_htable_size __read_mostly;
>  EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
>  
> -unsigned int nf_conntrack_max __read_mostly;
> -EXPORT_SYMBOL_GPL(nf_conntrack_max);
>  seqcount_spinlock_t nf_conntrack_generation __read_mostly;
>  static siphash_aligned_key_t nf_conntrack_hash_rnd;
>  
> @@ -1498,7 +1496,7 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
>  
>  static void gc_worker(struct work_struct *work)
>  {
> -	unsigned int i, hashsz, nf_conntrack_max95 = 0;
> +	unsigned int i, hashsz;
>  	u32 end_time, start_time = nfct_time_stamp;
>  	struct conntrack_gc_work *gc_work;
>  	unsigned int expired_count = 0;
> @@ -1509,8 +1507,6 @@ static void gc_worker(struct work_struct *work)
>  	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
>  
>  	i = gc_work->next_bucket;
> -	if (gc_work->early_drop)
> -		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
>  
>  	if (i == 0) {
>  		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
> @@ -1538,6 +1534,7 @@ static void gc_worker(struct work_struct *work)
>  		}
>  
>  		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
> +			unsigned int nf_conntrack_max95 = 0;
>  			struct nf_conntrack_net *cnet;
>  			struct net *net;
>  			long expires;
> @@ -1567,11 +1564,14 @@ static void gc_worker(struct work_struct *work)
>  			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
>  			expires = (expires - (long)next_run) / ++count;
>  			next_run += expires;
> +			net = nf_ct_net(tmp);
> +
> +			if (gc_work->early_drop)
> +				nf_conntrack_max95 = nf_conntrack_max(net) / 100u * 95u;
>  
>  			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
>  				continue;
>  
> -			net = nf_ct_net(tmp);
>  			cnet = nf_ct_pernet(net);
>  			if (atomic_read(&cnet->count) < nf_conntrack_max95)
>  				continue;
> @@ -1648,13 +1648,14 @@ __nf_conntrack_alloc(struct net *net,
>  		     gfp_t gfp, u32 hash)
>  {
>  	struct nf_conntrack_net *cnet = nf_ct_pernet(net);
> -	unsigned int ct_count;
> +	unsigned int ct_max, ct_count;
>  	struct nf_conn *ct;
>  
>  	/* We don't want any race condition at early drop stage */
>  	ct_count = atomic_inc_return(&cnet->count);
> +	ct_max = nf_conntrack_max(net);
>  
> -	if (nf_conntrack_max && unlikely(ct_count > nf_conntrack_max)) {
> +	if (ct_max && unlikely(ct_count > ct_max)) {
>  		if (!early_drop(net, hash)) {
>  			if (!conntrack_gc_work.early_drop)
>  				conntrack_gc_work.early_drop = true;
> @@ -2650,7 +2651,7 @@ int nf_conntrack_init_start(void)
>  	if (!nf_conntrack_hash)
>  		return -ENOMEM;
>  
> -	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
> +	init_net.ct.sysctl_max = max_factor * nf_conntrack_htable_size;
>  
>  	nf_conntrack_cachep = kmem_cache_create("nf_conntrack",
>  						sizeof(struct nf_conn),
> diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
> index 2cc0fde23344..73e6bb1e939b 100644
> --- a/net/netfilter/nf_conntrack_netlink.c
> +++ b/net/netfilter/nf_conntrack_netlink.c
> @@ -2608,7 +2608,7 @@ ctnetlink_stat_ct_fill_info(struct sk_buff *skb, u32 portid, u32 seq, u32 type,
>  	if (nla_put_be32(skb, CTA_STATS_GLOBAL_ENTRIES, htonl(nr_conntracks)))
>  		goto nla_put_failure;
>  
> -	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max)))
> +	if (nla_put_be32(skb, CTA_STATS_GLOBAL_MAX_ENTRIES, htonl(nf_conntrack_max(net))))
>  		goto nla_put_failure;
>  
>  	nlmsg_end(skb, nlh);
> diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
> index 2f666751c7e7..5db6df0e4eb3 100644
> --- a/net/netfilter/nf_conntrack_standalone.c
> +++ b/net/netfilter/nf_conntrack_standalone.c
> @@ -615,7 +615,7 @@ enum nf_ct_sysctl_index {
>  static struct ctl_table nf_ct_sysctl_table[] = {
>  	[NF_SYSCTL_CT_MAX] = {
>  		.procname	= "nf_conntrack_max",
> -		.data		= &nf_conntrack_max,
> +		.data		= &init_net.ct.sysctl_max,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
>  		.proc_handler	= proc_dointvec_minmax,
> @@ -948,7 +948,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
>  static struct ctl_table nf_ct_netfilter_table[] = {
>  	{
>  		.procname	= "nf_conntrack_max",
> -		.data		= &nf_conntrack_max,
> +		.data		= &init_net.ct.sysctl_max,
>  		.maxlen		= sizeof(int),
>  		.mode		= 0644,
>  		.proc_handler	= proc_dointvec_minmax,
> @@ -1063,6 +1063,7 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
>  
>  	table[NF_SYSCTL_CT_COUNT].data = &cnet->count;
>  	table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum;
> +	table[NF_SYSCTL_CT_MAX].data = &net->ct.sysctl_max;
>  	table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid;
>  	table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct;
>  #ifdef CONFIG_NF_CONNTRACK_EVENTS
> @@ -1087,7 +1088,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
>  
>  	/* Don't allow non-init_net ns to alter global sysctls */
>  	if (!net_eq(&init_net, net)) {
> -		table[NF_SYSCTL_CT_MAX].mode = 0444;
>  		table[NF_SYSCTL_CT_EXPECT_MAX].mode = 0444;
>  		table[NF_SYSCTL_CT_BUCKETS].mode = 0444;
>  	}
> @@ -1139,6 +1139,7 @@ static int nf_conntrack_pernet_init(struct net *net)
>  	int ret;
>  
>  	net->ct.sysctl_checksum = 1;
> +	net->ct.sysctl_max = init_net.ct.sysctl_max;
>  
>  	ret = nf_conntrack_standalone_init_sysctl(net);
>  	if (ret < 0)
> -- 
> 2.40.1
> 

  parent reply	other threads:[~2025-05-22 19:24 UTC|newest]

Thread overview: 37+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-04-07  9:50 [PATCH] netfilter: netns nf_conntrack: per-netns net.netfilter.nf_conntrack_max sysctl lvxiafei
2025-04-07 10:13 ` Florian Westphal
2025-04-07 10:56   ` Jan Engelhardt
2025-04-08  8:27     ` lvxiafei
2025-04-08  8:38     ` lvxiafei
2025-04-08  8:17   ` lvxiafei
2025-04-08  9:03 ` [PATCH V2] " lvxiafei
2025-04-08  9:58   ` Florian Westphal
2025-04-08 12:39     ` lvxiafei
2025-04-08 13:28       ` Florian Westphal
2025-04-09  4:14         ` lvxiafei
2025-04-09  4:25 ` [PATCH V3] " lvxiafei
2025-04-09  7:20   ` Florian Westphal
2025-04-09  9:13     ` lvxiafei
2025-04-09  9:42       ` Florian Westphal
2025-04-10 10:02         ` lvxiafei
2025-04-10 10:53           ` Florian Westphal
2025-04-10 13:05         ` lvxiafei
2025-04-10 13:17           ` Florian Westphal
2025-04-10 14:16             ` Florian Westphal
2025-04-11  4:09               ` lvxiafei
2025-04-12 14:37 ` [PATCH V4] " lvxiafei
2025-04-12 17:26 ` [PATCH V5] " lvxiafei
2025-04-12 17:30   ` lvxiafei
2025-04-12 21:16   ` Jakub Kicinski
2025-04-13  1:14     ` lvxiafei
2025-04-13  9:07   ` Florian Westphal
2025-04-14  3:04     ` lvxiafei
2025-04-15  9:08 ` [PATCH V6] " lvxiafei
2025-04-27  8:14   ` lvxiafei
2025-04-28  9:40     ` Pablo Neira Ayuso
2025-05-22 19:24   ` Pablo Neira Ayuso [this message]
2025-05-22 19:32     ` Florian Westphal
2025-05-22 19:58       ` Pablo Neira Ayuso
2025-05-23  9:21         ` lvxiafei
2025-10-14 13:54           ` Florian Westphal
2025-12-01 11:08             ` lvxiafei

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=aC96AHaQX9WVtln5@calendula \
    --to=pablo@netfilter.org \
    --cc=coreteam@netfilter.org \
    --cc=davem@davemloft.net \
    --cc=edumazet@google.com \
    --cc=horms@kernel.org \
    --cc=kadlec@netfilter.org \
    --cc=kuba@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lvxiafei@sensetime.com \
    --cc=netdev@vger.kernel.org \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=xiafei_xupt@163.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.