netdev.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Sasha Levin <sashal@kernel.org>
To: linux-kernel@vger.kernel.org, stable@vger.kernel.org
Cc: Florian Westphal <fw@strlen.de>, Karel Rericha <karel@maxtel.cz>,
	Shmulik Ladkani <shmulik.ladkani@gmail.com>,
	Eyal Birger <eyal.birger@gmail.com>,
	Pablo Neira Ayuso <pablo@netfilter.org>,
	Sasha Levin <sashal@kernel.org>,
	kadlec@netfilter.org, davem@davemloft.net, kuba@kernel.org,
	pabeni@redhat.com, netfilter-devel@vger.kernel.org,
	coreteam@netfilter.org, netdev@vger.kernel.org
Subject: [PATCH AUTOSEL 5.16 103/109] netfilter: conntrack: revisit gc autotuning
Date: Fri,  1 Apr 2022 10:32:50 -0400	[thread overview]
Message-ID: <20220401143256.1950537-103-sashal@kernel.org> (raw)
In-Reply-To: <20220401143256.1950537-1-sashal@kernel.org>

From: Florian Westphal <fw@strlen.de>

[ Upstream commit 2cfadb761d3d0219412fd8150faea60c7e863833 ]

As of commit 4608fdfc07e1
("netfilter: conntrack: collect all entries in one cycle"),
conntrack gc was changed to run every 2 minutes.

On systems where the conntrack hash table is set to a large value, most
evictions happen from the gc worker rather than the packet path due to
hash table distribution.

This causes netlink event overflows when events are collected.

This change collects the average expiry of scanned entries and reschedules
to the average remaining value, clamped to an interval of 1 to 60 seconds.

To avoid event overflows, reschedule after each bucket and add a
limit for both run time and number of evictions per run.

If more entries have to be evicted, reschedule and restart 1 jiffy
into the future.

Reported-by: Karel Rericha <karel@maxtel.cz>
Cc: Shmulik Ladkani <shmulik.ladkani@gmail.com>
Cc: Eyal Birger <eyal.birger@gmail.com>
Signed-off-by: Florian Westphal <fw@strlen.de>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 net/netfilter/nf_conntrack_core.c | 85 ++++++++++++++++++++++++-------
 1 file changed, 68 insertions(+), 17 deletions(-)

diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7f7997460764..4045169ff47a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -66,6 +66,8 @@ EXPORT_SYMBOL_GPL(nf_conntrack_hash);
 struct conntrack_gc_work {
 	struct delayed_work	dwork;
 	u32			next_bucket;
+	u32			avg_timeout;
+	u32			start_time;
 	bool			exiting;
 	bool			early_drop;
 };
@@ -77,8 +79,19 @@ static __read_mostly bool nf_conntrack_locks_all;
 /* serialize hash resizes and nf_ct_iterate_cleanup */
 static DEFINE_MUTEX(nf_conntrack_mutex);
 
-#define GC_SCAN_INTERVAL	(120u * HZ)
+#define GC_SCAN_INTERVAL_MAX	(60ul * HZ)
+#define GC_SCAN_INTERVAL_MIN	(1ul * HZ)
+
+/* clamp timeouts to this value (TCP unacked) */
+#define GC_SCAN_INTERVAL_CLAMP	(300ul * HZ)
+
+/* large initial bias so that we don't scan often just because we have
+ * three entries with a 1s timeout.
+ */
+#define GC_SCAN_INTERVAL_INIT	INT_MAX
+
 #define GC_SCAN_MAX_DURATION	msecs_to_jiffies(10)
+#define GC_SCAN_EXPIRED_MAX	(64000u / HZ)
 
 #define MIN_CHAINLEN	8u
 #define MAX_CHAINLEN	(32u - MIN_CHAINLEN)
@@ -1420,16 +1433,28 @@ static bool gc_worker_can_early_drop(const struct nf_conn *ct)
 
 static void gc_worker(struct work_struct *work)
 {
-	unsigned long end_time = jiffies + GC_SCAN_MAX_DURATION;
 	unsigned int i, hashsz, nf_conntrack_max95 = 0;
-	unsigned long next_run = GC_SCAN_INTERVAL;
+	u32 end_time, start_time = nfct_time_stamp;
 	struct conntrack_gc_work *gc_work;
+	unsigned int expired_count = 0;
+	unsigned long next_run;
+	s32 delta_time;
+
 	gc_work = container_of(work, struct conntrack_gc_work, dwork.work);
 
 	i = gc_work->next_bucket;
 	if (gc_work->early_drop)
 		nf_conntrack_max95 = nf_conntrack_max / 100u * 95u;
 
+	if (i == 0) {
+		gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT;
+		gc_work->start_time = start_time;
+	}
+
+	next_run = gc_work->avg_timeout;
+
+	end_time = start_time + GC_SCAN_MAX_DURATION;
+
 	do {
 		struct nf_conntrack_tuple_hash *h;
 		struct hlist_nulls_head *ct_hash;
@@ -1446,6 +1471,7 @@ static void gc_worker(struct work_struct *work)
 
 		hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) {
 			struct nf_conntrack_net *cnet;
+			unsigned long expires;
 			struct net *net;
 
 			tmp = nf_ct_tuplehash_to_ctrack(h);
@@ -1455,11 +1481,29 @@ static void gc_worker(struct work_struct *work)
 				continue;
 			}
 
+			if (expired_count > GC_SCAN_EXPIRED_MAX) {
+				rcu_read_unlock();
+
+				gc_work->next_bucket = i;
+				gc_work->avg_timeout = next_run;
+
+				delta_time = nfct_time_stamp - gc_work->start_time;
+
+				/* re-sched immediately if total cycle time is exceeded */
+				next_run = delta_time < (s32)GC_SCAN_INTERVAL_MAX;
+				goto early_exit;
+			}
+
 			if (nf_ct_is_expired(tmp)) {
 				nf_ct_gc_expired(tmp);
+				expired_count++;
 				continue;
 			}
 
+			expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP);
+			next_run += expires;
+			next_run /= 2u;
+
 			if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp))
 				continue;
 
@@ -1477,8 +1521,10 @@ static void gc_worker(struct work_struct *work)
 				continue;
 			}
 
-			if (gc_worker_can_early_drop(tmp))
+			if (gc_worker_can_early_drop(tmp)) {
 				nf_ct_kill(tmp);
+				expired_count++;
+			}
 
 			nf_ct_put(tmp);
 		}
@@ -1491,33 +1537,38 @@ static void gc_worker(struct work_struct *work)
 		cond_resched();
 		i++;
 
-		if (time_after(jiffies, end_time) && i < hashsz) {
+		delta_time = nfct_time_stamp - end_time;
+		if (delta_time > 0 && i < hashsz) {
+			gc_work->avg_timeout = next_run;
 			gc_work->next_bucket = i;
 			next_run = 0;
-			break;
+			goto early_exit;
 		}
 	} while (i < hashsz);
 
+	gc_work->next_bucket = 0;
+
+	next_run = clamp(next_run, GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_MAX);
+
+	delta_time = max_t(s32, nfct_time_stamp - gc_work->start_time, 1);
+	if (next_run > (unsigned long)delta_time)
+		next_run -= delta_time;
+	else
+		next_run = 1;
+
+early_exit:
 	if (gc_work->exiting)
 		return;
 
-	/*
-	 * Eviction will normally happen from the packet path, and not
-	 * from this gc worker.
-	 *
-	 * This worker is only here to reap expired entries when system went
-	 * idle after a busy period.
-	 */
-	if (next_run) {
+	if (next_run)
 		gc_work->early_drop = false;
-		gc_work->next_bucket = 0;
-	}
+
 	queue_delayed_work(system_power_efficient_wq, &gc_work->dwork, next_run);
 }
 
 static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work)
 {
-	INIT_DEFERRABLE_WORK(&gc_work->dwork, gc_worker);
+	INIT_DELAYED_WORK(&gc_work->dwork, gc_worker);
 	gc_work->exiting = false;
 }
 
-- 
2.34.1


  parent reply	other threads:[~2022-04-01 14:51 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <20220401143256.1950537-1-sashal@kernel.org>
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 002/109] ath5k: fix OOB in ath5k_eeprom_read_pcal_info_5111 Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 013/109] ptp: replace snprintf with sysfs_emit Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 014/109] selftests, xsk: Fix bpf_res cleanup test Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 019/109] ath11k: fix kernel panic during unload/load ath11k modules Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 020/109] ath11k: pci: fix crash on suspend if board file is not found Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 021/109] ath11k: mhi: use mhi_sync_power_up() Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 022/109] net/smc: Send directly when TCP_CORK is cleared Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 024/109] bpf: Make dst_port field in struct bpf_sock 16-bit wide Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 029/109] mt76: mt7921: fix crash when startup fails Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 030/109] mt76: dma: initialize skip_unmap in mt76_dma_rx_fill Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 031/109] cfg80211: don't add non transmitted BSS to 6GHz scanned channels Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 032/109] libbpf: Fix build issue with llvm-readelf Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 033/109] ipv6: make mc_forwarding atomic Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 034/109] net: initialize init_net earlier Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 044/109] tcp: Don't acquire inet_listen_hashbucket::lock with disabled BH Sasha Levin
2022-04-01 14:31 ` [PATCH AUTOSEL 5.16 051/109] net/mlx5e: Disable TX queues before registering the netdev Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 053/109] iwlwifi: mvm: Correctly set fragmented EBS Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 054/109] iwlwifi: mvm: Passively scan non PSC channels only when requested so Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 055/109] iwlwifi: fix small doc mistake for iwl_fw_ini_addr_val Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 056/109] iwlwifi: mvm: move only to an enabled channel Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 058/109] rtw89: fix RCU usage in rtw89_core_txq_push() Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 059/109] ipv4: Invalidate neighbour for broadcast address upon address addition Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 069/109] mt76: mt7915: fix injected MPDU transmission to not use HW A-MSDU Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 072/109] mt76: mt7615: Fix assigning negative values to unsigned variable Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 076/109] net/smc: correct settings of RMB window update limit Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 078/109] macvtap: advertise link netns via netlink Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 079/109] tuntap: add sanity checks about msg_controllen in sendmsg Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 081/109] Bluetooth: Fix not checking for valid hdev on bt_dev_{info,warn,err,dbg} Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 082/109] Bluetooth: use memset avoid memory leaks Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 083/109] bnxt_en: Eliminate unintended link toggle during FW reset Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 086/109] powerpc/64e: Tie PPC_BOOK3E_64 to PPC_FSL_BOOK3E Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 089/109] can: isotp: set default value for N_As to 50 micro seconds Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 090/109] can: etas_es58x: es58x_fd_rx_event_msg(): initialize rx_event_msg before calling es58x_check_msg_len() Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 092/109] net: account alternate interface name memory Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 093/109] net: limit altnames to 64k total Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 094/109] net/mlx5e: Remove overzealous validations in netlink EEPROM query Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 097/109] net: sfp: add 2500base-X quirk for Lantech SFP module Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 099/109] mt76: fix monitor mode crash with sdio driver Sasha Levin
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 102/109] Bluetooth: Fix use after free in hci_send_acl Sasha Levin
2022-04-01 14:32 ` Sasha Levin [this message]
2022-04-01 14:32 ` [PATCH AUTOSEL 5.16 104/109] netlabel: fix out-of-bounds memory accesses Sasha Levin

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220401143256.1950537-103-sashal@kernel.org \
    --to=sashal@kernel.org \
    --cc=coreteam@netfilter.org \
    --cc=davem@davemloft.net \
    --cc=eyal.birger@gmail.com \
    --cc=fw@strlen.de \
    --cc=kadlec@netfilter.org \
    --cc=karel@maxtel.cz \
    --cc=kuba@kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=pablo@netfilter.org \
    --cc=shmulik.ladkani@gmail.com \
    --cc=stable@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).