netfilter-devel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Richard Gobert <richardbgobert@gmail.com>
To: davem@davemloft.net, edumazet@google.com, kuba@kernel.org,
	pabeni@redhat.com, corbet@lwn.net, yoshfuji@linux-ipv6.org,
	dsahern@kernel.org, alex.aring@gmail.com,
	stefan@datenfreihafen.org, pablo@netfilter.org,
	kadlec@netfilter.org, fw@strlen.de, kafai@fb.com,
	netdev@vger.kernel.org, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, linux-wpan@vger.kernel.org,
	netfilter-devel@vger.kernel.org, coreteam@netfilter.org
Subject: [PATCH 4/4] net-next: frags: dynamic timeout under load
Date: Mon, 29 Aug 2022 13:47:49 +0200	[thread overview]
Message-ID: <20220829114739.GA2436@debian> (raw)

Calculate a dynamic fragment reassembly timeout, taking into
consideration the current fqdir load and the load introduced by
the peer. Reintroduce low_thresh, which now acts as a knob for
adjusting per-peer memory limits.

Signed-off-by: Richard Gobert <richardbgobert@gmail.com>
---
 Documentation/networking/ip-sysctl.rst |  3 +++
 include/net/inet_frag.h                |  1 +
 net/ipv4/inet_fragment.c               | 30 +++++++++++++++++++++++++-
 net/ipv4/ip_fragment.c                 |  2 +-
 4 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/Documentation/networking/ip-sysctl.rst b/Documentation/networking/ip-sysctl.rst
index 56cd4ea059b2..fb25aa6e22a2 100644
--- a/Documentation/networking/ip-sysctl.rst
+++ b/Documentation/networking/ip-sysctl.rst
@@ -247,6 +247,9 @@ ipfrag_low_thresh - LONG INTEGER
 	begins to remove incomplete fragment queues to free up resources.
 	The kernel still accepts new fragments for defragmentation.
 
+	(Since linux-6.1)
+	Maximum memory used to reassemble IP fragments sent by a single peer.
+
 ipfrag_time - INTEGER
 	Time in seconds to keep an IP fragment in memory.
 
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 077a0ec78a58..595a6db57a0e 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -99,6 +99,7 @@ struct inet_frag_queue {
 	u16			max_size;
 	struct fqdir		*fqdir;
 	struct inet_peer	*peer;
+	u64			timeout;
 	struct rcu_head		rcu;
 };
 
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index 8b8d77d548d4..34c5ebba4951 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -314,6 +314,30 @@ void inet_frag_free(struct inet_frag_queue *q)
 	call_rcu(&q->rcu, inet_frag_destroy_rcu);
 }
 
+static int inet_frag_update_timeout(struct inet_frag_queue *q)
+{
+	u64 peer_timeout, inet_timeout;
+	long peer_mem, inet_mem;
+	long high_thresh = READ_ONCE(q->fqdir->high_thresh);
+	long low_thresh  = READ_ONCE(q->fqdir->low_thresh);
+	u64 base_timeout = READ_ONCE(q->fqdir->timeout);
+
+	peer_mem = low_thresh - peer_mem_limit(q);
+	inet_mem = high_thresh - frag_mem_limit(q->fqdir);
+
+	if (peer_mem <= 0 || inet_mem <= 0)
+		return -ENOMEM;
+
+	/* Scale the base timeout by the free share of each limit
+	 * (peer_mem / low_thresh and inet_mem / high_thresh) and keep the
+	 * larger value: the timeout shrinks only when both shares are small.
+	 */
+	peer_timeout = div64_long(base_timeout * peer_mem, low_thresh);
+	inet_timeout = div64_long(base_timeout * inet_mem, high_thresh);
+	q->timeout = max_t(u64, peer_timeout, inet_timeout);
+	return 0;
+}
+
 void inet_frag_destroy(struct inet_frag_queue *q)
 {
 	struct fqdir *fqdir;
@@ -346,6 +370,10 @@ static struct inet_frag_queue *inet_frag_alloc(struct fqdir *fqdir,
 
 	q->fqdir = fqdir;
 	f->constructor(q, arg);
+	if (inet_frag_update_timeout(q)) {
+		inet_frag_free(q);
+		return NULL;
+	}
 	add_frag_mem_limit(q, f->qsize);
 
 	timer_setup(&q->timer, f->frag_expire, 0);
@@ -367,7 +395,7 @@ static struct inet_frag_queue *inet_frag_create(struct fqdir *fqdir,
 		*prev = ERR_PTR(-ENOMEM);
 		return NULL;
 	}
-	mod_timer(&q->timer, jiffies + fqdir->timeout);
+	mod_timer(&q->timer, jiffies + q->timeout);
 
 	*prev = rhashtable_lookup_get_insert_key(&fqdir->rhashtable, &q->key,
 						 &q->node, f->rhash_params);
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index e35061f6aadb..88a99242d721 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -236,7 +236,7 @@ static int ip_frag_reinit(struct ipq *qp)
 {
 	unsigned int sum_truesize = 0;
 
-	if (!mod_timer(&qp->q.timer, jiffies + qp->q.fqdir->timeout)) {
+	if (!mod_timer(&qp->q.timer, jiffies + qp->q.timeout)) {
 		refcount_inc(&qp->q.refcnt);
 		return -ETIMEDOUT;
 	}
-- 
2.36.1


             reply	other threads:[~2022-08-29 12:11 UTC|newest]

Thread overview: 2+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-08-29 11:47 Richard Gobert [this message]
2022-08-29 17:15 ` [PATCH 4/4] net-next: frags: dynamic timeout under load Eric Dumazet

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20220829114739.GA2436@debian \
    --to=richardbgobert@gmail.com \
    --cc=alex.aring@gmail.com \
    --cc=corbet@lwn.net \
    --cc=coreteam@netfilter.org \
    --cc=davem@davemloft.net \
    --cc=dsahern@kernel.org \
    --cc=edumazet@google.com \
    --cc=fw@strlen.de \
    --cc=kadlec@netfilter.org \
    --cc=kafai@fb.com \
    --cc=kuba@kernel.org \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-wpan@vger.kernel.org \
    --cc=netdev@vger.kernel.org \
    --cc=netfilter-devel@vger.kernel.org \
    --cc=pabeni@redhat.com \
    --cc=pablo@netfilter.org \
    --cc=stefan@datenfreihafen.org \
    --cc=yoshfuji@linux-ipv6.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

  Be sure your reply has a Subject: header at the top and a blank line
  before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for NNTP newsgroup(s).