Linux cgroups development
 help / color / mirror / Atom feed
From: Thadeu Lima de Souza Cascardo <cascardo@igalia.com>
To: "Tejun Heo" <tj@kernel.org>,
	"Johannes Weiner" <hannes@cmpxchg.org>,
	"Michal Koutný" <mkoutny@suse.com>,
	"Michal Hocko" <mhocko@kernel.org>,
	"Roman Gushchin" <roman.gushchin@linux.dev>,
	"Shakeel Butt" <shakeel.butt@linux.dev>,
	"Muchun Song" <muchun.song@linux.dev>,
	"Andrew Morton" <akpm@linux-foundation.org>,
	"Jonathan Corbet" <corbet@lwn.net>,
	"Shuah Khan" <skhan@linuxfoundation.org>,
	"Maarten Lankhorst" <dev@lankhorst.se>,
	"Maxime Ripard" <mripard@kernel.org>,
	"Natalie Vock" <natalie.vock@gmx.de>,
	"Tvrtko Ursulin" <tvrtko.ursulin@igalia.com>
Cc: cgroups@vger.kernel.org, linux-kernel@vger.kernel.org,
	 linux-mm@kvack.org, linux-doc@vger.kernel.org,
	 dri-devel@lists.freedesktop.org,
	 Thadeu Lima de Souza Cascardo <cascardo@igalia.com>,
	kernel-dev@igalia.com
Subject: [PATCH 2/2] cgroup/dmem: introduce a peak file
Date: Wed, 06 May 2026 08:58:25 -0300	[thread overview]
Message-ID: <20260506-dmem_peak-v1-2-8d803eb3449c@igalia.com> (raw)
In-Reply-To: <20260506-dmem_peak-v1-0-8d803eb3449c@igalia.com>

Just like we have memory.peak, introduce a dmem.peak, which uses the
page_counter support for that.

It can be written to in order to reset the peak. However, unlike
memory.peak, which accepts any write, dmem.peak expects a region name to
be written to it. That region's peak is the one that is reset.

That requires ofp_peak to carry a pointer to the pool that was reset.

Writing a different region name will reset that region's peak instead,
and the original region's peak will revert to its non-reset value.

Signed-off-by: Thadeu Lima de Souza Cascardo <cascardo@igalia.com>
---
 Documentation/admin-guide/cgroup-v2.rst |  10 +++
 include/linux/cgroup-defs.h             |   1 +
 kernel/cgroup/dmem.c                    | 132 ++++++++++++++++++++++++++++++--
 3 files changed, 137 insertions(+), 6 deletions(-)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
index 6efd0095ed99..3ba7ab3a36b3 100644
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2808,6 +2808,16 @@ DMEM Interface Files
 	The semantics are the same as for the memory cgroup controller, and are
 	calculated in the same way.
 
+  dmem.peak
+	A readwrite nested-keyed file that exists on non-root cgroups.
+
+	The max memory usage recorded for the cgroup and its descendants since
+	either the creation of the cgroup or the most recent reset for that FD.
+
+	A write of a region name to this file resets it to the current memory
+	usage for subsequent reads through the same file descriptor for that
+	region.
+
   dmem.capacity
 	A read-only file that describes maximum region capacity.
 	It only exists on the root cgroup. Not all memory can be
diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h
index a85044cb0553..b536054bd916 100644
--- a/include/linux/cgroup-defs.h
+++ b/include/linux/cgroup-defs.h
@@ -874,6 +874,7 @@ extern bool cgroup_enable_per_threadgroup_rwsem;
 struct cgroup_of_peak {
 	unsigned long		value;
 	struct list_head	list;
+	struct dmem_cgroup_pool_state *pool;
 };
 
 void of_peak_reset(struct cgroup_of_peak *ofp, struct page_counter *pc,
diff --git a/kernel/cgroup/dmem.c b/kernel/cgroup/dmem.c
index 1ab1fb47f271..afa380c9839b 100644
--- a/kernel/cgroup/dmem.c
+++ b/kernel/cgroup/dmem.c
@@ -57,6 +57,9 @@ struct dmemcg_state {
 	struct cgroup_subsys_state css;
 
 	struct list_head pools;
+
+	/** @peaks_lock: Protects access to the pools' peaks lists */
+	spinlock_t peaks_lock;
 };
 
 struct dmem_cgroup_pool_state {
@@ -72,6 +75,10 @@ struct dmem_cgroup_pool_state {
 	struct rcu_head rcu;
 
 	struct page_counter cnt;
+
+	/* Protected by the dmemcg_state peaks_lock */
+	struct list_head peaks;
+
 	struct dmem_cgroup_pool_state *parent;
 
 	refcount_t ref;
@@ -162,26 +169,45 @@ set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
 	page_counter_set_max(&pool->cnt, val);
 }
 
-static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
+static u64 get_resource_low(struct seq_file *sf, struct dmem_cgroup_pool_state *pool)
 {
 	return pool ? READ_ONCE(pool->cnt.low) : 0;
 }
 
-static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
+static u64 get_resource_min(struct seq_file *sf, struct dmem_cgroup_pool_state *pool)
 {
 	return pool ? READ_ONCE(pool->cnt.min) : 0;
 }
 
-static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
+static u64 get_resource_max(struct seq_file *sf, struct dmem_cgroup_pool_state *pool)
 {
 	return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
 }
 
-static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
+static u64 get_resource_current(struct seq_file *sf, struct dmem_cgroup_pool_state *pool)
 {
 	return pool ? page_counter_read(&pool->cnt) : 0;
 }
 
+static u64 get_resource_peak(struct seq_file *sf, struct dmem_cgroup_pool_state *pool)
+{
+	struct cgroup_of_peak *ofp = of_peak(sf->private);
+	u64 fd_peak, peak;
+	struct dmem_cgroup_pool_state *of_pool;
+
+	if (!pool)
+		return 0;
+
+	of_pool = READ_ONCE(ofp->pool);
+
+	fd_peak = READ_ONCE(ofp->value);
+	if (of_pool != pool || fd_peak == OFP_PEAK_UNSET)
+		peak = pool->cnt.watermark;
+	else
+		peak = max(fd_peak, READ_ONCE(pool->cnt.local_watermark));
+	return peak;
+}
+
 static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
 {
 	set_resource_min(rpool, 0);
@@ -227,6 +253,7 @@ dmemcs_alloc(struct cgroup_subsys_state *parent_css)
 		return ERR_PTR(-ENOMEM);
 
 	INIT_LIST_HEAD(&dmemcs->pools);
+	spin_lock_init(&dmemcs->peaks_lock);
 	return &dmemcs->css;
 }
 
@@ -377,6 +404,7 @@ alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region
 			  ppool ? &ppool->cnt : NULL, true);
 	reset_all_resource_limits(pool);
 	refcount_set(&pool->ref, 1);
+	INIT_LIST_HEAD(&pool->peaks);
 	kref_get(&region->ref);
 	if (ppool && !pool->parent) {
 		pool->parent = ppool;
@@ -784,7 +812,7 @@ static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
 }
 
 static int dmemcg_limit_show(struct seq_file *sf, void *v,
-			    u64 (*fn)(struct dmem_cgroup_pool_state *))
+			    u64 (*fn)(struct seq_file *, struct dmem_cgroup_pool_state *))
 {
 	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
 	struct dmem_cgroup_region *region;
@@ -796,7 +824,7 @@ static int dmemcg_limit_show(struct seq_file *sf, void *v,
 
 		seq_puts(sf, region->name);
 
-		val = fn(pool);
+		val = fn(sf, pool);
 		if (val < PAGE_COUNTER_MAX)
 			seq_printf(sf, " %lld\n", val);
 		else
@@ -807,6 +835,90 @@ static int dmemcg_limit_show(struct seq_file *sf, void *v,
 	return 0;
 }
 
+static int dmem_cgroup_region_peak_open(struct kernfs_open_file *of)
+{
+	struct cgroup_of_peak *ofp = of_peak(of);
+
+	ofp->value = OFP_PEAK_UNSET;
+
+	return 0;
+}
+
+static void dmem_cgroup_region_peak_remove(struct cgroup_of_peak *ofp)
+{
+	struct dmem_cgroup_pool_state *pool;
+	struct dmemcg_state *dmemcs;
+
+	pool = xchg(&ofp->pool, NULL);
+	if (!pool)
+		return;
+
+	dmemcs = pool->cs;
+
+	spin_lock(&dmemcs->peaks_lock);
+	list_del(&ofp->list);
+	spin_unlock(&dmemcs->peaks_lock);
+
+	WRITE_ONCE(ofp->value, OFP_PEAK_UNSET);
+
+	dmemcg_pool_put(pool);
+}
+
+static void dmem_cgroup_region_peak_release(struct kernfs_open_file *of)
+{
+	struct cgroup_of_peak *ofp = of_peak(of);
+
+	if (ofp->value == OFP_PEAK_UNSET) {
+		/* fast path (no writes on this fd) */
+		return;
+	}
+
+	dmem_cgroup_region_peak_remove(ofp);
+}
+
+static ssize_t dmem_cgroup_region_peak_write(struct kernfs_open_file *of,
+					     char *buf, size_t nbytes, loff_t off)
+{
+	struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
+	struct cgroup_of_peak *ofp = of_peak(of);
+	struct dmem_cgroup_pool_state *pool = NULL;
+	struct dmem_cgroup_region *region;
+	int err = 0;
+
+	buf = strstrip(buf);
+	if (!buf[0])
+		return -EINVAL;
+
+	rcu_read_lock();
+	region = dmemcg_get_region_by_name(buf);
+	rcu_read_unlock();
+
+	if (!region)
+		return -EINVAL;
+
+	pool = get_cg_pool_unlocked(dmemcs, region);
+	if (IS_ERR(pool)) {
+		err = PTR_ERR(pool);
+		goto out_put;
+	}
+
+	dmem_cgroup_region_peak_remove(ofp);
+
+	xchg(&ofp->pool, pool);
+	spin_lock(&dmemcs->peaks_lock);
+	of_peak_reset(ofp, &pool->cnt, &pool->peaks);
+	spin_unlock(&dmemcs->peaks_lock);
+
+out_put:
+	kref_put(&region->ref, dmemcg_free_region);
+	return err ?: nbytes;
+}
+
+static int dmem_cgroup_region_peak_show(struct seq_file *sf, void *v)
+{
+	return dmemcg_limit_show(sf, v, get_resource_peak);
+}
+
 static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
 {
 	return dmemcg_limit_show(sf, v, get_resource_current);
@@ -855,6 +967,14 @@ static struct cftype files[] = {
 		.name = "current",
 		.seq_show = dmem_cgroup_region_current_show,
 	},
+	{
+		.name = "peak",
+		.open = dmem_cgroup_region_peak_open,
+		.release = dmem_cgroup_region_peak_release,
+		.write = dmem_cgroup_region_peak_write,
+		.seq_show = dmem_cgroup_region_peak_show,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
 	{
 		.name = "min",
 		.write = dmem_cgroup_region_min_write,

-- 
2.47.3


  parent reply	other threads:[~2026-05-06 12:00 UTC|newest]

Thread overview: 6+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-05-06 11:58 [PATCH 0/2] cgroup/dmem: introduce a peak file Thadeu Lima de Souza Cascardo
2026-05-06 11:58 ` [PATCH 1/2] mm/page_counter: decouple peak_reset from peak_write Thadeu Lima de Souza Cascardo
2026-05-06 11:58 ` Thadeu Lima de Souza Cascardo [this message]
2026-05-06 13:53 ` [PATCH 0/2] cgroup/dmem: introduce a peak file Michal Koutný
2026-05-06 14:18   ` Thadeu Lima de Souza Cascardo
2026-05-06 15:09     ` Michal Koutný

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260506-dmem_peak-v1-2-8d803eb3449c@igalia.com \
    --to=cascardo@igalia.com \
    --cc=akpm@linux-foundation.org \
    --cc=cgroups@vger.kernel.org \
    --cc=corbet@lwn.net \
    --cc=dev@lankhorst.se \
    --cc=dri-devel@lists.freedesktop.org \
    --cc=hannes@cmpxchg.org \
    --cc=kernel-dev@igalia.com \
    --cc=linux-doc@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=mhocko@kernel.org \
    --cc=mkoutny@suse.com \
    --cc=mripard@kernel.org \
    --cc=muchun.song@linux.dev \
    --cc=natalie.vock@gmx.de \
    --cc=roman.gushchin@linux.dev \
    --cc=shakeel.butt@linux.dev \
    --cc=skhan@linuxfoundation.org \
    --cc=tj@kernel.org \
    --cc=tvrtko.ursulin@igalia.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox