[PATCH 2/2] mm/vmpressure: split v1 userspace eventfd code into vmpressure-v1.c

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Usama Arif <usama.arif@linux.dev>
To: Andrew Morton <akpm@linux-foundation.org>,
	david@kernel.org, linux-mm@kvack.org
Cc: hannes@cmpxchg.org, tj@kernel.org, mkoutny@suse.com,
	shakeel.butt@linux.dev, roman.gushchin@linux.dev,
	liam@infradead.org, linux-kernel@vger.kernel.org, ljs@kernel.org,
	mhocko@suse.com, rppt@kernel.org, surenb@google.com,
	vbabka@kernel.org, kernel-team@meta.com,
	Usama Arif <usama.arif@linux.dev>
Subject: [PATCH 2/2] mm/vmpressure: split v1 userspace eventfd code into vmpressure-v1.c
Date: Sat,  6 Jun 2026 04:41:34 -0700	[thread overview]
Message-ID: <20260606114158.3126210-3-usama.arif@linux.dev> (raw)
In-Reply-To: <20260606114158.3126210-1-usama.arif@linux.dev>

Clean up mm/vmpressure.c by separating the cgroup v1 userspace eventfd
interface from the shared and v2 in-kernel code.

Currently, almost half of mm/vmpressure.c exists to serve tree=true:
struct vmpressure_event, the events list and its mutex, the work_struct
and vmpressure_work_fn that drains tree_scanned/tree_reclaimed, the
parent walk, vmpressure_event(), vmpressure_register_event(),
vmpressure_unregister_event(), and vmpressure_prio() (which always
calls vmpressure() with tree=true).

Move it all into a new mm/vmpressure-v1.c built only when
CONFIG_MEMCG_V1=y (following the existing memcontrol-v1.o pattern).

vmpressure.c keeps the shared bits (constants, vmpressure_calc_level,
the runtime hierarchy check, the tree=false body, init/cleanup
plumbing) and calls into three small v1 hooks for the tree=true
accumulator and the v1 portions of init/cleanup. The hooks have
static-inline no-op stubs in include/linux/vmpressure.h for the
!MEMCG_V1 case, so callers don't need ifdefs. vmpressure_prio() gets
the same treatment, which means vmscan.c's call site disappears at
compile time on v2-only kernels.

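The only #ifdef CONFIG_MEMCG_V1 blocks left are in
include/linux/vmpressure.h: one around the v1-only fields inside struct
vmpressure, and one selecting between the v1 prototypes and their no-op
stubs. Neither .c file needs one.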

Memory savings on CONFIG_MEMCG_V1=n (measured with pahole):

  struct vmpressure :  112B ->   24B
  struct mem_cgroup : 1664B -> 1536B

Signed-off-by: Usama Arif <usama.arif@linux.dev>
---
 include/linux/vmpressure.h |  46 +++++-
 mm/Makefile                |   2 +-
 mm/vmpressure-v1.c         | 305 +++++++++++++++++++++++++++++++++++++
 mm/vmpressure.c            | 293 ++---------------------------------
 4 files changed, 358 insertions(+), 288 deletions(-)
 create mode 100644 mm/vmpressure-v1.c

diff --git a/include/linux/vmpressure.h b/include/linux/vmpressure.h
index faecd5522401..e5e6b68d0dc4 100644
--- a/include/linux/vmpressure.h
+++ b/include/linux/vmpressure.h
@@ -13,18 +13,31 @@
 struct vmpressure {
 	unsigned long scanned;
 	unsigned long reclaimed;
+	/* The lock is used to keep the scanned/reclaimed in sync. */
+	spinlock_t sr_lock;
 
+#ifdef CONFIG_MEMCG_V1
+	/*
+	 * tree=true accumulators feed the v1 userspace eventfd interface
+	 * (memory.pressure_level). Drained by @work. v2 has no equivalent
+	 * interface, so this state is omitted on CONFIG_MEMCG_V1=n builds.
+	 */
 	unsigned long tree_scanned;
 	unsigned long tree_reclaimed;
-	/* The lock is used to keep the scanned/reclaimed above in sync. */
-	spinlock_t sr_lock;
-
 	/* The list of vmpressure_event structs. */
 	struct list_head events;
 	/* Have to grab the lock on events traversal or modifications. */
 	struct mutex events_lock;
 
 	struct work_struct work;
+#endif
+};
+
+enum vmpressure_levels {
+	VMPRESSURE_LOW = 0,
+	VMPRESSURE_MEDIUM,
+	VMPRESSURE_CRITICAL,
+	VMPRESSURE_NUM_LEVELS,
 };
 
 struct mem_cgroup;
@@ -32,18 +45,41 @@ struct mem_cgroup;
 #ifdef CONFIG_MEMCG
 void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree,
 		unsigned long scanned, unsigned long reclaimed);
-extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
-
 extern void vmpressure_init(struct vmpressure *vmpr);
 extern void vmpressure_cleanup(struct vmpressure *vmpr);
 extern struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg);
 extern struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr);
+
+/* Shared with mm/vmpressure-v1.c. */
+extern const unsigned long vmpressure_win;
+extern enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
+						    unsigned long reclaimed);
+
+#ifdef CONFIG_MEMCG_V1
+extern void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio);
 extern int vmpressure_register_event(struct mem_cgroup *memcg,
 				     struct eventfd_ctx *eventfd,
 				     const char *args);
 extern void vmpressure_unregister_event(struct mem_cgroup *memcg,
 					struct eventfd_ctx *eventfd);
+
+/* v1 hooks called from mm/vmpressure.c; no-ops below when !MEMCG_V1. */
+extern void vmpressure_v1_init(struct vmpressure *vmpr);
+extern void vmpressure_v1_cleanup(struct vmpressure *vmpr);
+extern void vmpressure_v1_account_tree(struct vmpressure *vmpr,
+				       unsigned long scanned,
+				       unsigned long reclaimed);
 #else
+static inline void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg,
+				   int prio) {}
+static inline void vmpressure_v1_init(struct vmpressure *vmpr) {}
+static inline void vmpressure_v1_cleanup(struct vmpressure *vmpr) {}
+static inline void vmpressure_v1_account_tree(struct vmpressure *vmpr,
+					      unsigned long scanned,
+					      unsigned long reclaimed) {}
+#endif /* CONFIG_MEMCG_V1 */
+
+#else /* !CONFIG_MEMCG */
 static inline void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg,
 			      bool tree, unsigned long scanned,
 			      unsigned long reclaimed) {}
diff --git a/mm/Makefile b/mm/Makefile
index eff9f9e7e061..282688f6a543 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -101,7 +101,7 @@ obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o
 obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o
 obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
 obj-$(CONFIG_LIVEUPDATE_MEMFD) += memfd_luo.o
-obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o
+obj-$(CONFIG_MEMCG_V1) += memcontrol-v1.o vmpressure-v1.o
 obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
 ifdef CONFIG_BPF_SYSCALL
 obj-$(CONFIG_MEMCG) += bpf_memcontrol.o
diff --git a/mm/vmpressure-v1.c b/mm/vmpressure-v1.c
new file mode 100644
index 000000000000..fd813cba0544
--- /dev/null
+++ b/mm/vmpressure-v1.c
@@ -0,0 +1,305 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * cgroup v1 userspace vmpressure interface (memory.pressure_level /
+ * cgroup.event_control). Split out of mm/vmpressure.c so that v2-only
+ * kernels (CONFIG_MEMCG_V1=n) drop the whole eventfd accumulator,
+ * its work item, and the per-memcg state it requires.
+ */
+
+#include <linux/cgroup.h>
+#include <linux/eventfd.h>
+#include <linux/list.h>
+#include <linux/log2.h>
+#include <linux/memcontrol.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/swap.h>
+#include <linux/vmpressure.h>
+#include <linux/workqueue.h>
+
+/*
+ * When there are too few pages left to scan, vmpressure() may miss the
+ * critical pressure as the number of pages will be less than "window size".
+ * However, in that case the vmscan priority will rise fast as the
+ * reclaimer will try to scan LRUs more deeply.
+ *
+ * The vmscan logic considers these special priorities:
+ *
+ * prio == DEF_PRIORITY (12): reclaimer starts with that value
+ * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
+ * prio == 0                : close to OOM, kernel scans every page in an lru
+ *
+ * Any value in this range is acceptable for this tunable (i.e. from 12 to
+ * 0). Current value for the vmpressure_level_critical_prio is chosen
+ * empirically, but the number, in essence, means that we consider
+ * critical level when scanning depth is ~10% of the lru size (vmscan
+ * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
+ * eighth).
+ */
+static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
+
+enum vmpressure_modes {
+	VMPRESSURE_NO_PASSTHROUGH = 0,
+	VMPRESSURE_HIERARCHY,
+	VMPRESSURE_LOCAL,
+	VMPRESSURE_NUM_MODES,
+};
+
+static const char * const vmpressure_str_levels[] = {
+	[VMPRESSURE_LOW] = "low",
+	[VMPRESSURE_MEDIUM] = "medium",
+	[VMPRESSURE_CRITICAL] = "critical",
+};
+
+static const char * const vmpressure_str_modes[] = {
+	[VMPRESSURE_NO_PASSTHROUGH] = "default",
+	[VMPRESSURE_HIERARCHY] = "hierarchy",
+	[VMPRESSURE_LOCAL] = "local",
+};
+
+struct vmpressure_event {
+	struct eventfd_ctx *efd;
+	enum vmpressure_levels level;
+	enum vmpressure_modes mode;
+	struct list_head node;
+};
+
+static struct vmpressure *work_to_vmpressure(struct work_struct *work)
+{
+	return container_of(work, struct vmpressure, work);
+}
+
+static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
+{
+	struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr);
+
+	memcg = parent_mem_cgroup(memcg);
+	if (!memcg)
+		return NULL;
+	return memcg_to_vmpressure(memcg);
+}
+
+static bool vmpressure_event(struct vmpressure *vmpr,
+			     const enum vmpressure_levels level,
+			     bool ancestor, bool signalled)
+{
+	struct vmpressure_event *ev;
+	bool ret = false;
+
+	mutex_lock(&vmpr->events_lock);
+	list_for_each_entry(ev, &vmpr->events, node) {
+		if (ancestor && ev->mode == VMPRESSURE_LOCAL)
+			continue;
+		if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
+			continue;
+		if (level < ev->level)
+			continue;
+		eventfd_signal(ev->efd);
+		ret = true;
+	}
+	mutex_unlock(&vmpr->events_lock);
+
+	return ret;
+}
+
+static void vmpressure_work_fn(struct work_struct *work)
+{
+	struct vmpressure *vmpr = work_to_vmpressure(work);
+	unsigned long scanned;
+	unsigned long reclaimed;
+	enum vmpressure_levels level;
+	bool ancestor = false;
+	bool signalled = false;
+
+	spin_lock(&vmpr->sr_lock);
+	/*
+	 * Several contexts might be calling vmpressure(), so it is
+	 * possible that the work was rescheduled again before the old
+	 * work context cleared the counters. In that case we will run
+	 * just after the old work returns, but then scanned might be zero
+	 * here. sr_lock is held so that tree_scanned and tree_reclaimed
+	 * are read and reset as a consistent pair.
+	 */
+	scanned = vmpr->tree_scanned;
+	if (!scanned) {
+		spin_unlock(&vmpr->sr_lock);
+		return;
+	}
+
+	reclaimed = vmpr->tree_reclaimed;
+	vmpr->tree_scanned = 0;
+	vmpr->tree_reclaimed = 0;
+	spin_unlock(&vmpr->sr_lock);
+
+	level = vmpressure_calc_level(scanned, reclaimed);
+
+	do {
+		if (vmpressure_event(vmpr, level, ancestor, signalled))
+			signalled = true;
+		ancestor = true;
+	} while ((vmpr = vmpressure_parent(vmpr)));
+}
+
+/*
+ * Tree-mode accumulator: accumulate per-memcg scanned/reclaimed and
+ * schedule the work that walks the parent chain and signals registered
+ * eventfd listeners once we cross the window threshold.
+ */
+void vmpressure_v1_account_tree(struct vmpressure *vmpr,
+				unsigned long scanned,
+				unsigned long reclaimed)
+{
+	spin_lock(&vmpr->sr_lock);
+	scanned = vmpr->tree_scanned += scanned;
+	vmpr->tree_reclaimed += reclaimed;
+	spin_unlock(&vmpr->sr_lock);
+
+	if (scanned < vmpressure_win)
+		return;
+	schedule_work(&vmpr->work);
+}
+
+void vmpressure_v1_init(struct vmpressure *vmpr)
+{
+	mutex_init(&vmpr->events_lock);
+	INIT_LIST_HEAD(&vmpr->events);
+	INIT_WORK(&vmpr->work, vmpressure_work_fn);
+}
+
+void vmpressure_v1_cleanup(struct vmpressure *vmpr)
+{
+	/*
+	 * Make sure there is no pending work before eventfd infrastructure
+	 * goes away.
+	 */
+	flush_work(&vmpr->work);
+}
+
+/**
+ * vmpressure_prio() - Account memory pressure through reclaimer priority level
+ * @gfp:	reclaimer's gfp mask
+ * @memcg:	cgroup memory controller handle
+ * @prio:	reclaimer's priority
+ *
+ * This function should be called from the reclaim path every time when
+ * the vmscan's reclaiming priority (scanning depth) changes.
+ *
+ * This function does not return any value.
+ */
+void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
+{
+	/*
+	 * We only use prio for accounting critical level. For more info
+	 * see comment for vmpressure_level_critical_prio variable above.
+	 */
+	if (prio > vmpressure_level_critical_prio)
+		return;
+
+	/*
+	 * OK, the prio is at or below the threshold, so update vmpressure
+	 * information before the shrinker dives into a long, deep vmscan
+	 * pass. Passing scanned = vmpressure_win, reclaimed = 0 to
+	 * vmpressure() basically means that we signal the 'critical'
+	 * level.
+	 */
+	vmpressure(gfp, 0, memcg, true, vmpressure_win, 0);
+}
+
+#define MAX_VMPRESSURE_ARGS_LEN	(strlen("critical") + strlen("hierarchy") + 2)
+
+/**
+ * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
+ * @memcg:	memcg that is interested in vmpressure notifications
+ * @eventfd:	eventfd context to link notifications with
+ * @args:	event arguments (pressure level threshold, optional mode)
+ *
+ * This function associates eventfd context with the vmpressure
+ * infrastructure, so that the notifications will be delivered to the
+ * @eventfd. The @args parameter is a comma-delimited string that denotes a
+ * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
+ * or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
+ * "default", "hierarchy" or "local").
+ *
+ * To be used as memcg event method.
+ *
+ * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
+ * not be parsed.
+ */
+int vmpressure_register_event(struct mem_cgroup *memcg,
+			      struct eventfd_ctx *eventfd, const char *args)
+{
+	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+	struct vmpressure_event *ev;
+	enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
+	enum vmpressure_levels level;
+	char *spec, *spec_orig;
+	char *token;
+	int ret = 0;
+
+	spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
+	if (!spec)
+		return -ENOMEM;
+
+	/* Find required level */
+	token = strsep(&spec, ",");
+	ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
+	if (ret < 0)
+		goto out;
+	level = ret;
+
+	/* Find optional mode */
+	token = strsep(&spec, ",");
+	if (token) {
+		ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
+		if (ret < 0)
+			goto out;
+		mode = ret;
+	}
+
+	ev = kzalloc_obj(*ev);
+	if (!ev) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ev->efd = eventfd;
+	ev->level = level;
+	ev->mode = mode;
+
+	mutex_lock(&vmpr->events_lock);
+	list_add(&ev->node, &vmpr->events);
+	mutex_unlock(&vmpr->events_lock);
+	ret = 0;
+out:
+	kfree(spec_orig);
+	return ret;
+}
+
+/**
+ * vmpressure_unregister_event() - Unbind eventfd from vmpressure
+ * @memcg:	memcg handle
+ * @eventfd:	eventfd context that was used to link vmpressure with @memcg
+ *
+ * This function does internal manipulations to detach the @eventfd from
+ * the vmpressure notifications, and then frees internal resources
+ * associated with the @eventfd (but the @eventfd itself is not freed).
+ *
+ * To be used as memcg event method.
+ */
+void vmpressure_unregister_event(struct mem_cgroup *memcg,
+				 struct eventfd_ctx *eventfd)
+{
+	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
+	struct vmpressure_event *ev;
+
+	mutex_lock(&vmpr->events_lock);
+	list_for_each_entry(ev, &vmpr->events, node) {
+		if (ev->efd != eventfd)
+			continue;
+		list_del(&ev->node);
+		kfree(ev);
+		break;
+	}
+	mutex_unlock(&vmpr->events_lock);
+}
diff --git a/mm/vmpressure.c b/mm/vmpressure.c
index c82cee1ab43b..af07db152239 100644
--- a/mm/vmpressure.c
+++ b/mm/vmpressure.c
@@ -7,16 +7,15 @@
  *
  * Based on ideas from Andrew Morton, David Rientjes, KOSAKI Motohiro,
  * Leonid Moiseichuk, Mel Gorman, Minchan Kim and Pekka Enberg.
+ *
+ * Tree-mode (cgroup v1 userspace eventfd) bookkeeping lives in
+ * mm/vmpressure-v1.c; this file holds the shared code and the in-kernel
+ * (tree=false) socket-pressure path that runs on cgroup v2.
  */
 
 #include <linux/cgroup.h>
-#include <linux/fs.h>
 #include <linux/log2.h>
-#include <linux/sched.h>
 #include <linux/mm.h>
-#include <linux/vmstat.h>
-#include <linux/eventfd.h>
-#include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/printk.h>
 #include <linux/vmpressure.h>
@@ -35,7 +34,7 @@
  * TODO: Make the window size depend on machine size, as we do for vmstat
  * thresholds. Currently we set it to 512 pages (2MB for 4KB pages).
  */
-static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
+const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
 
 /*
  * These thresholds are used when we account memory pressure through
@@ -46,68 +45,6 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16;
 static const unsigned int vmpressure_level_med = 60;
 static const unsigned int vmpressure_level_critical = 95;
 
-/*
- * When there are too little pages left to scan, vmpressure() may miss the
- * critical pressure as number of pages will be less than "window size".
- * However, in that case the vmscan priority will raise fast as the
- * reclaimer will try to scan LRUs more deeply.
- *
- * The vmscan logic considers these special priorities:
- *
- * prio == DEF_PRIORITY (12): reclaimer starts with that value
- * prio <= DEF_PRIORITY - 2 : kswapd becomes somewhat overwhelmed
- * prio == 0                : close to OOM, kernel scans every page in an lru
- *
- * Any value in this range is acceptable for this tunable (i.e. from 12 to
- * 0). Current value for the vmpressure_level_critical_prio is chosen
- * empirically, but the number, in essence, means that we consider
- * critical level when scanning depth is ~10% of the lru size (vmscan
- * scans 'lru_size >> prio' pages, so it is actually 12.5%, or one
- * eights).
- */
-static const unsigned int vmpressure_level_critical_prio = ilog2(100 / 10);
-
-static struct vmpressure *work_to_vmpressure(struct work_struct *work)
-{
-	return container_of(work, struct vmpressure, work);
-}
-
-static struct vmpressure *vmpressure_parent(struct vmpressure *vmpr)
-{
-	struct mem_cgroup *memcg = vmpressure_to_memcg(vmpr);
-
-	memcg = parent_mem_cgroup(memcg);
-	if (!memcg)
-		return NULL;
-	return memcg_to_vmpressure(memcg);
-}
-
-enum vmpressure_levels {
-	VMPRESSURE_LOW = 0,
-	VMPRESSURE_MEDIUM,
-	VMPRESSURE_CRITICAL,
-	VMPRESSURE_NUM_LEVELS,
-};
-
-enum vmpressure_modes {
-	VMPRESSURE_NO_PASSTHROUGH = 0,
-	VMPRESSURE_HIERARCHY,
-	VMPRESSURE_LOCAL,
-	VMPRESSURE_NUM_MODES,
-};
-
-static const char * const vmpressure_str_levels[] = {
-	[VMPRESSURE_LOW] = "low",
-	[VMPRESSURE_MEDIUM] = "medium",
-	[VMPRESSURE_CRITICAL] = "critical",
-};
-
-static const char * const vmpressure_str_modes[] = {
-	[VMPRESSURE_NO_PASSTHROUGH] = "default",
-	[VMPRESSURE_HIERARCHY] = "hierarchy",
-	[VMPRESSURE_LOCAL] = "local",
-};
-
 static enum vmpressure_levels vmpressure_level(unsigned long pressure)
 {
 	if (pressure >= vmpressure_level_critical)
@@ -117,8 +54,8 @@ static enum vmpressure_levels vmpressure_level(unsigned long pressure)
 	return VMPRESSURE_LOW;
 }
 
-static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
-						    unsigned long reclaimed)
+enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
+					     unsigned long reclaimed)
 {
 	unsigned long scale = scanned + reclaimed;
 	unsigned long pressure = 0;
@@ -147,74 +84,6 @@ static enum vmpressure_levels vmpressure_calc_level(unsigned long scanned,
 	return vmpressure_level(pressure);
 }
 
-struct vmpressure_event {
-	struct eventfd_ctx *efd;
-	enum vmpressure_levels level;
-	enum vmpressure_modes mode;
-	struct list_head node;
-};
-
-static bool vmpressure_event(struct vmpressure *vmpr,
-			     const enum vmpressure_levels level,
-			     bool ancestor, bool signalled)
-{
-	struct vmpressure_event *ev;
-	bool ret = false;
-
-	mutex_lock(&vmpr->events_lock);
-	list_for_each_entry(ev, &vmpr->events, node) {
-		if (ancestor && ev->mode == VMPRESSURE_LOCAL)
-			continue;
-		if (signalled && ev->mode == VMPRESSURE_NO_PASSTHROUGH)
-			continue;
-		if (level < ev->level)
-			continue;
-		eventfd_signal(ev->efd);
-		ret = true;
-	}
-	mutex_unlock(&vmpr->events_lock);
-
-	return ret;
-}
-
-static void vmpressure_work_fn(struct work_struct *work)
-{
-	struct vmpressure *vmpr = work_to_vmpressure(work);
-	unsigned long scanned;
-	unsigned long reclaimed;
-	enum vmpressure_levels level;
-	bool ancestor = false;
-	bool signalled = false;
-
-	spin_lock(&vmpr->sr_lock);
-	/*
-	 * Several contexts might be calling vmpressure(), so it is
-	 * possible that the work was rescheduled again before the old
-	 * work context cleared the counters. In that case we will run
-	 * just after the old work returns, but then scanned might be zero
-	 * here. No need for any locks here since we don't care if
-	 * vmpr->reclaimed is in sync.
-	 */
-	scanned = vmpr->tree_scanned;
-	if (!scanned) {
-		spin_unlock(&vmpr->sr_lock);
-		return;
-	}
-
-	reclaimed = vmpr->tree_reclaimed;
-	vmpr->tree_scanned = 0;
-	vmpr->tree_reclaimed = 0;
-	spin_unlock(&vmpr->sr_lock);
-
-	level = vmpressure_calc_level(scanned, reclaimed);
-
-	do {
-		if (vmpressure_event(vmpr, level, ancestor, signalled))
-			signalled = true;
-		ancestor = true;
-	} while ((vmpr = vmpressure_parent(vmpr)));
-}
-
 /**
  * vmpressure() - Account memory pressure through scanned/reclaimed ratio
  * @gfp:	reclaimer's gfp mask
@@ -283,14 +152,8 @@ void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree,
 		return;
 
 	if (tree) {
-		spin_lock(&vmpr->sr_lock);
-		scanned = vmpr->tree_scanned += scanned;
-		vmpr->tree_reclaimed += reclaimed;
-		spin_unlock(&vmpr->sr_lock);
-
-		if (scanned < vmpressure_win)
-			return;
-		schedule_work(&vmpr->work);
+		vmpressure_v1_account_tree(vmpr, scanned, reclaimed);
+		return;
 	} else {
 		enum vmpressure_levels level;
 
@@ -332,134 +195,6 @@ void vmpressure(gfp_t gfp, int order, struct mem_cgroup *memcg, bool tree,
 	}
 }
 
-/**
- * vmpressure_prio() - Account memory pressure through reclaimer priority level
- * @gfp:	reclaimer's gfp mask
- * @memcg:	cgroup memory controller handle
- * @prio:	reclaimer's priority
- *
- * This function should be called from the reclaim path every time when
- * the vmscan's reclaiming priority (scanning depth) changes.
- *
- * This function does not return any value.
- */
-void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio)
-{
-	/*
-	 * We only use prio for accounting critical level. For more info
-	 * see comment for vmpressure_level_critical_prio variable above.
-	 */
-	if (prio > vmpressure_level_critical_prio)
-		return;
-
-	/*
-	 * OK, the prio is below the threshold, updating vmpressure
-	 * information before shrinker dives into long shrinking of long
-	 * range vmscan. Passing scanned = vmpressure_win, reclaimed = 0
-	 * to the vmpressure() basically means that we signal 'critical'
-	 * level.
-	 */
-	vmpressure(gfp, 0, memcg, true, vmpressure_win, 0);
-}
-
-#define MAX_VMPRESSURE_ARGS_LEN	(strlen("critical") + strlen("hierarchy") + 2)
-
-/**
- * vmpressure_register_event() - Bind vmpressure notifications to an eventfd
- * @memcg:	memcg that is interested in vmpressure notifications
- * @eventfd:	eventfd context to link notifications with
- * @args:	event arguments (pressure level threshold, optional mode)
- *
- * This function associates eventfd context with the vmpressure
- * infrastructure, so that the notifications will be delivered to the
- * @eventfd. The @args parameter is a comma-delimited string that denotes a
- * pressure level threshold (one of vmpressure_str_levels, i.e. "low", "medium",
- * or "critical") and an optional mode (one of vmpressure_str_modes, i.e.
- * "hierarchy" or "local").
- *
- * To be used as memcg event method.
- *
- * Return: 0 on success, -ENOMEM on memory failure or -EINVAL if @args could
- * not be parsed.
- */
-int vmpressure_register_event(struct mem_cgroup *memcg,
-			      struct eventfd_ctx *eventfd, const char *args)
-{
-	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
-	struct vmpressure_event *ev;
-	enum vmpressure_modes mode = VMPRESSURE_NO_PASSTHROUGH;
-	enum vmpressure_levels level;
-	char *spec, *spec_orig;
-	char *token;
-	int ret = 0;
-
-	spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL);
-	if (!spec)
-		return -ENOMEM;
-
-	/* Find required level */
-	token = strsep(&spec, ",");
-	ret = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token);
-	if (ret < 0)
-		goto out;
-	level = ret;
-
-	/* Find optional mode */
-	token = strsep(&spec, ",");
-	if (token) {
-		ret = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token);
-		if (ret < 0)
-			goto out;
-		mode = ret;
-	}
-
-	ev = kzalloc_obj(*ev);
-	if (!ev) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
-	ev->efd = eventfd;
-	ev->level = level;
-	ev->mode = mode;
-
-	mutex_lock(&vmpr->events_lock);
-	list_add(&ev->node, &vmpr->events);
-	mutex_unlock(&vmpr->events_lock);
-	ret = 0;
-out:
-	kfree(spec_orig);
-	return ret;
-}
-
-/**
- * vmpressure_unregister_event() - Unbind eventfd from vmpressure
- * @memcg:	memcg handle
- * @eventfd:	eventfd context that was used to link vmpressure with the @cg
- *
- * This function does internal manipulations to detach the @eventfd from
- * the vmpressure notifications, and then frees internal resources
- * associated with the @eventfd (but the @eventfd itself is not freed).
- *
- * To be used as memcg event method.
- */
-void vmpressure_unregister_event(struct mem_cgroup *memcg,
-				 struct eventfd_ctx *eventfd)
-{
-	struct vmpressure *vmpr = memcg_to_vmpressure(memcg);
-	struct vmpressure_event *ev;
-
-	mutex_lock(&vmpr->events_lock);
-	list_for_each_entry(ev, &vmpr->events, node) {
-		if (ev->efd != eventfd)
-			continue;
-		list_del(&ev->node);
-		kfree(ev);
-		break;
-	}
-	mutex_unlock(&vmpr->events_lock);
-}
-
 /**
  * vmpressure_init() - Initialize vmpressure control structure
  * @vmpr:	Structure to be initialized
@@ -470,9 +205,7 @@ void vmpressure_unregister_event(struct mem_cgroup *memcg,
 void vmpressure_init(struct vmpressure *vmpr)
 {
 	spin_lock_init(&vmpr->sr_lock);
-	mutex_init(&vmpr->events_lock);
-	INIT_LIST_HEAD(&vmpr->events);
-	INIT_WORK(&vmpr->work, vmpressure_work_fn);
+	vmpressure_v1_init(vmpr);
 }
 
 /**
@@ -484,9 +217,5 @@ void vmpressure_init(struct vmpressure *vmpr)
  */
 void vmpressure_cleanup(struct vmpressure *vmpr)
 {
-	/*
-	 * Make sure there is no pending work before eventfd infrastructure
-	 * goes away.
-	 */
-	flush_work(&vmpr->work);
+	vmpressure_v1_cleanup(vmpr);
 }
-- 
2.52.0

next prev parent reply	other threads:[~2026-06-06 11:42 UTC|newest]

Thread overview: 9+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2026-06-06 11:41 [PATCH 0/2] mm/vmpressure: reduce CPU, memory and code overhead on cgroup v2 Usama Arif
2026-06-06 11:41 ` [PATCH 1/2] mm/vmpressure: skip tree=true accounting " Usama Arif
2026-06-08 17:06   ` Shakeel Butt
2026-06-06 11:41 ` Usama Arif [this message]
2026-06-08 17:05 ` [PATCH 0/2] mm/vmpressure: reduce CPU, memory and code overhead " Shakeel Butt
2026-06-08 18:49   ` Usama Arif
2026-06-08 19:56     ` Shakeel Butt
2026-06-08 21:19       ` Usama Arif
2026-06-08 22:26         ` Shakeel Butt

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:faecd552240 dfblob:e5e6b68d0dc dfblob:eff9f9e7e06
dfblob:282688f6a54 dfblob:fd813cba054 dfblob:c82cee1ab43
dfblob:af07db15223 )
 OR (
bs:"[PATCH 2/2] mm/vmpressure: split v1 userspace eventfd code into vmpressure-v1.c" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20260606114158.3126210-3-usama.arif@linux.dev \
    --to=usama.arif@linux.dev \
    --cc=akpm@linux-foundation.org \
    --cc=david@kernel.org \
    --cc=hannes@cmpxchg.org \
    --cc=kernel-team@meta.com \
    --cc=liam@infradead.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=ljs@kernel.org \
    --cc=mhocko@suse.com \
    --cc=mkoutny@suse.com \
    --cc=roman.gushchin@linux.dev \
    --cc=rppt@kernel.org \
    --cc=shakeel.butt@linux.dev \
    --cc=surenb@google.com \
    --cc=tj@kernel.org \
    --cc=vbabka@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.