Linux Trace Kernel
 help / color / mirror / Atom feed
* [PATCH v3 07/28] fsnotify: add FSNOTIFY_EVENT_RENAME data type
From: Jeff Layton @ 2026-04-28  7:09 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara, Chuck Lever,
	Alexander Aring, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, Amir Goldstein
  Cc: Calum Mackay, linux-fsdevel, linux-kernel, linux-trace-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260428-dir-deleg-v3-0-5a0780ba9def@kernel.org>

Add a new fsnotify_rename_data struct and FSNOTIFY_EVENT_RENAME data
type that carries both the moved dentry and the inode that was
overwritten by the rename (if any).

Update fsnotify_data_inode(), fsnotify_data_dentry(), and
fsnotify_data_sb() to handle the new type, and add a new
fsnotify_data_rename_target() helper for extracting the overwritten
target inode.

Update fsnotify_move() to use the new data type for FS_RENAME and
FS_MOVED_TO events, passing the overwritten target inode through the
event data. FS_MOVED_FROM is unchanged since the source directory
doesn't need overwrite information.

This is done so that fsnotify consumers like nfsd can atomically
observe the overwritten file when a rename replaces an existing entry,
without needing a separate FS_DELETE event.

Assisted-by: Claude (Anthropic Claude Code)
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Amir Goldstein <amir73il@gmail.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 include/linux/fsnotify.h         |  8 ++++++--
 include/linux/fsnotify_backend.h | 20 ++++++++++++++++++++
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h
index 079c18bcdbde..bda798bc67bc 100644
--- a/include/linux/fsnotify.h
+++ b/include/linux/fsnotify.h
@@ -257,6 +257,10 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	__u32 new_dir_mask = FS_MOVED_TO;
 	__u32 rename_mask = FS_RENAME;
 	const struct qstr *new_name = &moved->d_name;
+	struct fsnotify_rename_data rd = {
+		.moved = moved,
+		.target = target,
+	};
 
 	if (isdir) {
 		old_dir_mask |= FS_ISDIR;
@@ -265,12 +269,12 @@ static inline void fsnotify_move(struct inode *old_dir, struct inode *new_dir,
 	}
 
 	/* Event with information about both old and new parent+name */
-	fsnotify_name(rename_mask, moved, FSNOTIFY_EVENT_DENTRY,
+	fsnotify_name(rename_mask, &rd, FSNOTIFY_EVENT_RENAME,
 		      old_dir, old_name, 0);
 
 	fsnotify_name(old_dir_mask, source, FSNOTIFY_EVENT_INODE,
 		      old_dir, old_name, fs_cookie);
-	fsnotify_name(new_dir_mask, source, FSNOTIFY_EVENT_INODE,
+	fsnotify_name(new_dir_mask, &rd, FSNOTIFY_EVENT_RENAME,
 		      new_dir, new_name, fs_cookie);
 
 	if (target)
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 66e185bd1b1b..f8c8fb7f34ae 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -311,6 +311,7 @@ enum fsnotify_data_type {
 	FSNOTIFY_EVENT_DENTRY,
 	FSNOTIFY_EVENT_MNT,
 	FSNOTIFY_EVENT_ERROR,
+	FSNOTIFY_EVENT_RENAME,
 };
 
 struct fs_error_report {
@@ -335,6 +336,11 @@ struct fsnotify_mnt {
 	u64 mnt_id;
 };
 
+struct fsnotify_rename_data {
+	struct dentry *moved;	/* the dentry that was renamed */
+	struct inode *target;	/* inode overwritten by rename, or NULL */
+};
+
 static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
 {
 	switch (data_type) {
@@ -348,6 +354,8 @@ static inline struct inode *fsnotify_data_inode(const void *data, int data_type)
 		return d_inode(file_range_path(data)->dentry);
 	case FSNOTIFY_EVENT_ERROR:
 		return ((struct fs_error_report *)data)->inode;
+	case FSNOTIFY_EVENT_RENAME:
+		return d_inode(((const struct fsnotify_rename_data *)data)->moved);
 	default:
 		return NULL;
 	}
@@ -363,6 +371,8 @@ static inline struct dentry *fsnotify_data_dentry(const void *data, int data_typ
 		return ((const struct path *)data)->dentry;
 	case FSNOTIFY_EVENT_FILE_RANGE:
 		return file_range_path(data)->dentry;
+	case FSNOTIFY_EVENT_RENAME:
+		return ((struct fsnotify_rename_data *)data)->moved;
 	default:
 		return NULL;
 	}
@@ -395,6 +405,8 @@ static inline struct super_block *fsnotify_data_sb(const void *data,
 		return file_range_path(data)->dentry->d_sb;
 	case FSNOTIFY_EVENT_ERROR:
 		return ((struct fs_error_report *) data)->sb;
+	case FSNOTIFY_EVENT_RENAME:
+		return ((const struct fsnotify_rename_data *)data)->moved->d_sb;
 	default:
 		return NULL;
 	}
@@ -430,6 +442,14 @@ static inline struct fs_error_report *fsnotify_data_error_report(
 	}
 }
 
+static inline struct inode *fsnotify_data_rename_target(const void *data,
+							int data_type)
+{
+	if (data_type == FSNOTIFY_EVENT_RENAME)
+		return ((const struct fsnotify_rename_data *)data)->target;
+	return NULL;
+}
+
 static inline const struct file_range *fsnotify_data_file_range(
 							const void *data,
 							int data_type)

-- 
2.54.0


^ permalink raw reply related

* [PATCH v3 06/28] fsnotify: add fsnotify_modify_mark_mask()
From: Jeff Layton @ 2026-04-28  7:09 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara, Chuck Lever,
	Alexander Aring, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, Amir Goldstein
  Cc: Calum Mackay, linux-fsdevel, linux-kernel, linux-trace-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260428-dir-deleg-v3-0-5a0780ba9def@kernel.org>

nfsd needs to be able to modify the mask on an existing mark when new
directory delegations are set or unset. Add an exported function that
allows the caller to set and clear bits in the mark->mask, and does
the recalculation if something changed.

Suggested-by: Jan Kara <jack@suse.cz>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/notify/mark.c                 | 29 +++++++++++++++++++++++++++++
 include/linux/fsnotify_backend.h |  1 +
 2 files changed, 30 insertions(+)

diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index c2ed5b11b0fe..b1e73c6fd382 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -310,6 +310,35 @@ void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn)
 		fsnotify_conn_set_children_dentry_flags(conn);
 }
 
+/**
+ * fsnotify_modify_mark_mask - set and/or clear flags in a mark's mask
+ * @mark: mark to be modified
+ * @set: bits to be set in mask
+ * @clear: bits to be cleared in mask
+ *
+ * Modify a fsnotify_mark mask as directed, and update its associated conn.
+ * The caller is expected to hold a reference to the mark.
+ */
+void fsnotify_modify_mark_mask(struct fsnotify_mark *mark, u32 set, u32 clear)
+{
+	bool recalc = false;
+	u32 mask;
+
+	WARN_ON_ONCE(clear & set);
+
+	spin_lock(&mark->lock);
+	mask = mark->mask;
+	mark->mask |= set;
+	mark->mask &= ~clear;
+	if (mark->mask != mask)
+		recalc = true;
+	spin_unlock(&mark->lock);
+
+	if (recalc)
+		fsnotify_recalc_mask(mark->connector);
+}
+EXPORT_SYMBOL_GPL(fsnotify_modify_mark_mask);
+
 /* Free all connectors queued for freeing once SRCU period ends */
 static void fsnotify_connector_destroy_workfn(struct work_struct *work)
 {
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 95985400d3d8..66e185bd1b1b 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -917,6 +917,7 @@ extern void fsnotify_get_mark(struct fsnotify_mark *mark);
 extern void fsnotify_put_mark(struct fsnotify_mark *mark);
 extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info);
 extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info);
+extern void fsnotify_modify_mark_mask(struct fsnotify_mark *mark, u32 set, u32 clear);
 
 static inline void fsnotify_init_event(struct fsnotify_event *event)
 {

-- 
2.54.0


^ permalink raw reply related

* [PATCH v3 05/28] fsnotify: new tracepoint in fsnotify()
From: Jeff Layton @ 2026-04-28  7:09 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara, Chuck Lever,
	Alexander Aring, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, Amir Goldstein
  Cc: Calum Mackay, linux-fsdevel, linux-kernel, linux-trace-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260428-dir-deleg-v3-0-5a0780ba9def@kernel.org>

Add a tracepoint at the start of fsnotify() so we can see exactly how it
is being called.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/notify/fsnotify.c            |  5 ++++
 include/trace/events/fsnotify.h | 51 +++++++++++++++++++++++++++++++++++++++++
 include/trace/misc/fsnotify.h   | 35 ++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+)

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index 9995de1710e5..5448738635f6 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -14,6 +14,9 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/fsnotify.h>
+
 /*
  * Clear all of the marks on an inode when it is being evicted from core
  */
@@ -504,6 +507,8 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir,
 	int ret = 0;
 	__u32 test_mask, marks_mask = 0;
 
+	trace_fsnotify(mask, data, data_type, dir, file_name, inode, cookie);
+
 	if (path)
 		mnt = real_mount(path->mnt);
 
diff --git a/include/trace/events/fsnotify.h b/include/trace/events/fsnotify.h
new file mode 100644
index 000000000000..341bbd57a39b
--- /dev/null
+++ b/include/trace/events/fsnotify.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM fsnotify
+
+#if !defined(_TRACE_FSNOTIFY_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FSNOTIFY_H
+
+#include <linux/tracepoint.h>
+
+#include <trace/misc/fsnotify.h>
+
+TRACE_EVENT(fsnotify,
+	TP_PROTO(__u32 mask, const void *data, int data_type,
+		 struct inode *dir, const struct qstr *file_name,
+		 struct inode *inode, u32 cookie),
+
+	TP_ARGS(mask, data, data_type, dir, file_name, inode, cookie),
+
+	TP_STRUCT__entry(
+		__field(__u32, mask)
+		__field(unsigned long, dir_ino)
+		__field(unsigned long, ino)
+		__field(dev_t, s_dev)
+		__field(int, data_type)
+		__field(u32, cookie)
+		__string(file_name, file_name ? (const char *)file_name->name : "")
+	),
+
+	TP_fast_assign(
+		__entry->mask = mask;
+		__entry->dir_ino = dir ? dir->i_ino : 0;
+		__entry->ino = inode ? inode->i_ino : 0;
+		__entry->s_dev = dir ? dir->i_sb->s_dev :
+				 inode ? inode->i_sb->s_dev : 0;
+		__entry->data_type = data_type;
+		__entry->cookie = cookie;
+		__assign_str(file_name);
+	),
+
+	TP_printk("dev=%d:%d dir=%lu ino=%lu data_type=%d cookie=0x%x mask=0x%x %s name=%s",
+		  MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
+		  __entry->dir_ino, __entry->ino,
+		  __entry->data_type, __entry->cookie,
+		  __entry->mask, show_fsnotify_mask(__entry->mask),
+		  __get_str(file_name))
+);
+
+#endif /* _TRACE_FSNOTIFY_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/misc/fsnotify.h b/include/trace/misc/fsnotify.h
new file mode 100644
index 000000000000..a201e1bd6d8c
--- /dev/null
+++ b/include/trace/misc/fsnotify.h
@@ -0,0 +1,35 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Display helpers for fsnotify events
+ */
+
+#include <linux/fsnotify_backend.h>
+
+#define show_fsnotify_mask(mask) \
+	__print_flags(mask, "|", \
+		{ FS_ACCESS,		"ACCESS" }, \
+		{ FS_MODIFY,		"MODIFY" }, \
+		{ FS_ATTRIB,		"ATTRIB" }, \
+		{ FS_CLOSE_WRITE,	"CLOSE_WRITE" }, \
+		{ FS_CLOSE_NOWRITE,	"CLOSE_NOWRITE" }, \
+		{ FS_OPEN,		"OPEN" }, \
+		{ FS_MOVED_FROM,	"MOVED_FROM" }, \
+		{ FS_MOVED_TO,		"MOVED_TO" }, \
+		{ FS_CREATE,		"CREATE" }, \
+		{ FS_DELETE,		"DELETE" }, \
+		{ FS_DELETE_SELF,	"DELETE_SELF" }, \
+		{ FS_MOVE_SELF,		"MOVE_SELF" }, \
+		{ FS_OPEN_EXEC,		"OPEN_EXEC" }, \
+		{ FS_UNMOUNT,		"UNMOUNT" }, \
+		{ FS_Q_OVERFLOW,	"Q_OVERFLOW" }, \
+		{ FS_ERROR,		"ERROR" }, \
+		{ FS_OPEN_PERM,		"OPEN_PERM" }, \
+		{ FS_ACCESS_PERM,	"ACCESS_PERM" }, \
+		{ FS_OPEN_EXEC_PERM,	"OPEN_EXEC_PERM" }, \
+		{ FS_PRE_ACCESS,	"PRE_ACCESS" }, \
+		{ FS_MNT_ATTACH,	"MNT_ATTACH" }, \
+		{ FS_MNT_DETACH,	"MNT_DETACH" }, \
+		{ FS_EVENT_ON_CHILD,	"EVENT_ON_CHILD" }, \
+		{ FS_RENAME,		"RENAME" }, \
+		{ FS_DN_MULTISHOT,	"DN_MULTISHOT" }, \
+		{ FS_ISDIR,		"ISDIR" })

-- 
2.54.0


^ permalink raw reply related

* [PATCH v3 04/28] filelock: add an inode_lease_ignore_mask helper
From: Jeff Layton @ 2026-04-28  7:09 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara, Chuck Lever,
	Alexander Aring, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, Amir Goldstein
  Cc: Calum Mackay, linux-fsdevel, linux-kernel, linux-trace-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260428-dir-deleg-v3-0-5a0780ba9def@kernel.org>

Add a new routine that returns a mask of all dir change events that are
currently ignored by any leases. nfsd will use this to determine how to
configure the fsnotify_mark mask.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/locks.c               | 32 ++++++++++++++++++++++++++++++++
 include/linux/filelock.h |  1 +
 2 files changed, 33 insertions(+)

diff --git a/fs/locks.c b/fs/locks.c
index 792c3920b33a..61f64b261282 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1582,6 +1582,38 @@ static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc
 	return rc;
 }
 
+#define IGNORE_MASK	(FL_IGN_DIR_CREATE | FL_IGN_DIR_DELETE | FL_IGN_DIR_RENAME)
+
+/**
+ * inode_lease_ignore_mask - return union of all ignored inode events for this inode
+ * @inode: inode of which to get ignore mask
+ *
+ * Walk the list of leases, and return the result of all of
+ * their FL_IGN_DIR_* bits or'ed together.
+ */
+u32
+inode_lease_ignore_mask(struct inode *inode)
+{
+	struct file_lock_context *ctx;
+	struct file_lock_core *flc;
+	u32 mask = 0;
+
+	ctx = locks_inode_context(inode);
+	if (!ctx)
+		return 0;
+
+	spin_lock(&ctx->flc_lock);
+	list_for_each_entry(flc, &ctx->flc_lease, flc_list) {
+		mask |= flc->flc_flags & IGNORE_MASK;
+		/* If we already have everything, we can stop */
+		if (mask == IGNORE_MASK)
+			break;
+	}
+	spin_unlock(&ctx->flc_lock);
+	return mask;
+}
+EXPORT_SYMBOL_GPL(inode_lease_ignore_mask);
+
 static bool
 ignore_dir_deleg_break(struct file_lease *fl, unsigned int flags)
 {
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index 9dd4e67a6f30..6e125902c58a 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -236,6 +236,7 @@ int generic_setlease(struct file *, int, struct file_lease **, void **priv);
 int kernel_setlease(struct file *, int, struct file_lease **, void **);
 int vfs_setlease(struct file *, int, struct file_lease **, void **);
 int lease_modify(struct file_lease *, int, struct list_head *);
+u32 inode_lease_ignore_mask(struct inode *inode);
 
 struct notifier_block;
 int lease_register_notifier(struct notifier_block *);

-- 
2.54.0


^ permalink raw reply related

* [PATCH v3 03/28] filelock: add a tracepoint to start of break_lease()
From: Jeff Layton @ 2026-04-28  7:09 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara, Chuck Lever,
	Alexander Aring, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, Amir Goldstein
  Cc: Calum Mackay, linux-fsdevel, linux-kernel, linux-trace-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260428-dir-deleg-v3-0-5a0780ba9def@kernel.org>

Add a break_lease tracepoint at the start of __break_lease(), mostly to
show the LEASE_BREAK_* flags that were passed in.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/locks.c                      |  2 ++
 include/trace/events/filelock.h | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)

diff --git a/fs/locks.c b/fs/locks.c
index 8b5958f34b61..792c3920b33a 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1651,6 +1651,8 @@ int __break_lease(struct inode *inode, unsigned int flags)
 	bool want_write = !(flags & LEASE_BREAK_OPEN_RDONLY);
 	int error = 0;
 
+	trace_break_lease(inode, flags);
+
 	type = break_lease_flags_to_type(flags);
 	if (!type)
 		return -EINVAL;
diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h
index ef4bb0afb86a..fff0ee2d452d 100644
--- a/include/trace/events/filelock.h
+++ b/include/trace/events/filelock.h
@@ -120,6 +120,39 @@ DEFINE_EVENT(filelock_lock, flock_lock_inode,
 		TP_PROTO(struct inode *inode, struct file_lock *fl, int ret),
 		TP_ARGS(inode, fl, ret));
 
+#define show_lease_break_flags(val)					\
+	__print_flags(val, "|",						\
+		{ LEASE_BREAK_LEASE,		"LEASE" },		\
+		{ LEASE_BREAK_DELEG,		"DELEG" },		\
+		{ LEASE_BREAK_LAYOUT,		"LAYOUT" },		\
+		{ LEASE_BREAK_NONBLOCK,		"NONBLOCK" },		\
+		{ LEASE_BREAK_OPEN_RDONLY,	"OPEN_RDONLY" },	\
+		{ LEASE_BREAK_DIR_CREATE,	"DIR_CREATE" },		\
+		{ LEASE_BREAK_DIR_DELETE,	"DIR_DELETE" },		\
+		{ LEASE_BREAK_DIR_RENAME,	"DIR_RENAME" })
+
+TRACE_EVENT(break_lease,
+	TP_PROTO(struct inode *inode, unsigned int flags),
+
+	TP_ARGS(inode, flags),
+
+	TP_STRUCT__entry(
+		__field(unsigned long, i_ino)
+		__field(dev_t, s_dev)
+		__field(unsigned int, flags)
+	),
+
+	TP_fast_assign(
+		__entry->s_dev = inode->i_sb->s_dev;
+		__entry->i_ino = inode->i_ino;
+		__entry->flags = flags;
+	),
+
+	TP_printk("dev=0x%x:0x%x ino=0x%lx flags=%s",
+		  MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
+		  __entry->i_ino, show_lease_break_flags(__entry->flags))
+);
+
 DECLARE_EVENT_CLASS(filelock_lease,
 	TP_PROTO(struct inode *inode, struct file_lease *fl),
 

-- 
2.54.0


^ permalink raw reply related

* [PATCH v3 02/28] filelock: add support for ignoring deleg breaks for dir change events
From: Jeff Layton @ 2026-04-28  7:09 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara, Chuck Lever,
	Alexander Aring, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, Amir Goldstein
  Cc: Calum Mackay, linux-fsdevel, linux-kernel, linux-trace-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260428-dir-deleg-v3-0-5a0780ba9def@kernel.org>

If an NFS client requests a directory delegation with a notification
bitmask covering directory change events, the server shouldn't recall
the delegation. Instead the client will be notified of the change after
the fact.

Add support for ignoring lease breaks on directory changes. Add a new
flags parameter to try_break_deleg() and teach __break_lease how to
ignore certain types of delegation break events.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/attr.c                       |  2 +-
 fs/locks.c                      | 82 ++++++++++++++++++++++++++++-------------
 fs/namei.c                      | 31 +++++++++-------
 fs/posix_acl.c                  |  4 +-
 fs/xattr.c                      |  4 +-
 include/linux/filelock.h        | 53 ++++++++++++++++++--------
 include/trace/events/filelock.h |  5 ++-
 7 files changed, 120 insertions(+), 61 deletions(-)

diff --git a/fs/attr.c b/fs/attr.c
index e7d7c6d19fe9..28744f0e9ff4 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -547,7 +547,7 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry,
 	 * breaking the delegation in this case.
 	 */
 	if (!(ia_valid & ATTR_DELEG)) {
-		error = try_break_deleg(inode, delegated_inode);
+		error = try_break_deleg(inode, 0, delegated_inode);
 		if (error)
 			return error;
 	}
diff --git a/fs/locks.c b/fs/locks.c
index d82c5be7aa5b..8b5958f34b61 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1583,29 +1583,63 @@ static bool leases_conflict(struct file_lock_core *lc, struct file_lock_core *bc
 }
 
 static bool
-any_leases_conflict(struct inode *inode, struct file_lease *breaker)
+ignore_dir_deleg_break(struct file_lease *fl, unsigned int flags)
 {
-	struct file_lock_context *ctx = inode->i_flctx;
-	struct file_lock_core *flc;
+	if ((flags & LEASE_BREAK_DIR_CREATE) && (fl->c.flc_flags & FL_IGN_DIR_CREATE))
+		return true;
+	if ((flags & LEASE_BREAK_DIR_DELETE) && (fl->c.flc_flags & FL_IGN_DIR_DELETE))
+		return true;
+	if ((flags & LEASE_BREAK_DIR_RENAME) && (fl->c.flc_flags & FL_IGN_DIR_RENAME))
+		return true;
+
+	return false;
+}
+
+static unsigned int
+break_lease_flags_to_type(unsigned int flags)
+{
+	if (flags & LEASE_BREAK_LEASE)
+		return FL_LEASE;
+	else if (flags & LEASE_BREAK_DELEG)
+		return FL_DELEG;
+	else if (flags & LEASE_BREAK_LAYOUT)
+		return FL_LAYOUT;
+	else
+		return 0;
+
+}
+
+static struct file_lease *
+first_visible_lease(struct inode *inode, struct file_lease *new_fl, unsigned int flags)
+{
+	struct file_lock_context *ctx = locks_inode_context(inode);
+	struct file_lease *fl;
 
 	lockdep_assert_held(&ctx->flc_lock);
 
-	list_for_each_entry(flc, &ctx->flc_lease, flc_list) {
-		if (leases_conflict(flc, &breaker->c))
-			return true;
+	list_for_each_entry(fl, &ctx->flc_lease, c.flc_list) {
+		if (!leases_conflict(&fl->c, &new_fl->c))
+			continue;
+		if (S_ISDIR(inode->i_mode) && ignore_dir_deleg_break(fl, flags))
+			continue;
+		return fl;
 	}
-	return false;
+	return NULL;
 }
 
+
 /**
- *	__break_lease	-	revoke all outstanding leases on file
- *	@inode: the inode of the file to return
- *	@flags: LEASE_BREAK_* flags
+ * __break_lease	-	revoke all outstanding leases on file
+ * @inode: the inode of the file to return
+ * @flags: LEASE_BREAK_* flags
  *
- *	break_lease (inlined for speed) has checked there already is at least
- *	some kind of lock (maybe a lease) on this file.  Leases are broken on
- *	a call to open() or truncate().  This function can block waiting for the
- *	lease break unless you specify LEASE_BREAK_NONBLOCK.
+ * break_lease (inlined for speed) has checked there already is at least
+ * some kind of lock (maybe a lease) on this file. Leases and Delegations
+ * are broken on a call to open() or truncate(). Delegations are also
+ * broken on any event that would change the ctime. Directory delegations
+ * are broken whenever the directory changes (unless the delegation is set
+ * up to ignore the event). This function can block waiting for the lease
+ * break unless you specify LEASE_BREAK_NONBLOCK.
  */
 int __break_lease(struct inode *inode, unsigned int flags)
 {
@@ -1617,13 +1651,8 @@ int __break_lease(struct inode *inode, unsigned int flags)
 	bool want_write = !(flags & LEASE_BREAK_OPEN_RDONLY);
 	int error = 0;
 
-	if (flags & LEASE_BREAK_LEASE)
-		type = FL_LEASE;
-	else if (flags & LEASE_BREAK_DELEG)
-		type = FL_DELEG;
-	else if (flags & LEASE_BREAK_LAYOUT)
-		type = FL_LAYOUT;
-	else
+	type = break_lease_flags_to_type(flags);
+	if (!type)
 		return -EINVAL;
 
 	new_fl = lease_alloc(NULL, type, want_write ? F_WRLCK : F_RDLCK);
@@ -1642,7 +1671,7 @@ int __break_lease(struct inode *inode, unsigned int flags)
 
 	time_out_leases(inode, &dispose);
 
-	if (!any_leases_conflict(inode, new_fl))
+	if (!first_visible_lease(inode, new_fl, flags))
 		goto out;
 
 	break_time = 0;
@@ -1655,6 +1684,8 @@ int __break_lease(struct inode *inode, unsigned int flags)
 	list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, c.flc_list) {
 		if (!leases_conflict(&fl->c, &new_fl->c))
 			continue;
+		if (S_ISDIR(inode->i_mode) && ignore_dir_deleg_break(fl, flags))
+			continue;
 		if (want_write) {
 			if (fl->c.flc_flags & FL_UNLOCK_PENDING)
 				continue;
@@ -1670,7 +1701,8 @@ int __break_lease(struct inode *inode, unsigned int flags)
 			locks_delete_lock_ctx(&fl->c, &dispose);
 	}
 
-	if (list_empty(&ctx->flc_lease))
+	fl = first_visible_lease(inode, new_fl, flags);
+	if (!fl)
 		goto out;
 
 	if (flags & LEASE_BREAK_NONBLOCK) {
@@ -1680,7 +1712,6 @@ int __break_lease(struct inode *inode, unsigned int flags)
 	}
 
 restart:
-	fl = list_first_entry(&ctx->flc_lease, struct file_lease, c.flc_list);
 	break_time = fl->fl_break_time;
 	if (break_time != 0) {
 		if (time_after(jiffies, break_time)) {
@@ -1711,7 +1742,8 @@ int __break_lease(struct inode *inode, unsigned int flags)
 		 */
 		if (error == 0)
 			time_out_leases(inode, &dispose);
-		if (any_leases_conflict(inode, new_fl))
+		fl = first_visible_lease(inode, new_fl, flags);
+		if (fl)
 			goto restart;
 		error = 0;
 	}
diff --git a/fs/namei.c b/fs/namei.c
index 9e5500dad14f..e3cbd9f877bd 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4176,7 +4176,7 @@ int vfs_create(struct mnt_idmap *idmap, struct dentry *dentry, umode_t mode,
 	error = security_inode_create(dir, dentry, mode);
 	if (error)
 		return error;
-	error = try_break_deleg(dir, di);
+	error = try_break_deleg(dir, LEASE_BREAK_DIR_CREATE, di);
 	if (error)
 		return error;
 	error = dir->i_op->create(idmap, dir, dentry, mode, true);
@@ -4475,7 +4475,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
 	/* Negative dentry, just create the file */
 	if (!dentry->d_inode && (open_flag & O_CREAT)) {
 		/* but break the directory lease first! */
-		error = try_break_deleg(dir_inode, delegated_inode);
+		error = try_break_deleg(dir_inode, LEASE_BREAK_DIR_CREATE, delegated_inode);
 		if (error)
 			goto out_dput;
 
@@ -5091,7 +5091,7 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir,
 	if (error)
 		return error;
 
-	error = try_break_deleg(dir, delegated_inode);
+	error = try_break_deleg(dir, LEASE_BREAK_DIR_CREATE, delegated_inode);
 	if (error)
 		return error;
 
@@ -5232,7 +5232,7 @@ struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	if (max_links && dir->i_nlink >= max_links)
 		goto err;
 
-	error = try_break_deleg(dir, delegated_inode);
+	error = try_break_deleg(dir, LEASE_BREAK_DIR_CREATE, delegated_inode);
 	if (error)
 		goto err;
 
@@ -5337,7 +5337,7 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir,
 	if (error)
 		goto out;
 
-	error = try_break_deleg(dir, delegated_inode);
+	error = try_break_deleg(dir, LEASE_BREAK_DIR_DELETE, delegated_inode);
 	if (error)
 		goto out;
 
@@ -5467,10 +5467,10 @@ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir,
 	else {
 		error = security_inode_unlink(dir, dentry);
 		if (!error) {
-			error = try_break_deleg(dir, delegated_inode);
+			error = try_break_deleg(dir, LEASE_BREAK_DIR_DELETE, delegated_inode);
 			if (error)
 				goto out;
-			error = try_break_deleg(target, delegated_inode);
+			error = try_break_deleg(target, 0, delegated_inode);
 			if (error)
 				goto out;
 			error = dir->i_op->unlink(dir, dentry);
@@ -5614,7 +5614,7 @@ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir,
 	if (error)
 		return error;
 
-	error = try_break_deleg(dir, delegated_inode);
+	error = try_break_deleg(dir, LEASE_BREAK_DIR_CREATE, delegated_inode);
 	if (error)
 		return error;
 
@@ -5745,9 +5745,9 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
 	else if (max_links && inode->i_nlink >= max_links)
 		error = -EMLINK;
 	else {
-		error = try_break_deleg(dir, delegated_inode);
+		error = try_break_deleg(dir, LEASE_BREAK_DIR_CREATE, delegated_inode);
 		if (!error)
-			error = try_break_deleg(inode, delegated_inode);
+			error = try_break_deleg(inode, 0, delegated_inode);
 		if (!error)
 			error = dir->i_op->link(old_dentry, dir, new_dentry);
 	}
@@ -6011,21 +6011,24 @@ int vfs_rename(struct renamedata *rd)
 		    old_dir->i_nlink >= max_links)
 			goto out;
 	}
-	error = try_break_deleg(old_dir, delegated_inode);
+	error = try_break_deleg(old_dir,
+				old_dir == new_dir ? LEASE_BREAK_DIR_RENAME :
+						     LEASE_BREAK_DIR_DELETE,
+				delegated_inode);
 	if (error)
 		goto out;
 	if (new_dir != old_dir) {
-		error = try_break_deleg(new_dir, delegated_inode);
+		error = try_break_deleg(new_dir, LEASE_BREAK_DIR_CREATE, delegated_inode);
 		if (error)
 			goto out;
 	}
 	if (!is_dir) {
-		error = try_break_deleg(source, delegated_inode);
+		error = try_break_deleg(source, 0, delegated_inode);
 		if (error)
 			goto out;
 	}
 	if (target && !new_is_dir) {
-		error = try_break_deleg(target, delegated_inode);
+		error = try_break_deleg(target, 0, delegated_inode);
 		if (error)
 			goto out;
 	}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 12591c95c925..b4bfe4ddf64e 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -1126,7 +1126,7 @@ int vfs_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
 	if (error)
 		goto out_inode_unlock;
 
-	error = try_break_deleg(inode, &delegated_inode);
+	error = try_break_deleg(inode, 0, &delegated_inode);
 	if (error)
 		goto out_inode_unlock;
 
@@ -1234,7 +1234,7 @@ int vfs_remove_acl(struct mnt_idmap *idmap, struct dentry *dentry,
 	if (error)
 		goto out_inode_unlock;
 
-	error = try_break_deleg(inode, &delegated_inode);
+	error = try_break_deleg(inode, 0, &delegated_inode);
 	if (error)
 		goto out_inode_unlock;
 
diff --git a/fs/xattr.c b/fs/xattr.c
index 3e49e612e1ba..6b67a6e76eeb 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -288,7 +288,7 @@ __vfs_setxattr_locked(struct mnt_idmap *idmap, struct dentry *dentry,
 	if (error)
 		goto out;
 
-	error = try_break_deleg(inode, delegated_inode);
+	error = try_break_deleg(inode, 0, delegated_inode);
 	if (error)
 		goto out;
 
@@ -546,7 +546,7 @@ __vfs_removexattr_locked(struct mnt_idmap *idmap,
 	if (error)
 		goto out;
 
-	error = try_break_deleg(inode, delegated_inode);
+	error = try_break_deleg(inode, 0, delegated_inode);
 	if (error)
 		goto out;
 
diff --git a/include/linux/filelock.h b/include/linux/filelock.h
index 5f0a2fb31450..9dd4e67a6f30 100644
--- a/include/linux/filelock.h
+++ b/include/linux/filelock.h
@@ -4,19 +4,22 @@
 
 #include <linux/fs.h>
 
-#define FL_POSIX	1
-#define FL_FLOCK	2
-#define FL_DELEG	4	/* NFSv4 delegation */
-#define FL_ACCESS	8	/* not trying to lock, just looking */
-#define FL_EXISTS	16	/* when unlocking, test for existence */
-#define FL_LEASE	32	/* lease held on this file */
-#define FL_CLOSE	64	/* unlock on close */
-#define FL_SLEEP	128	/* A blocking lock */
-#define FL_DOWNGRADE_PENDING	256 /* Lease is being downgraded */
-#define FL_UNLOCK_PENDING	512 /* Lease is being broken */
-#define FL_OFDLCK	1024	/* lock is "owned" by struct file */
-#define FL_LAYOUT	2048	/* outstanding pNFS layout */
-#define FL_RECLAIM	4096	/* reclaiming from a reboot server */
+#define FL_POSIX		BIT(0)	/* POSIX lock */
+#define FL_FLOCK		BIT(1)	/* BSD lock */
+#define FL_DELEG		BIT(2)	/* NFSv4 delegation */
+#define FL_ACCESS		BIT(3)	/* not trying to lock, just looking */
+#define FL_EXISTS		BIT(4)	/* when unlocking, test for existence */
+#define FL_LEASE		BIT(5)	/* file lease */
+#define FL_CLOSE		BIT(6)	/* unlock on close */
+#define FL_SLEEP		BIT(7)	/* A blocking lock */
+#define FL_DOWNGRADE_PENDING	BIT(8)	/* Lease is being downgraded */
+#define FL_UNLOCK_PENDING	BIT(9)	/* Lease is being broken */
+#define FL_OFDLCK		BIT(10) /* POSIX lock "owned" by struct file */
+#define FL_LAYOUT		BIT(11) /* outstanding pNFS layout */
+#define FL_RECLAIM		BIT(12) /* reclaiming from a reboot server */
+#define FL_IGN_DIR_CREATE	BIT(13) /* ignore DIR_CREATE events */
+#define FL_IGN_DIR_DELETE	BIT(14) /* ignore DIR_DELETE events */
+#define FL_IGN_DIR_RENAME	BIT(15) /* ignore DIR_RENAME events */
 
 #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE)
 
@@ -222,6 +225,10 @@ struct file_lease *locks_alloc_lease(void);
 #define LEASE_BREAK_LAYOUT		BIT(2)	// break layouts only
 #define LEASE_BREAK_NONBLOCK		BIT(3)	// non-blocking break
 #define LEASE_BREAK_OPEN_RDONLY		BIT(4)	// readonly open event
+#define LEASE_BREAK_DIR_CREATE		BIT(5)  // dir deleg create event
+#define LEASE_BREAK_DIR_DELETE		BIT(6)  // dir deleg delete event
+#define LEASE_BREAK_DIR_RENAME		BIT(7)  // dir deleg rename event
+
 
 int __break_lease(struct inode *inode, unsigned int flags);
 void lease_get_mtime(struct inode *, struct timespec64 *time);
@@ -516,12 +523,26 @@ static inline bool is_delegated(struct delegated_inode *di)
 	return di->di_inode;
 }
 
-static inline int try_break_deleg(struct inode *inode,
+/**
+ * try_break_deleg - do a non-blocking delegation break
+ * @inode: inode that should have its delegations broken
+ * @flags: extra LEASE_BREAK_* flags to pass to break_deleg()
+ * @di: returns pointer to delegated inode (may be NULL)
+ *
+ * Break delegations in a non-blocking fashion. If there are
+ * outstanding delegations and @di is set, then an extra reference
+ * will be taken on @inode and @di->di_inode will be populated so
+ * that it may be waited upon.
+ *
+ * Returns 0 if there is no need to wait or an error. If -EWOULDBLOCK
+ * is returned, then @di will be populated (if non-NULL).
+ */
+static inline int try_break_deleg(struct inode *inode, unsigned int flags,
 				  struct delegated_inode *di)
 {
 	int ret;
 
-	ret = break_deleg(inode, LEASE_BREAK_NONBLOCK);
+	ret = break_deleg(inode, flags | LEASE_BREAK_NONBLOCK);
 	if (ret == -EWOULDBLOCK && di) {
 		di->di_inode = inode;
 		ihold(inode);
@@ -574,7 +595,7 @@ static inline int break_deleg(struct inode *inode, unsigned int flags)
 	return 0;
 }
 
-static inline int try_break_deleg(struct inode *inode,
+static inline int try_break_deleg(struct inode *inode, unsigned int flags,
 				  struct delegated_inode *delegated_inode)
 {
 	return 0;
diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h
index 370016c38a5b..ef4bb0afb86a 100644
--- a/include/trace/events/filelock.h
+++ b/include/trace/events/filelock.h
@@ -28,7 +28,10 @@
 		{ FL_DOWNGRADE_PENDING,	"FL_DOWNGRADE_PENDING" },	\
 		{ FL_UNLOCK_PENDING,	"FL_UNLOCK_PENDING" },		\
 		{ FL_OFDLCK,		"FL_OFDLCK" },			\
-		{ FL_RECLAIM,		"FL_RECLAIM"})
+		{ FL_RECLAIM,		"FL_RECLAIM" },			\
+		{ FL_IGN_DIR_CREATE,	"FL_IGN_DIR_CREATE" },		\
+		{ FL_IGN_DIR_DELETE,	"FL_IGN_DIR_DELETE" },		\
+		{ FL_IGN_DIR_RENAME,	"FL_IGN_DIR_RENAME" })
 
 #define show_fl_type(val)				\
 	__print_symbolic(val,				\

-- 
2.54.0


^ permalink raw reply related

* [PATCH v3 01/28] filelock: pass current blocking lease to trace_break_lease_block() rather than "new_fl"
From: Jeff Layton @ 2026-04-28  7:09 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara, Chuck Lever,
	Alexander Aring, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, Amir Goldstein
  Cc: Calum Mackay, linux-fsdevel, linux-kernel, linux-trace-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260428-dir-deleg-v3-0-5a0780ba9def@kernel.org>

The break_lease_block tracepoint currently just shows the type of
"new_fl", which we can predict from the "flags" value. Switch it to
display info about "fl" instead, as that's the file_lease on which the
code is blocking.

For trace_break_lease_unblock(), pass it a NULL pointer. "fl" may have
been freed by that point, and passing it the info in new_fl is
deceptive.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/locks.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 8e44b1f6c15a..d82c5be7aa5b 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1691,7 +1691,7 @@ int __break_lease(struct inode *inode, unsigned int flags)
 	} else
 		break_time++;
 	locks_insert_block(&fl->c, &new_fl->c, leases_conflict);
-	trace_break_lease_block(inode, new_fl);
+	trace_break_lease_block(inode, fl);
 	spin_unlock(&ctx->flc_lock);
 	percpu_up_read(&file_rwsem);
 
@@ -1702,7 +1702,7 @@ int __break_lease(struct inode *inode, unsigned int flags)
 
 	percpu_down_read(&file_rwsem);
 	spin_lock(&ctx->flc_lock);
-	trace_break_lease_unblock(inode, new_fl);
+	trace_break_lease_unblock(inode, NULL);
 	__locks_delete_block(&new_fl->c);
 	if (error >= 0) {
 		/*

-- 
2.54.0


^ permalink raw reply related

* [PATCH v3 00/28] vfs/nfsd: add support for CB_NOTIFY callbacks in directory delegations
From: Jeff Layton @ 2026-04-28  7:09 UTC (permalink / raw)
  To: Alexander Viro, Christian Brauner, Jan Kara, Chuck Lever,
	Alexander Aring, Steven Rostedt, Masami Hiramatsu,
	Mathieu Desnoyers, Jonathan Corbet, Shuah Khan, NeilBrown,
	Olga Kornievskaia, Dai Ngo, Tom Talpey, Trond Myklebust,
	Anna Schumaker, Amir Goldstein
  Cc: Calum Mackay, linux-fsdevel, linux-kernel, linux-trace-kernel,
	linux-doc, linux-nfs, Jeff Layton

Re-posting the set per Christian's request. The only difference in this
version is a small error handling fix in alloc_init_dir_deleg(). The old
version could crash since release_pages() can't handle an array with
NULL pointers in it.

---------------------------------8<------------------------------------

This patchset builds on the directory delegation work we did a few
months ago, to add support for CB_NOTIFY callbacks for some events. In
particular, creates, unlinks and renames. The server also sends updated
directory attributes in the notifications. With this support, the client
can register interest in a directory and get notifications about changes
within it without losing its lease.

The series starts with patches to allow the vfs to ignore certain types
of events on directories. nfsd can then request these sorts of
delegations on directories, and then set up inotify watches on the
directory to trigger sending CB_NOTIFY events.

This has mainly been tested with pynfs, with some new testcases that
I'll be posting soon. They seem to work fine with those tests, but I
don't think we'll want to merge these until we have a complete
client-side implementation to test against.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
Changes in v3:
- Fix error handling in alloc_init_dir_deleg()
- Link to v2: https://lore.kernel.org/r/20260416-dir-deleg-v2-0-851426a550f6@kernel.org

Changes in v2:
- Fix __break_lease handling with different lease types on flc_lease list
- Add FSNOTIFY_EVENT_RENAME data type to properly handle cross-directory rename events
- Display fsnotify mask symbolically in tracepoints
- New tracepoint in fsnotify()
- Recalc fsnotify mask after unlocking lease instead of before
- Don't notify client that is making the changes
- After sending CB_NOTIFY, requeue if new events came in while running
- Document removal of NFS4_VERIFIER_SIZE/NFS4_FHSIZE from UAPI headers
- Properly release nfsd_dir_fsnotify_group on server shutdown
- Link to v1: https://lore.kernel.org/r/20260407-dir-deleg-v1-0-aaf68c478abd@kernel.org

---
Jeff Layton (28):
      filelock: pass current blocking lease to trace_break_lease_block() rather than "new_fl"
      filelock: add support for ignoring deleg breaks for dir change events
      filelock: add a tracepoint to start of break_lease()
      filelock: add an inode_lease_ignore_mask helper
      fsnotify: new tracepoint in fsnotify()
      fsnotify: add fsnotify_modify_mark_mask()
      fsnotify: add FSNOTIFY_EVENT_RENAME data type
      nfsd: check fl_lmops in nfsd_breaker_owns_lease()
      nfsd: add protocol support for CB_NOTIFY
      nfs_common: add new NOTIFY4_* flags proposed in RFC8881bis
      nfsd: allow nfsd to get a dir lease with an ignore mask
      nfsd: update the fsnotify mark when setting or removing a dir delegation
      nfsd: make nfsd4_callback_ops->prepare operation bool return
      nfsd: add callback encoding and decoding linkages for CB_NOTIFY
      nfsd: use RCU to protect fi_deleg_file
      nfsd: add data structures for handling CB_NOTIFY
      nfsd: add notification handlers for dir events
      nfsd: add tracepoint to dir_event handler
      nfsd: apply the notify mask to the delegation when requested
      nfsd: add helper to marshal a fattr4 from completed args
      nfsd: allow nfsd4_encode_fattr4_change() to work with no export
      nfsd: send basic file attributes in CB_NOTIFY
      nfsd: allow encoding a filehandle into fattr4 without a svc_fh
      nfsd: add a fi_connectable flag to struct nfs4_file
      nfsd: add the filehandle to returned attributes in CB_NOTIFY
      nfsd: properly track requested child attributes
      nfsd: track requested dir attributes
      nfsd: add support to CB_NOTIFY for dir attribute changes

 Documentation/sunrpc/xdr/nfs4_1.x    | 264 ++++++++++++++-
 fs/attr.c                            |   2 +-
 fs/locks.c                           | 118 +++++--
 fs/namei.c                           |  31 +-
 fs/nfsd/filecache.c                  |  70 +++-
 fs/nfsd/nfs4callback.c               |  60 +++-
 fs/nfsd/nfs4layouts.c                |   5 +-
 fs/nfsd/nfs4proc.c                   |  17 +
 fs/nfsd/nfs4state.c                  | 551 ++++++++++++++++++++++++++++----
 fs/nfsd/nfs4xdr.c                    | 323 +++++++++++++++++--
 fs/nfsd/nfs4xdr_gen.c                | 601 ++++++++++++++++++++++++++++++++++-
 fs/nfsd/nfs4xdr_gen.h                |  20 +-
 fs/nfsd/state.h                      |  72 ++++-
 fs/nfsd/trace.h                      |  23 ++
 fs/nfsd/xdr4.h                       |   5 +
 fs/nfsd/xdr4cb.h                     |  12 +
 fs/notify/fsnotify.c                 |   5 +
 fs/notify/mark.c                     |  29 ++
 fs/posix_acl.c                       |   4 +-
 fs/xattr.c                           |   4 +-
 include/linux/filelock.h             |  54 +++-
 include/linux/fsnotify.h             |   8 +-
 include/linux/fsnotify_backend.h     |  21 ++
 include/linux/nfs4.h                 | 127 --------
 include/linux/sunrpc/xdrgen/nfs4_1.h | 291 ++++++++++++++++-
 include/trace/events/filelock.h      |  38 ++-
 include/trace/events/fsnotify.h      |  51 +++
 include/trace/misc/fsnotify.h        |  35 ++
 include/uapi/linux/nfs4.h            |   2 -
 29 files changed, 2519 insertions(+), 324 deletions(-)
---
base-commit: f4d71dd7fd9cec357c32431fa55c107b96008312
change-id: 20260325-dir-deleg-339066dd1017

Best regards,
-- 
Jeff Layton <jlayton@kernel.org>


^ permalink raw reply

* Re: [PATCH v2] Documentation/rv: Replace stale website link
From: Gabriele Monaco @ 2026-04-28  7:06 UTC (permalink / raw)
  To: Randy Dunlap, Steven Rostedt, Jonathan Corbet, linux-trace-kernel,
	linux-doc, linux-kernel
  Cc: matteo.martelli, skhan
In-Reply-To: <f841e9f6-9d0b-4e10-a930-fdd74a74b9b9@infradead.org>

On Mon, 2026-04-27 at 09:50 -0700, Randy Dunlap wrote:
> Tested-by: Randy Dunlap <rdunlap@infradead.org>
> Acked-by: Randy Dunlap <rdunlap@infradead.org>

Thanks for the ack!

> although I don't care for the "J. Syst. Archit." abbreviation.
> Does JSA use that? Not that I can see.

That's the citation format I got from semanticscholar.org , it's indeed
a bit ugly but it's apparently the ISO 4 abbreviation [1].

Not sure if it would be neater to just use JSA which looks more
official.

Thanks,
Gabriele

[1] - https://dblp.org/db/journals/jsa/index.html


^ permalink raw reply

* Re: [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
From: Jeff Layton @ 2026-04-28  6:49 UTC (permalink / raw)
  To: Ritesh Harjani, Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Christoph Hellwig, Kairui Song, Qi Zheng, Shakeel Butt,
	Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
  Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel
In-Reply-To: <jytsrnn1.ritesh.list@gmail.com>

On Tue, 2026-04-28 at 04:56 +0530, Ritesh Harjani wrote:
> > 
> 
> I guess you missed answering this. The reason why I was asking about this is....
> 

Oops, sorry...

> > > >                        baseline    patched     change
> > > >   buffered              1619.5     1611.2      -0.5%
> > > >   dontcache             1281.1     1629.4     +27.2%
> > > >   direct                1545.4     1609.4      +4.1%
> > > > 
> 
> ... If we see the performance of buffered and dontcache in baseline case,
> then we don't see dontcache doing any good. Even the patched version is
> just slightly better compared to buffered case.
> 
> But IIUC, dontcache should really shine in cases where we have buffered
> writers dirtying the page cache pages which can overflow the RAM size
> [1]. The reason why dontcache should show benefit there is, because we
> don't see any page cache pressure, since after writeback the pages gets
> evicted. Also earlier in the unpatched version, the I/O submission
> happens immediately in the same context.
> 
> So, I guess, isn't it better to evaluate those scenarios as well with
> the patched version - since this series affects those code paths now?
> 
> [1]: https://lore.kernel.org/all/20241110152906.1747545-11-axboe@kernel.dk/
> 
> > 

Ok, that's a good point. I'll have Claude recreate a benchmark that
mirrors what Jens did in the original posting and make sure the
behavior of that test doesn't regress (at least not significantly).

I'll try to get this done before LSF/MM, but we'll see.

Cheers,
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply

* Re: [RFC PATCH 00/19] mm/damon: introduce data attributes monitoring
From: SeongJae Park @ 2026-04-28  0:33 UTC (permalink / raw)
  To: Gutierrez Asier
  Cc: SeongJae Park, Liam R. Howlett, Andrew Morton, David Hildenbrand,
	Jonathan Corbet, Lorenzo Stoakes, Masami Hiramatsu,
	Mathieu Desnoyers, Michal Hocko, Mike Rapoport, Shuah Khan,
	Shuah Khan, Steven Rostedt, Suren Baghdasaryan, Vlastimil Babka,
	damon, linux-doc, linux-kernel, linux-kselftest, linux-mm,
	linux-trace-kernel
In-Reply-To: <14036b07-413e-4dcd-a363-e7f834d85da3@huawei-partners.com>

On Mon, 27 Apr 2026 16:16:07 +0300 Gutierrez Asier <gutierrez.asier@huawei-partners.com> wrote:

> Hi SeonJae,
> 
> On 4/26/2026 11:52 PM, SeongJae Park wrote:
> > TL; DR
> > ======
> > 
> > Extend DAMON for monitoring general data attributes other than accesses.
> > This is for enabling light-weight page type (e.g., belonging cgroup)
> > aware monitoring in short term.  In long term, this will help extending
> > DAMON for multiple access events capture primitives (e.g., page faults
> > and PMU) and eventually pivotting DAMON to a "Data Attributes Monitoring
> > and Operations eNgine" in long term.
> 
> Very interesting. Looking forward to seeing this in upstream.

Thank you!

[...]
> My main concern is about potential pollution of sysfs. DAMON is already
> complex to set up, with a lot of knobs. Adding more configuration options
> may make admin's job more complex.

You are right, there are a lot of knobs for DAMON.  Nevertheless, each knob is
simple and independent, so easy to scale.  We also provide user-space tool for
users who still want to use DAMON in highly customized way, and DAMON modules
for users who want common purpose usage of DAMON with minimum tunable knobs.

I believe the beginning part of DAMON usage document [2] is explaining this
point.

FWIW, I'm also working on DAMON-X [1] to make the module-based approach
just work for more use cases.

> 
> Do you plan to support this extension in damo user space?

Yes, I will!

[1] https://lore.kernel.org/linux-mm/20260307210250.204245-1-sj@kernel.org/
[2] Documentation/admin-guide/mm/damon/usage.rst


Thanks,
SJ

[...]

^ permalink raw reply

* Re: [PATCH v5 1/2] blk-mq: add tracepoint block_rq_tag_wait
From: Aaron Tomlin @ 2026-04-28  0:29 UTC (permalink / raw)
  To: Steven Rostedt
  Cc: axboe, mhiramat, mathieu.desnoyers, bvanassche,
	johannes.thumshirn, kch, dlemoal, ritesh.list, loberman, neelx,
	sean, mproche, chjohnst, linux-block, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260427123848.1e6b63d2@gandalf.local.home>

[-- Attachment #1: Type: text/plain, Size: 975 bytes --]

On Mon, Apr 27, 2026 at 12:38:48PM -0400, Steven Rostedt wrote:
> On Sun, 26 Apr 2026 22:01:41 -0400
> Aaron Tomlin <atomlin@atomlin.com> wrote:
> 
> > +TRACE_EVENT(block_rq_tag_wait,
> > +
> > +	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx, bool is_sched_tag),
> > +
> > +	TP_ARGS(q, hctx, is_sched_tag),
> > +
> > +	TP_STRUCT__entry(
> > +		__field( dev_t,		dev			)
> > +		__field( u32,		hctx_id			)
> > +		__field( u32,		nr_tags			)
> > +		__field( bool,		is_sched_tag		)
> > +	),
> > +
> > +	TP_fast_assign(
> > +		__entry->dev		= q->disk ? disk_devt(q->disk);
> 
> Hmm, does the above even compile?
> 
Hi Steve,

	TP_fast_assign(
		__entry->dev		= q->disk ? disk_devt(q->disk) : 0;

I embarrassingly dropped ": 0" from the ternary operator while preparing
the v5 patch and failed to catch the syntax error before sending it out.

I will fix the syntax, verify the build and spin a v6.


Kind regards,
-- 
Aaron Tomlin

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 833 bytes --]

^ permalink raw reply

* Re: [PATCH v3 2/4] mm: kick writeback flusher for IOCB_DONTCACHE with targeted dirty tracking
From: Ritesh Harjani @ 2026-04-27 23:26 UTC (permalink / raw)
  To: Jeff Layton, Alexander Viro, Christian Brauner, Jan Kara,
	Matthew Wilcox (Oracle), Andrew Morton, David Hildenbrand,
	Lorenzo Stoakes, Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Mike Snitzer, Jens Axboe,
	Christoph Hellwig, Kairui Song, Qi Zheng, Shakeel Butt,
	Barry Song, Axel Rasmussen, Yuanchu Xie, Wei Xu, Steven Rostedt,
	Masami Hiramatsu, Mathieu Desnoyers, Chuck Lever
  Cc: linux-fsdevel, linux-kernel, linux-nfs, linux-mm,
	linux-trace-kernel
In-Reply-To: <bb418f9a7bfcabc3070b412c745c5b6456d592b9.camel@kernel.org>

Jeff Layton <jlayton@kernel.org> writes:

>> 
>> Also should the following change be documented somewhere? Like in Man
>> page maybe? i.e.
>> Earlier RWF_DONTCACHE writes made sure that those dirty pages are
>> immediately submitted for writeback and completion would release those
>> pages. But now, in certain cases when there is a mixed buffered write in
>> the system, those dontcache dirty pages might be written back after a
>> delay (whenever the next time writeback kicks in).
>> However for RWF_DONTCACHE reads, it should not affect anything.
>> 
>
> Looks like DONTCACHE is documented in the preadv/writev manpage. Here's
> the current blurb about writes:
>
>     Additionally, any range dirtied by a write operation with RWF_DONT‐
>     CACHE  set  will  get kicked off for writeback.  This is similar to
>     calling  sync_file_range(2)  with  SYNC_FILE_RANGE_WRITE  to  start
>     writeback on the given range.  RWF_DONTCACHE is a hint, or best ef‐
>     fort,  where  no hard guarantees are given on the state of the page
>     cache once the operation completes.
>
> I don't think this verbiage is invalid after this change. Kicking off
> writeback is still just a hint, like it was before. We could mention
> about how that I/O can compete with regular buffered I/O, but it seems
> a bit like we're adding info that will just be confusing for users.
>

Makes sense.

>> > dontcache-bench results on dual-socket Xeon Gold 6138 (80 CPUs, 256 GB
>> > RAM, Samsung MZ1LB1T9HALS 1.7 TB NVMe, local XFS, io_uring, file size
>> > ~503 GB, compared to a v6.19-ish baseline):
>> > 
>> 
>> Can we please also test parallel buffered writes and dontcache writes? 
>> Since this patch series definitely affects that.
>>
>> BTW - adding these numbers in the commit msg itself is much helpful.
>> 
>
> To be clear, this only affects DONTCACHE, not normal buffered writes,
> but I guess you're referring to the fact that DONTCACHE and buffered
> writes can compete now.
>
> Can you clarify specifically what you'd like me to test here? Are you
> saying you want me to test parallel and buffered writes together at the
> same time (i.e. make them compete?).
>
> I should be able to do that for the local benchmarks, but nfsd's iomode
> settings are global and that won't be possible there.
>

The reason I am thinking of this is: dontcache-marked pages get
evicted from page cache after they are written back. But this patch
series can now delay that from happening when there is a parallel
buffered writer dirtying page cache pages. Because of the reasons we
already discussed...

Note that, this may not be a workload which matters in the real world,
but I was thinking, it will be good to know the impact if any, of such
workload with this patch series (parallel buffered and dontcache
writers).


>> >   Single-client sequential write (MB/s):
>> >                        baseline    patched     change
>> >   buffered              1449.8     1440.1      -0.7%
>> >   dontcache             1347.9     1461.5      +8.4%
>> >   direct                1450.0     1440.1      -0.7%
>> > 
>> >   Single-client sequential write latency (us):
>> >                        baseline    patched     change
>> >   dontcache p50         3031.0    10551.3    +248.1%
>> >   dontcache p99        74973.2    21626.9     -71.2%
>> >   dontcache p99.9      85459.0    23199.7     -72.9%
>> > 
>> >   Single-client random write (MB/s):
>> >                        baseline    patched     change
>> >   dontcache              284.2      295.4      +3.9%
>> > 
>> >   Single-client random write p99.9 latency (us):
>> >                        baseline    patched     change
>> >   dontcache             2277.4      872.4     -61.7%
>> > 
>> >   Multi-writer aggregate throughput (MB/s):
>> 
>> Can you please help describe this test scenario if possible.. In above
>> you mentioned we are writing file_size as 2x RAM_SIZE. But your
>> multi-client tests says something else..
>> 
>> local num_clients=4
>> +	mem_kb=$(awk '/MemTotal/ {print $2}' /proc/meminfo)
>> +	client_size="$(( mem_kb / 1024 / num_clients ))M"
>> 

I guess you missed answering this. The reason why I was asking about this is....

>> >                        baseline    patched     change
>> >   buffered              1619.5     1611.2      -0.5%
>> >   dontcache             1281.1     1629.4     +27.2%
>> >   direct                1545.4     1609.4      +4.1%
>> > 

... If we see the performance of buffered and dontcache in baseline case,
then we don't see dontcache doing any good. Even the patched version is
just slightly better compared to buffered case.

But IIUC, dontcache should really shine in cases where we have buffered
writers dirtying the page cache pages which can overflow the RAM size
[1]. The reason why dontcache should show benefit there is, because we
don't see any page cache pressure, since after writeback the pages get
evicted. Also earlier in the unpatched version, the I/O submission
happens immediately in the same context.

So, I guess, isn't it better to evaluate those scenarios as well with
the patched version - since this series affects those code paths now?

[1]: https://lore.kernel.org/all/20241110152906.1747545-11-axboe@kernel.dk/

>> 
>> Nice :)
>> Some explanation here of why 5x improvement with NFS compared to local
>> filesystems please?
>> (I am not much aware of NFS side, but a possible reasoning would help)
>> 
>
> I suspect that it's because of the "scattered" nature of nfsd writes.
> When the client sends a write to nfsd, we wake a nfsd thread to service
> it. So, if there are a lot of writes operating in parallel, they all
> get done in the context of different tasks.
>
> My hunch is that this I/O pattern (writing to same file from a bunch of
> different threads), particularly suffers from the DONTCACHE inline
> write behavior. The threads all end up competing to submit jobs to the
> queue and that causes the performance to fall off sharply.
>

Thanks!

-ritesh

^ permalink raw reply

* Re: [LSF/MM/BPF TOPIC][RFC PATCH v4 00/27] Private Memory Nodes (w/ Compressed RAM)
From: Gregory Price @ 2026-04-27 22:28 UTC (permalink / raw)
  To: Arun George
  Cc: lsf-pc, linux-kernel, linux-cxl, cgroups, linux-mm,
	linux-trace-kernel, damon, kernel-team, gregkh, rafael, dakr,
	dave, jonathan.cameron, dave.jiang, alison.schofield,
	vishal.l.verma, ira.weiny, dan.j.williams, longman, akpm, david,
	lorenzo.stoakes, Liam.Howlett, vbabka, rppt, surenb, mhocko,
	osalvador, ziy, matthew.brost, joshua.hahnjy, rakie.kim,
	byungchul, ying.huang, apopple, axelrasmussen, yuanchu, weixugc,
	yury.norov, linux, mhiramat, mathieu.desnoyers, tj, hannes,
	mkoutny, jackmanb, sj, baolin.wang, npache, ryan.roberts,
	dev.jain, baohua, lance.yang, muchun.song, xu.xin16,
	chengming.zhou, jannh, linmiaohe, nao.horiguchi, pfalcato,
	rientjes, shakeel.butt, riel, harry.yoo, cl, roman.gushchin,
	chrisl, kasong, shikemeng, nphamcs, bhe, zhengqi.arch,
	terry.bowman, gost.dev, arungeorge05, cpgs
In-Reply-To: <1983025922.01777297382206.JavaMail.epsvc@epcpadp2new>

On Mon, Apr 27, 2026 at 06:02:57PM +0530, Arun George wrote:
> 
> Appreciate the work as we also chase the same problem statement.
> A few queries please.
> 
> I see the current support relies on read-only mappings which might
> limit the performance. Any particular workload you are targeting with
> this (which can tolerate this latency)?
>
> Any deployments you think of where the goal is a capacity expansion
> with a compromise in performance?
>

Primary use cases for us are any workload that benefits from zswap -
which is many, many (many, many [many, many]) workloads.

That said, performance is quite irrelevant if you cannot guarantee
correctness.

In a scenario where a multi-threaded CPU can write many many GB/s to
a compressed device - I can't see a scenario where completely
uncontended writes to such a device can provide reliability.

I suppose you could increase the latency of a writable cacheline from
Xns to NXns - but you've only slowed the bear down.  Meanwhile, running
away from said bear includes trying to migrate stuff off the device...
presumably to swap - so your migration process has to have higher
throughput than whatever writes are coming in from the CPU.

Meanwhile - the system is clearly already pressured, and is likely to
continue demoting new data to the compressed tier.

So you end up, at best, in a footrace hoping the bear loses interest,
or at worst in a fight hoping to dodge its claws (generating poison on
some write that fails).

> On the device side, are you targeting beyond compressed RAM like
> devices such as memory with NAND etc.?
> 

For private nodes - I have been collecting use cases, but I haven't seen
a NAND proposal.  Unless someone is willing to demonstrate such a device
actually working without causing bus-lockup issues, most believe the
error-recovery overhead for NAND is too expensive to service cacheline
fetches.

> The TL;DR talked about mmap/mbind way of user space allocation from
> the private node. But the allocation is controlled by GFP flag
> N_MEMORY_PRIVATE. Does the user space path of allocation set this
> flag along the way?
> 

No.  Userspace does mbind() and it works - if the device's driver (or
service) has opted that node into allowing mempolicy syscalls.

The kernel injects the __GFP_PRIVATE for the relevant VMA in the vma
fault path if that VMA has a nodemask with a valid private node.

> And I believe the bear-proof cage might work in the normal scenarios,
> but may not work for all.

If it can't work for all workloads, then it's likely not general purpose
enough to find core kernel support and should seek to use the existing
interfaces (DAX and friends).

> We might not be able to rely on the control
> path (backpressure) fully. The control path could go slow, slower and
> even die as well. Should the device respond with something like
> 'bus error' if the host tries to write when it is not capable of
> taking any more writes?
> 

You need two controls over compressed RAM for it to be reliable:

  - Allocation control (acquiring new struct page to write to)
  - Write-control (preventing new writes to compressed pages)

Private nodes provide the allocation control.

A read-only mapping, and guarantee that only memory that can reach
the device is userland memory - is the only way to control the cpu
writes from the OS perspective.

(Bonus: page cache can't live here, because buffered I/O bypasses
 this by using direct writes from the kernel).

Slowing the bus down just puts you in competition with swap, and bus
error is basically equivalent to poison being reported at write time.

That's basically the whole story.

Loosening the write-protection can be seen as trading optimization
for risk - where the risk is hitting poison in userland-only memory.

In the next version of the RFC I'll demonstrate cram.c as a new swap
backend that allows for read-only mappings to be soft-faulted in,
migration on write, isolation to ANON memory, and some optional 
settings that allow a device or administrator a "writable budget" 
which allows some number of pages to be made writable without migration.

~Gregory

^ permalink raw reply

* Re: [PATCH 7.2 v16 09/13] mm/khugepaged: introduce collapse_allowable_orders helper function
From: David Hildenbrand (Arm) @ 2026-04-27 20:24 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260419185750.260784-10-npache@redhat.com>

On 4/19/26 20:57, Nico Pache wrote:
> Add collapse_allowable_orders() to generalize THP order eligibility. The
> function determines which THP orders are permitted based on collapse
> context (khugepaged vs madv_collapse).
> 
> This consolidates collapse configuration logic and provides a clean
> interface for future mTHP collapse support where the orders may be
> different.
> 
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---


[...]

>  	cc = kmalloc_obj(*cc);
> diff --git a/mm/vma.c b/mm/vma.c
> index 377321b48734..c0398fb597b3 100644
> --- a/mm/vma.c
> +++ b/mm/vma.c
> @@ -989,7 +989,7 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
>  		goto abort;
>  
>  	vma_set_flags_mask(vmg->target, sticky_flags);
> -	khugepaged_enter_vma(vmg->target, vmg->vm_flags);
> +	khugepaged_enter_vma(vmg->target);
>  	vmg->state = VMA_MERGE_SUCCESS;
>  	return vmg->target;
>  
> @@ -1110,7 +1110,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg)
>  	 * following VMA if we have VMAs on both sides.
>  	 */
>  	if (vmg->target && !vma_expand(vmg)) {
> -		khugepaged_enter_vma(vmg->target, vmg->vm_flags);
> +		khugepaged_enter_vma(vmg->target);
>  		vmg->state = VMA_MERGE_SUCCESS;
>  		return vmg->target;
>  	}
> @@ -2589,7 +2589,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap,
>  	 * call covers the non-merge case.
>  	 */
>  	if (!vma_is_anonymous(vma))
> -		khugepaged_enter_vma(vma, map->vm_flags);
> +		khugepaged_enter_vma(vma);
>  	*vmap = vma;

Are you sure that in all cases, vma->vm_flags already corresponds to
vmg->vm_flags / map->vm_flags?


That's a change that makes this patch unnecessarily hard to follow, in particular,
because it's not documented in the patch description.

If you think the change is fine, you should move that into a separate
cleanup patch where you only drop the flags parameter from khugepaged_enter_vma().

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH 7.2 v16 07/13] mm/khugepaged: add per-order mTHP collapse failure statistics
From: David Hildenbrand (Arm) @ 2026-04-27 20:21 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260419185750.260784-8-npache@redhat.com>

On 4/19/26 20:57, Nico Pache wrote:
> Add three new mTHP statistics to track collapse failures for different
> orders when encountering swap PTEs, excessive none PTEs, and shared PTEs:
> 
> - collapse_exceed_swap_pte: Increment when mTHP collapse fails due to swap
> 	PTEs
> 
> - collapse_exceed_none_pte: Counts when mTHP collapse fails due to
>   	exceeding the none PTE threshold for the given order
> 
> - collapse_exceed_shared_pte: Counts when mTHP collapse fails due to shared
>   	PTEs
> 
> These statistics complement the existing THP_SCAN_EXCEED_* events by
> providing per-order granularity for mTHP collapse attempts. The stats are
> exposed via sysfs under
> `/sys/kernel/mm/transparent_hugepage/hugepages-*/stats/` for each
> supported hugepage size.
> 
> As we currently dont support collapsing mTHPs that contain a swap or

s/dont/do not/

> shared entry, those statistics keep track of how often we are
> encountering failed mTHP collapses due to these restrictions.
> 
> Now that we plan to support mTHP collapse for anon pages, lets also track

"We will add support for mTHP collapse for anonymous pages next; let's also ..."

> when this happens at the PMD level within the per-mTHP stats.

What about file collapse? For example, we do adjust
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE) and
count_vm_event(THP_SCAN_EXCEED_NONE_PTE) there.

Wouldn't we want to update the HPAGE_PMD_ORDER side of things there already? Or
would we want to use a different counter for that?

> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  Documentation/admin-guide/mm/transhuge.rst | 24 ++++++++++++++++++++++
>  include/linux/huge_mm.h                    |  3 +++
>  mm/huge_memory.c                           |  7 +++++++
>  mm/khugepaged.c                            | 21 +++++++++++++++++--
>  4 files changed, 53 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst
> index c51932e6275d..eebb1f6bbc6c 100644
> --- a/Documentation/admin-guide/mm/transhuge.rst
> +++ b/Documentation/admin-guide/mm/transhuge.rst
> @@ -714,6 +714,30 @@ nr_anon_partially_mapped
>         an anonymous THP as "partially mapped" and count it here, even though it
>         is not actually partially mapped anymore.
>  
> +collapse_exceed_none_pte
> +       The number of collapse attempts that failed due to exceeding the
> +       max_ptes_none threshold. For mTHP collapse, Currently only max_ptes_none
> +       values of 0 and (HPAGE_PMD_NR - 1) are supported. Any other value will
> +       emit a warning and no mTHP collapse will be attempted. khugepaged will
> +       try to collapse to the largest enabled (m)THP size; if it fails, it will
> +       try the next lower enabled mTHP size. This counter records the number of
> +       times a collapse attempt was skipped for exceeding the max_ptes_none
> +       threshold, and khugepaged will move on to the next available mTHP size.

Why is everything after the first sentence worth documenting here? This doesn't
read like it belongs to a failure counter?

> +
> +collapse_exceed_swap_pte
> +       The number of anonymous mTHP PTE ranges which were unable to collapse due
> +       to containing at least one swap PTE. Currently khugepaged does not
> +       support collapsing mTHP regions that contain a swap PTE. This counter can
> +       be used to monitor the number of khugepaged mTHP collapses that failed
> +       due to the presence of a swap PTE.

Can we similarly simplify that (and make it consistent with the one above) to

"The number of collapse attempts that failed due to exceeding the max_ptes_swap
threshold."

> +
> +collapse_exceed_shared_pte
> +       The number of anonymous mTHP PTE ranges which were unable to collapse due
> +       to containing at least one shared PTE. Currently khugepaged does not
> +       support collapsing mTHP PTE ranges that contain a shared PTE. This
> +       counter can be used to monitor the number of khugepaged mTHP collapses
> +       that failed due to the presence of a shared PTE.

Same here

"The number of collapse attempts that failed due to exceeding the
max_ptes_shared threshold."

?

> +

[...]

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH 7.2 v16 05/13] mm/khugepaged: generalize collapse_huge_page for mTHP collapse
From: David Hildenbrand (Arm) @ 2026-04-27 20:13 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260419185750.260784-6-npache@redhat.com>

On 4/19/26 20:57, Nico Pache wrote:
> Pass an order and offset to collapse_huge_page to support collapsing anon
> memory to arbitrary orders within a PMD. order indicates what mTHP size we
> are attempting to collapse to, and offset indicates were in the PMD to
> start the collapse attempt.
> 
> For non-PMD collapse we must leave the anon VMA write locked until after
> we collapse the mTHP-- in the PMD case all the pages are isolated, but in
> the mTHP case this is not true, and we must keep the lock to prevent
> access/changes to the page tables. This can happen if the rmap walkers hit
> a pmd_none while the PMD entry is currently unavailable due to being
> temporarily removed during the collapse phase.
> 
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 103 +++++++++++++++++++++++++++---------------------
>  1 file changed, 57 insertions(+), 46 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 283bb63854a5..ff6f9f1883ed 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1198,42 +1198,36 @@ static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_stru
>  	return SCAN_SUCCEED;
>  }
>  
> -static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long address,
> -		int referenced, int unmapped, struct collapse_control *cc)
> +static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long start_addr,
> +		int referenced, int unmapped, struct collapse_control *cc,
> +		unsigned int order)
>  {
>  	LIST_HEAD(compound_pagelist);
>  	pmd_t *pmd, _pmd;
> -	pte_t *pte;
> +	pte_t *pte = NULL;
>  	pgtable_t pgtable;
>  	struct folio *folio;
>  	spinlock_t *pmd_ptl, *pte_ptl;
>  	enum scan_result result = SCAN_FAIL;
>  	struct vm_area_struct *vma;
>  	struct mmu_notifier_range range;
> +	bool anon_vma_locked = false;
> +	const unsigned long pmd_addr = start_addr & HPAGE_PMD_MASK;
> +	const unsigned long end_addr = start_addr + (PAGE_SIZE << order);

In general, const declarations read better when they are at the very top of this list.

>  
> -	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
> -
> -	/*
> -	 * Before allocating the hugepage, release the mmap_lock read lock.
> -	 * The allocation can take potentially a long time if it involves
> -	 * sync compaction, and we do not need to hold the mmap_lock during
> -	 * that. We will recheck the vma after taking it again in write mode.
> -	 */
> -	mmap_read_unlock(mm);
> -

You should spell out that locking change (moving it to the caller), and why it
is required, in the patch description.

I'd even have put this into a separate patch, as it's independent of the
order-passing changes.
[...]

>  	 */
>  	__folio_mark_uptodate(folio);
> -	pgtable = pmd_pgtable(_pmd);
> -
>  	spin_lock(pmd_ptl);
> -	BUG_ON(!pmd_none(*pmd));
> -	pgtable_trans_huge_deposit(mm, pmd, pgtable);
> -	map_anon_folio_pmd_nopf(folio, pmd, vma, address);
> +	WARN_ON_ONCE(!pmd_none(*pmd));
> +	if (is_pmd_order(order)) { /* PMD collapse */
> +		pgtable = pmd_pgtable(_pmd);
> +		pgtable_trans_huge_deposit(mm, pmd, pgtable);
> +		map_anon_folio_pmd_nopf(folio, pmd, vma, pmd_addr);
> +	} else { /* mTHP collapse */

Do both these comments (PMD collapse ...) really add any value? I'd say it's
pretty self-documenting code already.

> +		map_anon_folio_pte_nopf(folio, pte, vma, start_addr, /*uffd_wp=*/ false);
> +		smp_wmb(); /* make PTEs visible before PMD. See pmd_install() */
> +		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
> +	}
>  	spin_unlock(pmd_ptl);
>  
>  	folio = NULL;
>  
>  	result = SCAN_SUCCEED;
>  out_up_write:
> +	if (anon_vma_locked)
> +		anon_vma_unlock_write(vma->anon_vma);
> +	if (pte)
> +		pte_unmap(pte);

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH 7.2 v16 04/13] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: David Hildenbrand (Arm) @ 2026-04-27 20:07 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260419185750.260784-5-npache@redhat.com>

On 4/19/26 20:57, Nico Pache wrote:
> generalize the order of the __collapse_huge_page_* and collapse_max_*
> functions to support future mTHP collapse.
> 
> The current mechanism for determining collapse with the
> khugepaged_max_ptes_none value is not designed with mTHP in mind. This
> raises a key design issue: if we support user defined max_pte_none values
> (even those scaled by order), a collapse of a lower order can introduces
> an feedback loop, or "creep", when max_ptes_none is set to a value greater
> than HPAGE_PMD_NR / 2.
> 
> With this configuration, a successful collapse to order N will populate
> enough pages to satisfy the collapse condition on order N+1 on the next
> scan. This leads to unnecessary work and memory churn.

You could add a link here to previous discussions.

> 
> To fix this issue introduce a helper function that will limit mTHP
> collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
> This effectively supports two modes:
> 
> - max_ptes_none=0: never introduce new none-pages for mTHP collapse.

"introduce" reads wrong in this context. And I don't know what a "none-page" is :)

"never collapses if it encounters an empty PTE or a PTE that maps the shared
zeropage. Consequently, no memory bloat."

> - max_ptes_none=511 (on 4k pagesz): Always collapse to the highest
>   available mTHP order.
> 
> This removes the possiblilty of "creep", while not modifying any uAPI
> expectations. A warning will be emitted if any non-supported
> max_ptes_none value is configured with mTHP enabled.
> 
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
> 
> No functional changes in this patch; however it defines future behavior
> for mTHP collapse.
> 
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 124 ++++++++++++++++++++++++++++++++++--------------
>  1 file changed, 88 insertions(+), 36 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index f42b55421191..283bb63854a5 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -352,51 +352,86 @@ static bool pte_none_or_zero(pte_t pte)
>   * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse
>   * @cc: The collapse control struct
>   * @vma: The vma to check for userfaultfd
> + * @order: The folio order being collapsed to
>   *
>   * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
> - * empty page.
> + * empty page. For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the
> + * configured khugepaged_max_ptes_none value.
> + *
> + * For mTHP collapses, we currently only support khugepaged_max_pte_none values
> + * of 0 or (KHUGEPAGED_MAX_PTES_LIMIT). Any other value will emit a warning and
> + * no mTHP collapse will be attempted

Not sure if we discussed it (and maybe I had a different opinion back then ...),
but could we simply fall back to max_ptes_none=0, so we can avoid returning
errors here?

max_ptes_none=0 is ok, because we will not waste any memory. The warning clearly
tells the user that this combination is not supported as is.

... and it would make this function a lot easier to handle. In the warning, we
can just state that we are "falling back to max_ptes_none = 0".


[...]

>  
>  /**
>   * collapse_max_ptes_shared - Calculate maximum allowed shared PTEs for collapse
>   * @cc: The collapse control struct
> + * @order: The folio order being collapsed to
>   *
>   * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
>   * shared page.
>   *
> + * For mTHP collapses, we currently dont support collapsing memory with
> + * shared memory.

"do not"

"shared memory" is misleading, as we do support shmem. What you mean is maybe
"collapsing with anonymous memory pages that are shared between processes
through CoW" or something like that?

> + *
>   * Return: Maximum number of shared PTEs allowed for the collapse operation
>   */
> -static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
> +static unsigned int collapse_max_ptes_shared(struct collapse_control *cc,
> +		unsigned int order)
>  {
>  	if (!cc->is_khugepaged)
>  		return HPAGE_PMD_NR;
> +	if (!is_pmd_order(order))
> +		return 0;
> +
>  	return khugepaged_max_ptes_shared;
>  }
>  
>  /**
>   * collapse_max_ptes_swap - Calculate maximum allowed swap PTEs for collapse
>   * @cc: The collapse control struct
> + * @order: The folio order being collapsed to
>   *
>   * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
>   * swap page.
>   *
> + * For PMD-sized collapses (order == HPAGE_PMD_ORDER), use the configured
> + * khugepaged_max_ptes_swap value.
> + *
> + * For mTHP collapses, we currently dont support collapsing memory with
> + * swapped out memory.

"do not". Given that this is also used for the pagecache, can we make this clearer?

> + *
>   * Return: Maximum number of swap PTEs allowed for the collapse operation
>   */
> -static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
> +static unsigned int collapse_max_ptes_swap(struct collapse_control *cc,
> +		unsigned int order)
-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH 7.2 v16 04/13] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: David Hildenbrand (Arm) @ 2026-04-27 20:06 UTC (permalink / raw)
  To: Usama Arif, Nico Pache
  Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, akpm,
	anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
	catalin.marinas, cl, corbet, dave.hansen, dev.jain, gourry,
	hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
	lance.yang, Liam.Howlett, ljs, mathieu.desnoyers, matthew.brost,
	mhiramat, mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
	richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
	sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
	vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
	zokeefe
In-Reply-To: <20260420135554.27067-1-usama.arif@linux.dev>


> 
>> +	pr_warn_once("mTHP collapse only supports max_ptes_none values of 0 or %u\n",
>> +		      KHUGEPAGED_MAX_PTES_LIMIT);
> 
> IMO, warn_once can get lost quickly in dmesg. Maybe pr_warn_ratelimited?
> 
> Not sure what others opinions are..

pr_warn_ratelimited() still creates *a lot* of noise from a system daemon ...

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH 7.2 v16 03/13] mm/khugepaged: rework max_ptes_* handling with helper functions
From: David Hildenbrand (Arm) @ 2026-04-27 19:52 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260419185750.260784-4-npache@redhat.com>

On 4/19/26 20:57, Nico Pache wrote:
> The following cleanup reworks all the max_ptes_* handling into helper
> functions. This increases the code readability and will later be used to
> implement the mTHP handling of these variables.
> 
> With these changes we abstract all the madvise_collapse() special casing
> (dont respect the sysctls) away from the functions that utilize them. And
> will later in this series to cleanly restrict mTHP collapses behaviors.
> 
> Suggested-by: David Hildenbrand <david@kernel.org>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
>  mm/khugepaged.c | 114 +++++++++++++++++++++++++++++++++---------------
>  1 file changed, 78 insertions(+), 36 deletions(-)
> 
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index afac6bc4e76d..f42b55421191 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -348,6 +348,58 @@ static bool pte_none_or_zero(pte_t pte)
>  	return pte_present(pte) && is_zero_pfn(pte_pfn(pte));
>  }
>  
> +/**
> + * collapse_max_ptes_none - Calculate maximum allowed empty PTEs for collapse

Empty PTE or PTE mapping the shared zeropage? That should also be clarified below.

> + * @cc: The collapse control struct
> + * @vma: The vma to check for userfaultfd
> + *
> + * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
> + * empty page.

Not completely accurate due to uffd. And it's not really "empty page".

Is that information really necessary for the caller? I'd suggest you drop this
here and instead add a comment inline above the "return HPAGE_PMD_NR;".

> + *
> + * Return: Maximum number of empty PTEs allowed for the collapse operation
> + */
> +static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
> +		struct vm_area_struct *vma)
> +{
> +	if (vma && userfaultfd_armed(vma))
> +		return 0;
> +	if (!cc->is_khugepaged)
> +		return HPAGE_PMD_NR;
> +	return khugepaged_max_ptes_none;
> +}
> +
> +/**
> + * collapse_max_ptes_shared - Calculate maximum allowed shared PTEs for collapse

"shared PTE" is not quite clear.

"PTEs that map shared anonymous pages" ?

> + * @cc: The collapse control struct
> + *
> + * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
> + * shared page.

Same comment as above.

> + *
> + * Return: Maximum number of shared PTEs allowed for the collapse operation
> + */
> +static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
> +{
> +	if (!cc->is_khugepaged)
> +		return HPAGE_PMD_NR;
> +	return khugepaged_max_ptes_shared;
> +}
> +
> +/**
> + * collapse_max_ptes_swap - Calculate maximum allowed swap PTEs for collapse

We're actually checking non-present page table entries (anonymous THP collapse)
or non-present pagecache entries (file THP collapse).

I wonder if there is an easy way to clarify that here, at least in the
description (confusing name can stay unless we find something better).

> + * @cc: The collapse control struct
> + *
> + * If we are not in khugepaged mode use HPAGE_PMD_NR to allow any
> + * swap page.

Ditto.

> + *
> + * Return: Maximum number of swap PTEs allowed for the collapse operation
> + */
> +static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
> +{
> +	if (!cc->is_khugepaged)
> +		return HPAGE_PMD_NR;
> +	return khugepaged_max_ptes_swap;
> +}
> +
>  int hugepage_madvise(struct vm_area_struct *vma,
>  		     vm_flags_t *vm_flags, int advice)
>  {
> @@ -546,21 +598,19 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  	pte_t *_pte;
>  	int none_or_zero = 0, shared = 0, referenced = 0;
>  	enum scan_result result = SCAN_FAIL;
> +	unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
> +	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);

These could be const, right? Or will that change in future patches?

>  
>  	for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
>  	     _pte++, addr += PAGE_SIZE) {
>  		pte_t pteval = ptep_get(_pte);
>  		if (pte_none_or_zero(pteval)) {
> -			++none_or_zero;
> -			if (!userfaultfd_armed(vma) &&
> -			    (!cc->is_khugepaged ||
> -			     none_or_zero <= khugepaged_max_ptes_none)) {
> -				continue;
> -			} else {
> +			if (++none_or_zero > max_ptes_none) {
>  				result = SCAN_EXCEED_NONE_PTE;
>  				count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
>  				goto out;
>  			}
> +			continue;
>  		}
>  		if (!pte_present(pteval)) {
>  			result = SCAN_PTE_NON_PRESENT;
> @@ -591,9 +641,7 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
>  
>  		/* See collapse_scan_pmd(). */
>  		if (folio_maybe_mapped_shared(folio)) {
> -			++shared;
> -			if (cc->is_khugepaged &&
> -			    shared > khugepaged_max_ptes_shared) {
> +			if (++shared > max_ptes_shared) {
>  				result = SCAN_EXCEED_SHARED_PTE;
>  				count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
>  				goto out;
> @@ -1270,6 +1318,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
>  	unsigned long addr;
>  	spinlock_t *ptl;
>  	int node = NUMA_NO_NODE, unmapped = 0;
> +	unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
> +	unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
> +	unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);

Same question here.

>  
>  	VM_BUG_ON(start_addr & ~HPAGE_PMD_MASK);
>  


In general, LGTM. With the doc fixed up

Acked-by: David Hildenbrand (Arm) <david@kernel.org>

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH 7.2 v16 02/13] mm/khugepaged: generalize alloc_charge_folio()
From: David Hildenbrand (Arm) @ 2026-04-27 19:41 UTC (permalink / raw)
  To: Nico Pache, linux-doc, linux-kernel, linux-mm, linux-trace-kernel
  Cc: aarcange, akpm, anshuman.khandual, apopple, baohua, baolin.wang,
	byungchul, catalin.marinas, cl, corbet, dave.hansen, dev.jain,
	gourry, hannes, hughd, jack, jackmanb, jannh, jglisse,
	joshua.hahnjy, kas, lance.yang, Liam.Howlett, ljs,
	mathieu.desnoyers, matthew.brost, mhiramat, mhocko, peterx,
	pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang, rientjes,
	rostedt, rppt, ryan.roberts, shivankg, sunnanyong, surenb,
	thomas.hellstrom, tiwai, usamaarif642, vbabka, vishal.moola,
	wangkefeng.wang, will, willy, yang, ying.huang, ziy, zokeefe
In-Reply-To: <20260419185750.260784-3-npache@redhat.com>

On 4/19/26 20:57, Nico Pache wrote:
> From: Dev Jain <dev.jain@arm.com>
> 
> Pass order to alloc_charge_folio() and update mTHP statistics.
> 
> Reviewed-by: Wei Yang <richard.weiyang@gmail.com>
> Reviewed-by: Lance Yang <lance.yang@linux.dev>
> Reviewed-by: Baolin Wang <baolin.wang@linux.alibaba.com>
> Reviewed-by: Lorenzo Stoakes <ljs@kernel.org>
> Reviewed-by: Zi Yan <ziy@nvidia.com>
> Acked-by: David Hildenbrand (Arm) <david@kernel.org>
> Co-developed-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>

Your SOB should come last, the order represents the history of this patch:

Signed-off-by: Dev Jain <dev.jain@arm.com>
Co-developed-by: Nico Pache <npache@redhat.com>
Signed-off-by: Nico Pache <npache@redhat.com>

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH] Documentation/rv: Replace stale website link
From: Steven Rostedt @ 2026-04-27 16:45 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: Jonathan Corbet, rdunlap, linux-trace-kernel, linux-doc,
	linux-kernel, matteo.martelli, skhan
In-Reply-To: <93666b516d93f880ed14c3b9309e203014a7deb0.camel@redhat.com>

On Mon, 27 Apr 2026 14:56:46 +0200
Gabriele Monaco <gmonaco@redhat.com> wrote:

> > 
> > I will defer to others in the end, but to me it seems that we should
> > make life easier for our readers whenever we can.  Providing a link
> > seems better than requiring them to search for it themselves.  
> 
> Alright, makes sense. I'm going to send a V2 with [1] (the open access
> PDF), in the remote case the link stops working, we can update it.

Can you add both?

[1] - Daniel Bristot de Oliveira et al.: A thread synchronization model for the PREEMPT_RT Linux kernel, J. Syst. Archit., 2020.
      https://www.iris.sssup.it/bitstream/11382/533630/1/Elsevier-JSA-2020.pdf

 ?

-- Steve

^ permalink raw reply

* Re: [PATCH v2] Documentation/rv: Replace stale website link
From: Randy Dunlap @ 2026-04-27 16:50 UTC (permalink / raw)
  To: Gabriele Monaco, Steven Rostedt, Jonathan Corbet,
	linux-trace-kernel, linux-doc, linux-kernel
  Cc: matteo.martelli, skhan
In-Reply-To: <20260427131709.170505-2-gmonaco@redhat.com>



On 4/27/26 6:17 AM, Gabriele Monaco wrote:
> The sched monitor page was linking to Daniel's website which is now
> down. The main purpose of the link was to point to a source for the
> models from the original author and that can be found also in his
> published paper.
> 
> Replace the link with a reference to Daniel's "A thread synchronization
> model for the PREEMPT_RT Linux kernel" which can be found online and
> includes the models definitions as well as the work behind them (not the
> original patches but since they're based on a 5.0 kernel and are mostly
> included upstream, there's little value in keeping them in the docs).
> 
> Fixes: 03abeaa63c08 ("Documentation/rv: Add docs for the sched monitors")
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
> ---
> V2: Add link to the PDF and fixed RST references
> 
>  Documentation/trace/rv/monitor_sched.rst | 7 +++++--
>  1 file changed, 5 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/trace/rv/monitor_sched.rst b/Documentation/trace/rv/monitor_sched.rst
> index 0b96d6e147c6..d3ba7edc202f 100644
> --- a/Documentation/trace/rv/monitor_sched.rst
> +++ b/Documentation/trace/rv/monitor_sched.rst
> @@ -36,7 +36,7 @@ Specifications
>  --------------
>  
>  The specifications included in sched are currently a work in progress, adapting the ones
> -defined in by Daniel Bristot in [1].
> +defined by Daniel Bristot in [1]_.
>  
>  Currently we included the following:
>  
> @@ -365,4 +365,7 @@ constraints when processing the events::
>  References
>  ----------
>  
> -[1] - https://bristot.me/linux-task-model
> +.. [1] Daniel Bristot de Oliveira et al.:
> +       `A thread synchronization model for the PREEMPT_RT Linux kernel
> +       <https://www.iris.sssup.it/bitstream/11382/533630/1/Elsevier-JSA-2020.pdf>`_,
> +       J. Syst. Archit., 2020.
> 
> base-commit: 254f49634ee16a731174d2ae34bc50bd5f45e731

Tested-by: Randy Dunlap <rdunlap@infradead.org>
Acked-by: Randy Dunlap <rdunlap@infradead.org>

although I don't care for the "J. Syst. Archit." abbreviation.
Does JSA use that? Not that I can see.

thanks.
-- 
~Randy

^ permalink raw reply

* Re: [PATCH v5 1/2] blk-mq: add tracepoint block_rq_tag_wait
From: Steven Rostedt @ 2026-04-27 16:38 UTC (permalink / raw)
  To: Aaron Tomlin
  Cc: axboe, mhiramat, mathieu.desnoyers, bvanassche,
	johannes.thumshirn, kch, dlemoal, ritesh.list, loberman, neelx,
	sean, mproche, chjohnst, linux-block, linux-kernel,
	linux-trace-kernel
In-Reply-To: <20260427020142.358912-2-atomlin@atomlin.com>

On Sun, 26 Apr 2026 22:01:41 -0400
Aaron Tomlin <atomlin@atomlin.com> wrote:

> +TRACE_EVENT(block_rq_tag_wait,
> +
> +	TP_PROTO(struct request_queue *q, struct blk_mq_hw_ctx *hctx, bool is_sched_tag),
> +
> +	TP_ARGS(q, hctx, is_sched_tag),
> +
> +	TP_STRUCT__entry(
> +		__field( dev_t,		dev			)
> +		__field( u32,		hctx_id			)
> +		__field( u32,		nr_tags			)
> +		__field( bool,		is_sched_tag		)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->dev		= q->disk ? disk_devt(q->disk);

Hmm, does the above even compile?

-- Steve

> +		__entry->hctx_id	= hctx->queue_num;
> +		__entry->is_sched_tag	= is_sched_tag;
> +
> +		if (is_sched_tag)
> +			__entry->nr_tags = hctx->sched_tags->nr_tags;
> +		else
> +			__entry->nr_tags = hctx->tags->nr_tags;
> +	),
> +
> +	TP_printk("%d,%d hctx=%u starved on %s tags (depth=%u)",
> +		  MAJOR(__entry->dev), MINOR(__entry->dev),
> +		  __entry->hctx_id,
> +		  __entry->is_sched_tag ? "scheduler" : "hardware",
> +		  __entry->nr_tags)
> +);
> +

^ permalink raw reply

* [RFC PATCH 12/12] rv: Add KUnit tests for some LTL monitors
From: Gabriele Monaco @ 2026-04-27 15:11 UTC (permalink / raw)
  To: linux-trace-kernel, linux-kernel, Steven Rostedt, Gabriele Monaco,
	Masami Hiramatsu
  Cc: Nam Cao, Thomas Weissschuh, Tomas Glozar, John Kacur, Wen Yang
In-Reply-To: <20260427151134.192971-1-gmonaco@redhat.com>

Validate the functionality of LTL monitors by injecting events in a
controlled environment (KUnit) and expecting reactions, just like it is
done in DA monitors.

Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>
---
 include/rv/ltl_monitor.h                      | 31 ++++++++++++++
 .../trace/rv/monitors/pagefault/pagefault.c   | 24 +++++++++++
 kernel/trace/rv/monitors/sleep/sleep.c        | 42 +++++++++++++++++++
 kernel/trace/rv/rv_monitors_test.c            |  2 +
 kernel/trace/rv/rv_monitors_test.h            | 18 ++++++++
 5 files changed, 117 insertions(+)

diff --git a/include/rv/ltl_monitor.h b/include/rv/ltl_monitor.h
index 0f2c3820b9b8..49e680b769f6 100644
--- a/include/rv/ltl_monitor.h
+++ b/include/rv/ltl_monitor.h
@@ -172,3 +172,34 @@ static void __maybe_unused ltl_atom_pulse(struct task_struct *task, enum ltl_ato
 	ltl_atom_set(mon, atom, !value);
 	ltl_validate(task, mon);
 }
+
+#ifdef CONFIG_RV_MONITORS_KUNIT_TEST
+#include <kunit/test.h>
+
+/*
+ * ltl_teardown_test - Disable the monitor for a kunit test
+ */
+static inline void ltl_teardown_test(void *arg)
+{
+	struct rv_monitor *rv_this = arg;
+
+	rv_this->enabled = 0;
+	ltl_monitor_destroy();
+}
+
+/*
+ * ltl_prepare_test - Enable the monitor for a kunit test
+ *
+ * Do the bare minimum to set up the monitor, make sure it is not active and
+ * real tracepoint handlers are NOT attached.
+ */
+static inline void ltl_prepare_test(struct kunit *test, struct rv_monitor *rv_this)
+{
+	KUNIT_ASSERT_FALSE(test, rv_this->enabled);
+	ltl_monitor_init();
+	rv_this->enabled = 1;
+
+	KUNIT_ASSERT_EQ(test, 0,
+			kunit_add_action_or_reset(test, ltl_teardown_test, rv_this));
+}
+#endif /* CONFIG_RV_MONITORS_KUNIT_TEST */
diff --git a/kernel/trace/rv/monitors/pagefault/pagefault.c b/kernel/trace/rv/monitors/pagefault/pagefault.c
index 56abe5079676..1849d7b81545 100644
--- a/kernel/trace/rv/monitors/pagefault/pagefault.c
+++ b/kernel/trace/rv/monitors/pagefault/pagefault.c
@@ -86,3 +86,27 @@ module_exit(unregister_pagefault);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
 MODULE_DESCRIPTION("pagefault: Monitor that RT tasks do not raise page faults");
+
+#ifdef CONFIG_RV_MONITORS_KUNIT_TEST
+#include "rv_monitors_test.h"
+
+void rv_test_pagefault(struct kunit *test)
+{
+	static struct task_struct *target;
+	struct rv_kunit_ctx *ctx = test->priv;
+
+	ltl_prepare_test(test, &rv_pagefault);
+	target = kunit_kzalloc(test, sizeof(struct task_struct), GFP_KERNEL);
+	target->policy = SCHED_FIFO;
+	target->prio = MAX_RT_PRIO - 1;
+	handle_task_newtask(NULL, target, 0);
+
+	ltl_attempt_start(target, ltl_get_monitor(target));
+
+	/* RT task has a page fault */
+	rv_mock_current(ctx, target);
+	handle_page_fault(NULL, 0, NULL, 0);
+	RV_KUNIT_EXPECT_REACTION(test, ctx);
+}
+EXPORT_SYMBOL_GPL(rv_test_pagefault);
+#endif
diff --git a/kernel/trace/rv/monitors/sleep/sleep.c b/kernel/trace/rv/monitors/sleep/sleep.c
index 7c16967add70..9388ffb55aa9 100644
--- a/kernel/trace/rv/monitors/sleep/sleep.c
+++ b/kernel/trace/rv/monitors/sleep/sleep.c
@@ -247,3 +247,45 @@ module_exit(unregister_sleep);
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Nam Cao <namcao@linutronix.de>");
 MODULE_DESCRIPTION("sleep: Monitor that RT tasks do not undesirably sleep");
+
+#ifdef CONFIG_RV_MONITORS_KUNIT_TEST
+#include "rv_monitors_test.h"
+
+void rv_test_sleep(struct kunit *test)
+{
+	static struct task_struct *target, *other;
+	struct rv_kunit_ctx *ctx = test->priv;
+	unsigned long args[6];
+	struct pt_regs regs;
+
+	ltl_prepare_test(test, &rv_sleep);
+	target = kunit_kzalloc(test, sizeof(struct task_struct), GFP_KERNEL);
+	target->policy = SCHED_FIFO;
+	target->prio = MAX_RT_PRIO - 2;
+	other = kunit_kzalloc(test, sizeof(struct task_struct), GFP_KERNEL);
+	other->policy = SCHED_FIFO;
+	other->prio = MAX_RT_PRIO - 1;
+	handle_task_newtask(NULL, target, 0);
+
+	/* RT task sleeps on a non RT-friendly nanosleep */
+	rv_mock_current(ctx, target);
+	args[0] = CLOCK_REALTIME;
+	syscall_set_arguments(target, &regs, args);
+	handle_sys_enter(NULL, &regs, __NR_clock_nanosleep);
+	handle_sys_exit(NULL, NULL, 0);
+	handle_sched_set_state(NULL, target, TASK_INTERRUPTIBLE);
+	RV_KUNIT_EXPECT_REACTION(test, ctx);
+
+	/* RT task woken up by lower priority task */
+	args[1] = FUTEX_WAIT;
+	syscall_set_arguments(target, &regs, args);
+	rv_mock_current(ctx, target);
+	handle_sys_enter(NULL, &regs, __NR_futex);
+	handle_sched_set_state(NULL, target, TASK_INTERRUPTIBLE);
+	rv_mock_current(ctx, other);
+	handle_sched_waking(NULL, target);
+	handle_sched_wakeup(NULL, target);
+	RV_KUNIT_EXPECT_REACTION(test, ctx);
+}
+EXPORT_SYMBOL_GPL(rv_test_sleep);
+#endif
diff --git a/kernel/trace/rv/rv_monitors_test.c b/kernel/trace/rv/rv_monitors_test.c
index a57a40932d34..78af77310d56 100644
--- a/kernel/trace/rv/rv_monitors_test.c
+++ b/kernel/trace/rv/rv_monitors_test.c
@@ -81,6 +81,8 @@ static struct kunit_case rv_trigger_test_cases[] = {
 	KUNIT_CASE(rv_test_sts),
 	KUNIT_CASE(rv_test_opid),
 	KUNIT_CASE(rv_test_nomiss),
+	KUNIT_CASE(rv_test_pagefault),
+	KUNIT_CASE(rv_test_sleep),
 	{}
 };
 
diff --git a/kernel/trace/rv/rv_monitors_test.h b/kernel/trace/rv/rv_monitors_test.h
index 3015943c5dda..c745edc85991 100644
--- a/kernel/trace/rv/rv_monitors_test.h
+++ b/kernel/trace/rv/rv_monitors_test.h
@@ -70,3 +70,21 @@ static inline void rv_test_nomiss(struct kunit *test)
 	kunit_skip(test, "Monitor not enabled\n");
 }
 #endif
+
+#ifdef CONFIG_RV_MON_PAGEFAULT
+extern void rv_test_pagefault(struct kunit *test);
+#else
+static inline void rv_test_pagefault(struct kunit *test)
+{
+	kunit_skip(test, "Monitor not enabled\n");
+}
+#endif
+
+#ifdef CONFIG_RV_MON_SLEEP
+extern void rv_test_sleep(struct kunit *test);
+#else
+static inline void rv_test_sleep(struct kunit *test)
+{
+	kunit_skip(test, "Monitor not enabled\n");
+}
+#endif
-- 
2.53.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox