Linux Documentation
 help / color / mirror / Atom feed
* [PATCH V10 05/10] famfs_fuse: GET_DAXDEV message and daxdev_table
From: John Groves @ 2026-03-31 12:38 UTC (permalink / raw)
  To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
	Alison Schofield
  Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
	Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
	David Hildenbrand, Christian Brauner, Darrick J . Wong,
	Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
	Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
	Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
	Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
	Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
	linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	John Groves
In-Reply-To: <0100019d43e5f632-f5862a3e-361c-4b54-a9a6-96c242a8f17a-000000@email.amazonses.com>

From: John Groves <john@groves.net>

- The new GET_DAXDEV message/response is added
- The famfs.c:famfs_teardown() function is added as a primary teardown
  function for famfs.
- The command it triggered by the update_daxdev_table() call, if there
  are any daxdevs in the subject fmap that are not represented in the
  daxdev_table yet.
- fs/namei.c: export may_open_dev()

Signed-off-by: John Groves <john@groves.net>
---
 fs/fuse/famfs.c           | 227 +++++++++++++++++++++++++++++++++++++-
 fs/fuse/famfs_kfmap.h     |  26 +++++
 fs/fuse/fuse_i.h          |  19 ++++
 fs/fuse/inode.c           |   7 +-
 fs/namei.c                |   1 +
 include/uapi/linux/fuse.h |  20 ++++
 6 files changed, 298 insertions(+), 2 deletions(-)

diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
index ac52e54e2cb5..0e9415aa6339 100644
--- a/fs/fuse/famfs.c
+++ b/fs/fuse/famfs.c
@@ -21,6 +21,228 @@
 #include "famfs_kfmap.h"
 #include "fuse_i.h"
 
+/*
+ * famfs_teardown()
+ *
+ * Deallocate famfs metadata for a fuse_conn
+ */
+void
+famfs_teardown(struct fuse_conn *fc)
+{
+	struct famfs_dax_devlist *devlist __free(kfree) = fc->dax_devlist;
+	int i;
+
+	fc->dax_devlist = NULL;
+
+	if (!devlist)
+		return;
+
+	if (!devlist->devlist)
+		return;
+
+	/* Close & release all the daxdevs in our table */
+	for (i = 0; i < devlist->nslots; i++) {
+		struct famfs_daxdev *dd = &devlist->devlist[i];
+
+		if (!dd->valid)
+			continue;
+
+		/* Release reference from dax_dev_get() */
+		if (dd->devp)
+			put_dax(dd->devp);
+
+		kfree(dd->name);
+	}
+	kfree(devlist->devlist);
+}
+
+static int
+famfs_verify_daxdev(const char *pathname, dev_t *devno)
+{
+	struct inode *inode;
+	struct path path;
+	int err;
+
+	if (!pathname || !*pathname)
+		return -EINVAL;
+
+	err = kern_path(pathname, LOOKUP_FOLLOW, &path);
+	if (err)
+		return err;
+
+	inode = d_backing_inode(path.dentry);
+	if (!S_ISCHR(inode->i_mode)) {
+		err = -EINVAL;
+		goto out_path_put;
+	}
+
+	if (!may_open_dev(&path)) { /* had to export this */
+		err = -EACCES;
+		goto out_path_put;
+	}
+
+	*devno = inode->i_rdev;
+
+out_path_put:
+	path_put(&path);
+	return err;
+}
+
+/**
+ * famfs_fuse_get_daxdev() - Retrieve info for a DAX device from fuse server
+ *
+ * Send a GET_DAXDEV message to the fuse server to retrieve info on a
+ * dax device.
+ *
+ * @fm:     fuse_mount
+ * @index:  the index of the dax device; daxdevs are referred to by index
+ *          in fmaps, and the server resolves the index to a particular daxdev
+ *
+ * Returns: 0=success
+ *          -errno=failure
+ */
+static int
+famfs_fuse_get_daxdev(struct fuse_mount *fm, const u64 index)
+{
+	struct fuse_daxdev_out daxdev_out = { 0 };
+	struct fuse_conn *fc = fm->fc;
+	struct famfs_daxdev *daxdev;
+	int rc;
+
+	FUSE_ARGS(args);
+
+	/* Store the daxdev in our table */
+	if (index >= fc->dax_devlist->nslots) {
+		pr_err("%s: index(%lld) > nslots(%d)\n",
+		       __func__, index, fc->dax_devlist->nslots);
+		return -EINVAL;
+	}
+
+	args.opcode = FUSE_GET_DAXDEV;
+	args.nodeid = index;
+
+	args.in_numargs = 0;
+
+	args.out_numargs = 1;
+	args.out_args[0].size = sizeof(daxdev_out);
+	args.out_args[0].value = &daxdev_out;
+
+	/* Send GET_DAXDEV command */
+	rc = fuse_simple_request(fm, &args);
+	if (rc) {
+		pr_err("%s: rc=%d from fuse_simple_request()\n",
+		       __func__, rc);
+		/* Error will be that the payload is smaller than FMAP_BUFSIZE,
+		 * which is the max we can handle. Empty payload handled below.
+		 */
+		return rc;
+	}
+
+	scoped_guard(rwsem_write, &fc->famfs_devlist_sem) {
+		daxdev = &fc->dax_devlist->devlist[index];
+
+		/* Abort if daxdev is now valid (races are possible here) */
+		if (daxdev->valid) {
+			pr_debug("%s: daxdev already known\n", __func__);
+			return 0;
+		}
+
+		/* Verify dev is valid and can be opened and gets the devno */
+		rc = famfs_verify_daxdev(daxdev_out.name, &daxdev->devno);
+		if (rc) {
+			pr_err("%s: rc=%d from famfs_verify_daxdev()\n",
+			       __func__, rc);
+			return rc;
+		}
+
+		daxdev->name = kstrdup(daxdev_out.name, GFP_KERNEL);
+		if (!daxdev->name)
+			return -ENOMEM;
+
+		/* This will fail if it's not a dax device */
+		daxdev->devp = dax_dev_get(daxdev->devno);
+		if (!daxdev->devp) {
+			pr_warn("%s: device %s not found or not dax\n",
+				__func__, daxdev_out.name);
+			kfree(daxdev->name);
+			daxdev->name = NULL;
+			return -ENODEV;
+		}
+
+		wmb(); /* All other fields must be visible before valid */
+		daxdev->valid = 1;
+	}
+
+	return 0;
+}
+
+/**
+ * famfs_update_daxdev_table() - Update the daxdev table
+ * @fm:   fuse_mount
+ * @meta: famfs_file_meta, in-memory format, built from a GET_FMAP response
+ *
+ * This function is called for each new file fmap, to verify whether all
+ * referenced daxdevs are already known (i.e. in the table). Any daxdev
+ * indices referenced in @meta but not in the table will be retrieved via
+ * famfs_fuse_get_daxdev() and added to the table
+ *
+ * Return: 0=success
+ *         -errno=failure
+ */
+static int
+famfs_update_daxdev_table(
+	struct fuse_mount *fm,
+	const struct famfs_file_meta *meta)
+{
+	struct famfs_dax_devlist *local_devlist;
+	struct fuse_conn *fc = fm->fc;
+	int indices_to_fetch[MAX_DAXDEVS];
+	int n_to_fetch = 0;
+	int err;
+
+	/* First time through we will need to allocate the dax_devlist */
+	if (!fc->dax_devlist) {
+		local_devlist = kcalloc(1, sizeof(*fc->dax_devlist), GFP_KERNEL);
+		if (!local_devlist)
+			return -ENOMEM;
+
+		local_devlist->nslots = MAX_DAXDEVS;
+
+		local_devlist->devlist = kcalloc(MAX_DAXDEVS,
+						 sizeof(struct famfs_daxdev),
+						 GFP_KERNEL);
+		if (!local_devlist->devlist) {
+			kfree(local_devlist);
+			return -ENOMEM;
+		}
+
+		/* We don't need famfs_devlist_sem here because we use cmpxchg */
+		if (cmpxchg(&fc->dax_devlist, NULL, local_devlist) != NULL) {
+			kfree(local_devlist->devlist);
+			kfree(local_devlist); /* another thread beat us to it */
+		}
+	}
+
+	/* Collect indices that need fetching while holding read lock */
+	scoped_guard(rwsem_read, &fc->famfs_devlist_sem) {
+		unsigned long i;
+
+		for_each_set_bit(i, (unsigned long *)&meta->dev_bitmap, MAX_DAXDEVS) {
+			if (!(fc->dax_devlist->devlist[i].valid))
+				indices_to_fetch[n_to_fetch++] = i;
+		}
+	}
+
+	/* Fetch needed daxdevs outside the read lock */
+	for (int j = 0; j < n_to_fetch; j++) {
+		err = famfs_fuse_get_daxdev(fm, indices_to_fetch[j]);
+		if (err)
+			pr_err("%s: failed to get daxdev=%d\n",
+			       __func__, indices_to_fetch[j]);
+	}
+
+	return 0;
+}
 
 /***************************************************************************/
 
@@ -184,7 +406,7 @@ famfs_fuse_meta_alloc(
 			/* ie_in = one interleaved extent in fmap_buf */
 			ie_in = fmap_buf + next_offset;
 
-			/* Move past one interleaved extent header in fmap_buf */
+			/* Move past 1 interleaved extent header in fmap_buf */
 			next_offset += sizeof(*ie_in);
 			if (next_offset > fmap_buf_size) {
 				pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
@@ -329,6 +551,9 @@ famfs_file_init_dax(
 	if (rc)
 		goto errout;
 
+	/* Make sure this fmap doesn't reference any unknown daxdevs */
+	famfs_update_daxdev_table(fm, meta);
+
 	/* Publish the famfs metadata on fi->famfs_meta */
 	inode_lock(inode);
 
diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h
index 18ab22bcc5a1..eb9f70b5cb81 100644
--- a/fs/fuse/famfs_kfmap.h
+++ b/fs/fuse/famfs_kfmap.h
@@ -64,4 +64,30 @@ struct famfs_file_meta {
 	};
 };
 
+/*
+ * famfs_daxdev - tracking struct for a daxdev within a famfs file system
+ *
+ * This is the in-memory daxdev metadata that is populated by parsing
+ * the responses to GET_FMAP messages
+ */
+struct famfs_daxdev {
+	/* Include dev uuid? */
+	bool valid;
+	bool error;
+	dev_t devno;
+	struct dax_device *devp;
+	char *name;
+};
+
+#define MAX_DAXDEVS 24
+
+/*
+ * famfs_dax_devlist - list of famfs_daxdev's
+ */
+struct famfs_dax_devlist {
+	int nslots;
+	int ndevs;
+	struct famfs_daxdev *devlist;
+};
+
 #endif /* FAMFS_KFMAP_H */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index df4e9c9f80bf..8170266cbb02 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1006,6 +1006,11 @@ struct fuse_conn {
 		/* Request timeout (in jiffies). 0 = no timeout */
 		unsigned int req_timeout;
 	} timeout;
+
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+	struct rw_semaphore famfs_devlist_sem;
+	struct famfs_dax_devlist *dax_devlist;
+#endif
 };
 
 /*
@@ -1647,6 +1652,8 @@ int famfs_file_init_dax(struct fuse_mount *fm,
 			size_t fmap_size);
 void __famfs_meta_free(void *map);
 
+void famfs_teardown(struct fuse_conn *fc);
+
 /* Set fi->famfs_meta = NULL regardless of prior value */
 static inline void famfs_meta_init(struct fuse_inode *fi)
 {
@@ -1668,6 +1675,11 @@ static inline void famfs_meta_free(struct fuse_inode *fi)
 	}
 }
 
+static inline void famfs_init_devlist_sem(struct fuse_conn *fc)
+{
+	init_rwsem(&fc->famfs_devlist_sem);
+}
+
 static inline int fuse_file_famfs(struct fuse_inode *fi)
 {
 	return (READ_ONCE(fi->famfs_meta) != NULL);
@@ -1677,6 +1689,9 @@ int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode);
 
 #else /* !CONFIG_FUSE_FAMFS_DAX */
 
+static inline void famfs_teardown(struct fuse_conn *fc)
+{
+}
 static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
 						  void *meta)
 {
@@ -1687,6 +1702,10 @@ static inline void famfs_meta_free(struct fuse_inode *fi)
 {
 }
 
+static inline void famfs_init_devlist_sem(struct fuse_conn *fc)
+{
+}
+
 static inline int fuse_file_famfs(struct fuse_inode *fi)
 {
 	return 0;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 5e692fc84297..40e7ea5b6437 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1048,6 +1048,9 @@ void fuse_conn_put(struct fuse_conn *fc)
 		WARN_ON(atomic_read(&bucket->count) != 1);
 		kfree(bucket);
 	}
+	if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+		famfs_teardown(fc);
+
 	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
 		fuse_backing_files_free(fc);
 	call_rcu(&fc->rcu, delayed_release);
@@ -1477,8 +1480,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 				u64 in_flags = FIELD_PREP(GENMASK_ULL(63, 32), ia->in.flags2)
 						| ia->in.flags;
 
-				if (in_flags & FUSE_DAX_FMAP)
+				if (in_flags & FUSE_DAX_FMAP) {
+					famfs_init_devlist_sem(fc);
 					fc->famfs_iomap = 1;
+				}
 			}
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
diff --git a/fs/namei.c b/fs/namei.c
index 9e5500dad14f..38e6e4be089d 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -4212,6 +4212,7 @@ bool may_open_dev(const struct path *path)
 	return !(path->mnt->mnt_flags & MNT_NODEV) &&
 		!(path->mnt->mnt_sb->s_iflags & SB_I_NODEV);
 }
+EXPORT_SYMBOL(may_open_dev);
 
 static int may_open(struct mnt_idmap *idmap, const struct path *path,
 		    int acc_mode, int flag)
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index cf678bebbfe0..1b82895108be 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -247,6 +247,9 @@
  *    - struct fuse_famfs_simple_ext
  *    - struct fuse_famfs_iext
  *    - struct fuse_famfs_fmap_header
+ *  - Add the following structs for the GET_DAXDEV message and reply
+ *    - struct fuse_get_daxdev_in
+ *    - struct fuse_get_daxdev_out
  *  - Add the following enumerated types
  *    - enum fuse_famfs_file_type
  *    - enum famfs_ext_type
@@ -678,6 +681,7 @@ enum fuse_opcode {
 
 	/* Famfs / devdax opcodes */
 	FUSE_GET_FMAP           = 54,
+	FUSE_GET_DAXDEV         = 55,
 
 	/* CUSE specific operations */
 	CUSE_INIT		= 4096,
@@ -1369,6 +1373,22 @@ struct fuse_famfs_fmap_header {
 	uint64_t reserved1;
 };
 
+struct fuse_get_daxdev_in {
+	uint32_t        daxdev_num;
+};
+
+#define DAXDEV_NAME_MAX 256
+
+/* fuse_daxdev_out has enough space for a uuid if we need it */
+struct fuse_daxdev_out {
+	uint16_t index;
+	uint16_t reserved;
+	uint32_t reserved2;
+	uint64_t reserved3;
+	uint64_t reserved4;
+	char name[DAXDEV_NAME_MAX];
+};
+
 static inline int32_t fmap_msg_min_size(void)
 {
 	/* Smallest fmap message is a header plus one simple extent */
-- 
2.53.0



^ permalink raw reply related

* [PATCH V10 04/10] famfs_fuse: Create files with famfs fmaps
From: John Groves @ 2026-03-31 12:38 UTC (permalink / raw)
  To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
	Alison Schofield
  Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
	Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
	David Hildenbrand, Christian Brauner, Darrick J . Wong,
	Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
	Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
	Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
	Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
	Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
	linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	John Groves
In-Reply-To: <0100019d43e5f632-f5862a3e-361c-4b54-a9a6-96c242a8f17a-000000@email.amazonses.com>

From: John Groves <john@groves.net>

On completion of GET_FMAP message/response, setup the full famfs
metadata such that it's possible to handle read/write/mmap directly to
dax. Note that the devdax_iomap plumbing is not in yet...

* Add famfs_kfmap.h: in-memory structures for resolving famfs file maps
  (fmaps) to dax.
* famfs.c: allocate, initialize and free fmaps
* inode.c: only allow famfs mode if the fuse server has CAP_SYS_RAWIO
* Update MAINTAINERS for the new file.

Signed-off-by: John Groves <john@groves.net>
---
 MAINTAINERS               |   1 +
 fs/fuse/famfs.c           | 339 +++++++++++++++++++++++++++++++++++++-
 fs/fuse/famfs_kfmap.h     |  67 ++++++++
 fs/fuse/fuse_i.h          |   8 +-
 fs/fuse/inode.c           |  20 ++-
 include/uapi/linux/fuse.h |  56 +++++++
 6 files changed, 481 insertions(+), 10 deletions(-)
 create mode 100644 fs/fuse/famfs_kfmap.h

diff --git a/MAINTAINERS b/MAINTAINERS
index a789394552a2..4edb56afb947 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10530,6 +10530,7 @@ L:	linux-cxl@vger.kernel.org
 L:	linux-fsdevel@vger.kernel.org
 S:	Supported
 F:	fs/fuse/famfs.c
+F:	fs/fuse/famfs_kfmap.h
 
 FUTEX SUBSYSTEM
 M:	Thomas Gleixner <tglx@kernel.org>
diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
index d238d853afa8..ac52e54e2cb5 100644
--- a/fs/fuse/famfs.c
+++ b/fs/fuse/famfs.c
@@ -18,9 +18,339 @@
 #include <linux/namei.h>
 #include <linux/string.h>
 
+#include "famfs_kfmap.h"
 #include "fuse_i.h"
 
 
+/***************************************************************************/
+
+void __famfs_meta_free(void *famfs_meta)
+{
+	struct famfs_file_meta *fmap = famfs_meta;
+
+	if (!fmap)
+		return;
+
+	switch (fmap->fm_extent_type) {
+	case SIMPLE_DAX_EXTENT:
+		kfree(fmap->se);
+		break;
+	case INTERLEAVED_EXTENT:
+		if (fmap->ie) {
+			for (int i = 0; i < fmap->fm_niext; i++)
+				kfree(fmap->ie[i].ie_strips);
+		}
+		kfree(fmap->ie);
+		break;
+	default:
+		pr_err("%s: invalid fmap type\n", __func__);
+		break;
+	}
+
+	kfree(fmap);
+}
+DEFINE_FREE(__famfs_meta_free, void *, if (_T) __famfs_meta_free(_T))
+
+static int
+famfs_check_ext_alignment(struct famfs_meta_simple_ext *se)
+{
+	int errs = 0;
+
+	if (se->dev_index != 0)
+		errs++;
+
+	/* TODO: pass in alignment so we can support the other page sizes */
+	if (!IS_ALIGNED(se->ext_offset, PMD_SIZE))
+		errs++;
+
+	if (!IS_ALIGNED(se->ext_len, PMD_SIZE))
+		errs++;
+
+	return errs;
+}
+
+/**
+ * famfs_fuse_meta_alloc() - Allocate famfs file metadata
+ * @fmap_buf:  fmap buffer from fuse server
+ * @fmap_buf_size: size of fmap buffer
+ * @metap:         pointer where 'struct famfs_file_meta' is returned
+ *
+ * Returns: 0=success
+ *          -errno=failure
+ */
+static int
+famfs_fuse_meta_alloc(
+	void *fmap_buf,
+	size_t fmap_buf_size,
+	struct famfs_file_meta **metap)
+{
+	struct fuse_famfs_fmap_header *fmh;
+	size_t extent_total = 0;
+	size_t next_offset = 0;
+	int errs = 0;
+	int i, j;
+
+	fmh = fmap_buf;
+
+	/* Move past fmh in fmap_buf */
+	next_offset += sizeof(*fmh);
+	if (next_offset > fmap_buf_size) {
+		pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+		       __func__, __LINE__, next_offset, fmap_buf_size);
+		return -EINVAL;
+	}
+
+	if (fmh->nextents < 1) {
+		pr_err("%s: nextents %d < 1\n", __func__, fmh->nextents);
+		return -ERANGE;
+	}
+
+	if (fmh->nextents > FUSE_FAMFS_MAX_EXTENTS) {
+		pr_err("%s: nextents %d > max (%d) 1\n",
+		       __func__, fmh->nextents, FUSE_FAMFS_MAX_EXTENTS);
+		return -ERANGE;
+	}
+
+	struct famfs_file_meta *meta __free(__famfs_meta_free) = kzalloc(sizeof(*meta), GFP_KERNEL);
+
+	if (!meta)
+		return -ENOMEM;
+
+	meta->error = false;
+	meta->file_type = fmh->file_type;
+	meta->file_size = fmh->file_size;
+	meta->fm_extent_type = fmh->ext_type;
+
+	switch (fmh->ext_type) {
+	case FUSE_FAMFS_EXT_SIMPLE: {
+		struct fuse_famfs_simple_ext *se_in;
+
+		se_in = fmap_buf + next_offset;
+
+		/* Move past simple extents */
+		next_offset += fmh->nextents * sizeof(*se_in);
+		if (next_offset > fmap_buf_size) {
+			pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+			       __func__, __LINE__, next_offset, fmap_buf_size);
+			return -EINVAL;
+		}
+
+		meta->fm_nextents = fmh->nextents;
+
+		meta->se = kcalloc(meta->fm_nextents, sizeof(*(meta->se)),
+				   GFP_KERNEL);
+		if (!meta->se)
+			return -ENOMEM;
+
+		if ((meta->fm_nextents > FUSE_FAMFS_MAX_EXTENTS) ||
+		    (meta->fm_nextents < 1))
+			return -EINVAL;
+
+		for (i = 0; i < fmh->nextents; i++) {
+			meta->se[i].dev_index  = se_in[i].se_devindex;
+			meta->se[i].ext_offset = se_in[i].se_offset;
+			meta->se[i].ext_len    = se_in[i].se_len;
+
+			/* Record bitmap of referenced daxdev indices */
+			meta->dev_bitmap |= (1 << meta->se[i].dev_index);
+
+			errs += famfs_check_ext_alignment(&meta->se[i]);
+
+			extent_total += meta->se[i].ext_len;
+		}
+		break;
+	}
+
+	case FUSE_FAMFS_EXT_INTERLEAVE: {
+		s64 size_remainder = meta->file_size;
+		struct fuse_famfs_iext *ie_in;
+		int niext = fmh->nextents;
+
+		meta->fm_niext = niext;
+
+		/* Allocate interleaved extent */
+		meta->ie = kcalloc(niext, sizeof(*(meta->ie)), GFP_KERNEL);
+		if (!meta->ie)
+			return -ENOMEM;
+
+		/*
+		 * Each interleaved extent has a simple extent list of strips.
+		 * Outer loop is over separate interleaved extents
+		 */
+		for (i = 0; i < niext; i++) {
+			u64 nstrips;
+			struct fuse_famfs_simple_ext *sie_in;
+
+			/* ie_in = one interleaved extent in fmap_buf */
+			ie_in = fmap_buf + next_offset;
+
+			/* Move past one interleaved extent header in fmap_buf */
+			next_offset += sizeof(*ie_in);
+			if (next_offset > fmap_buf_size) {
+				pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+				       __func__, __LINE__, next_offset,
+				       fmap_buf_size);
+				return -EINVAL;
+			}
+
+			if (!IS_ALIGNED(ie_in->ie_chunk_size, PMD_SIZE)) {
+				pr_err("%s: chunk_size %lld not PMD-aligned\n",
+				       __func__, meta->ie[i].fie_chunk_size);
+				return -EINVAL;
+			}
+
+			if (ie_in->ie_nbytes == 0) {
+				pr_err("%s: zero-length interleave!\n",
+				       __func__);
+				return -EINVAL;
+			}
+
+			nstrips = ie_in->ie_nstrips;
+			meta->ie[i].fie_chunk_size = ie_in->ie_chunk_size;
+			meta->ie[i].fie_nstrips    = ie_in->ie_nstrips;
+			meta->ie[i].fie_nbytes     = ie_in->ie_nbytes;
+
+			/* sie_in = the strip extents in fmap_buf */
+			sie_in = fmap_buf + next_offset;
+
+			/* Move past strip extents in fmap_buf */
+			next_offset += nstrips * sizeof(*sie_in);
+			if (next_offset > fmap_buf_size) {
+				pr_err("%s:%d: fmap_buf underflow offset/size %ld/%ld\n",
+				       __func__, __LINE__, next_offset,
+				       fmap_buf_size);
+				return -EINVAL;
+			}
+
+			if ((nstrips > FUSE_FAMFS_MAX_STRIPS) || (nstrips < 1)) {
+				pr_err("%s: invalid nstrips=%lld (max=%d)\n",
+				       __func__, nstrips,
+				       FUSE_FAMFS_MAX_STRIPS);
+				errs++;
+			}
+
+			/* Allocate strip extent array */
+			meta->ie[i].ie_strips =
+				kcalloc(ie_in->ie_nstrips,
+					sizeof(meta->ie[i].ie_strips[0]),
+					GFP_KERNEL);
+			if (!meta->ie[i].ie_strips)
+				return -ENOMEM;
+
+			/* Inner loop is over strips */
+			for (j = 0; j < nstrips; j++) {
+				struct famfs_meta_simple_ext *strips_out;
+				u64 devindex = sie_in[j].se_devindex;
+				u64 offset   = sie_in[j].se_offset;
+				u64 len      = sie_in[j].se_len;
+
+				strips_out = meta->ie[i].ie_strips;
+				strips_out[j].dev_index  = devindex;
+				strips_out[j].ext_offset = offset;
+				strips_out[j].ext_len    = len;
+
+				/* Record bitmap of referenced daxdev indices */
+				meta->dev_bitmap |= (1 << devindex);
+
+				extent_total += len;
+				errs += famfs_check_ext_alignment(&strips_out[j]);
+				size_remainder -= len;
+			}
+		}
+
+		if (size_remainder > 0) {
+			/* Sum of interleaved extent sizes is less than file size! */
+			pr_err("%s: size_remainder %lld (0x%llx)\n",
+			       __func__, size_remainder, size_remainder);
+			return -EINVAL;
+		}
+		break;
+	}
+
+	default:
+		pr_err("%s: invalid ext_type %d\n", __func__, fmh->ext_type);
+		return -EINVAL;
+	}
+
+	if (errs > 0) {
+		pr_err("%s: %d alignment errors found\n", __func__, errs);
+		return -EINVAL;
+	}
+
+	/* More sanity checks */
+	if (extent_total < meta->file_size) {
+		pr_err("%s: file size %ld larger than map size %ld\n",
+		       __func__, meta->file_size, extent_total);
+		return -EINVAL;
+	}
+
+	if (cmpxchg(metap, NULL, meta) != NULL) {
+		pr_debug("%s: fmap race detected\n", __func__);
+		return 0; /* fmap already installed */
+	}
+	retain_and_null_ptr(meta);
+
+	return 0;
+}
+
+/**
+ * famfs_file_init_dax() - init famfs dax file metadata
+ *
+ * @fm:        fuse_mount
+ * @inode:     the inode
+ * @fmap_buf:  fmap response message
+ * @fmap_size: Size of the fmap message
+ *
+ * Initialize famfs metadata for a file, based on the contents of the GET_FMAP
+ * response
+ *
+ * Return: 0=success
+ *          -errno=failure
+ */
+int
+famfs_file_init_dax(
+	struct fuse_mount *fm,
+	struct inode *inode,
+	void *fmap_buf,
+	size_t fmap_size)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct famfs_file_meta *meta = NULL;
+	int rc;
+
+	if (fi->famfs_meta) {
+		pr_notice("%s: i_no=%ld fmap_size=%ld ALREADY INITIALIZED\n",
+			  __func__,
+			  inode->i_ino, fmap_size);
+		return 0;
+	}
+
+	rc = famfs_fuse_meta_alloc(fmap_buf, fmap_size, &meta);
+	if (rc)
+		goto errout;
+
+	/* Publish the famfs metadata on fi->famfs_meta */
+	inode_lock(inode);
+
+	if (famfs_meta_set(fi, meta) == NULL) {
+		i_size_write(inode, meta->file_size);
+		inode->i_flags |= S_DAX;
+	} else {
+		pr_debug("%s: file already had metadata\n", __func__);
+		__famfs_meta_free(meta);
+		/* rc is 0 - the file is valid */
+	}
+
+	inode_unlock(inode);
+	return 0;
+
+errout:
+	if (rc)
+		__famfs_meta_free(meta);
+
+	return rc;
+}
+
 #define FMAP_BUFSIZE PAGE_SIZE
 
 int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode)
@@ -63,11 +393,8 @@ int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode)
 	}
 	fmap_size = rc;
 
-	/* We retrieved the "fmap" (the file's map to memory), but
-	 * we haven't used it yet. A call to famfs_file_init_dax() will be added
-	 * here in a subsequent patch, when we add the ability to attach
-	 * fmaps to files.
-	 */
+	/* Convert fmap into in-memory format and hang from inode */
+	rc = famfs_file_init_dax(fm, inode, fmap_buf, fmap_size);
 
-	return 0;
+	return rc;
 }
diff --git a/fs/fuse/famfs_kfmap.h b/fs/fuse/famfs_kfmap.h
new file mode 100644
index 000000000000..18ab22bcc5a1
--- /dev/null
+++ b/fs/fuse/famfs_kfmap.h
@@ -0,0 +1,67 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * famfs - dax file system for shared fabric-attached memory
+ *
+ * Copyright 2023-2026 Micron Technology, Inc.
+ */
+#ifndef FAMFS_KFMAP_H
+#define FAMFS_KFMAP_H
+
+/*
+ * The structures below are the in-memory metadata format for famfs files.
+ * Metadata retrieved via the GET_FMAP response is converted to this format
+ * for use in resolving file mapping faults.
+ *
+ * The GET_FMAP response contains the same information, but in a more
+ * message-and-versioning-friendly format. Those structs can be found in the
+ * famfs section of include/uapi/linux/fuse.h (aka fuse_kernel.h in libfuse)
+ */
+
+enum famfs_file_type {
+	FAMFS_REG,
+	FAMFS_SUPERBLOCK,
+	FAMFS_LOG,
+};
+
+/* We anticipate the possibility of supporting additional types of extents */
+enum famfs_extent_type {
+	SIMPLE_DAX_EXTENT,
+	INTERLEAVED_EXTENT,
+	INVALID_EXTENT_TYPE,
+};
+
+struct famfs_meta_simple_ext {
+	u64 dev_index;
+	u64 ext_offset;
+	u64 ext_len;
+};
+
+struct famfs_meta_interleaved_ext {
+	u64 fie_nstrips;
+	u64 fie_chunk_size;
+	u64 fie_nbytes;
+	struct famfs_meta_simple_ext *ie_strips;
+};
+
+/*
+ * Each famfs dax file has this hanging from its fuse_inode->famfs_meta
+ */
+struct famfs_file_meta {
+	bool                   error;
+	enum famfs_file_type   file_type;
+	size_t                 file_size;
+	enum famfs_extent_type fm_extent_type;
+	u64 dev_bitmap; /* bitmap of referenced daxdevs by index */
+	union {
+		struct {
+			size_t         fm_nextents;
+			struct famfs_meta_simple_ext  *se;
+		};
+		struct {
+			size_t         fm_niext;
+			struct famfs_meta_interleaved_ext *ie;
+		};
+	};
+};
+
+#endif /* FAMFS_KFMAP_H */
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index b5466743c13f..df4e9c9f80bf 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1642,6 +1642,9 @@ extern void fuse_sysctl_unregister(void);
 /* famfs.c */
 
 #if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+int famfs_file_init_dax(struct fuse_mount *fm,
+			struct inode *inode, void *fmap_buf,
+			size_t fmap_size);
 void __famfs_meta_free(void *map);
 
 /* Set fi->famfs_meta = NULL regardless of prior value */
@@ -1659,7 +1662,10 @@ static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
 
 static inline void famfs_meta_free(struct fuse_inode *fi)
 {
-	famfs_meta_set(fi, NULL);
+	if (fi->famfs_meta != NULL) {
+		__famfs_meta_free(fi->famfs_meta);
+		famfs_meta_set(fi, NULL);
+	}
 }
 
 static inline int fuse_file_famfs(struct fuse_inode *fi)
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 862f4e61a5fb..5e692fc84297 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -10,6 +10,7 @@
 #include "fuse_dev_i.h"
 #include "dev_uring_i.h"
 
+#include <linux/bitfield.h>
 #include <linux/dax.h>
 #include <linux/pagemap.h>
 #include <linux/slab.h>
@@ -1464,8 +1465,21 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 				timeout = arg->request_timeout;
 
 			if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
-			    flags & FUSE_DAX_FMAP)
-				fc->famfs_iomap = 1;
+			    flags & FUSE_DAX_FMAP) {
+				/* famfs_iomap is only allowed if the fuse
+				 * server has CAP_SYS_RAWIO. This was checked
+				 * in fuse_send_init, and FUSE_DAX_IOMAP was
+				 * set in in_flags if so. Only allow enablement
+				 * if we find it there. This function is
+				 * normally not running in fuse server context,
+				 * so we can't do the capability check here...
+				 */
+				u64 in_flags = FIELD_PREP(GENMASK_ULL(63, 32), ia->in.flags2)
+						| ia->in.flags;
+
+				if (in_flags & FUSE_DAX_FMAP)
+					fc->famfs_iomap = 1;
+			}
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
@@ -1527,7 +1541,7 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm)
 		flags |= FUSE_SUBMOUNTS;
 	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
 		flags |= FUSE_PASSTHROUGH;
-	if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+	if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) && capable(CAP_SYS_RAWIO))
 		flags |= FUSE_DAX_FMAP;
 
 	/*
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 9eff9083d3b5..cf678bebbfe0 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -243,6 +243,13 @@
  *
  *  7.46
  *  - Add FUSE_DAX_FMAP capability - ability to handle in-kernel fsdax maps
+ *  - Add the following structures for the GET_FMAP message reply components:
+ *    - struct fuse_famfs_simple_ext
+ *    - struct fuse_famfs_iext
+ *    - struct fuse_famfs_fmap_header
+ *  - Add the following enumerated types
+ *    - enum fuse_famfs_file_type
+ *    - enum famfs_ext_type
  */
 
 #ifndef _LINUX_FUSE_H
@@ -1318,6 +1325,55 @@ struct fuse_uring_cmd_req {
 
 /* Famfs fmap message components */
 
+#define FAMFS_FMAP_VERSION 1
+
 #define FAMFS_FMAP_MAX 32768 /* Largest supported fmap message */
+#define FUSE_FAMFS_MAX_EXTENTS 32
+#define FUSE_FAMFS_MAX_STRIPS 32
+
+enum fuse_famfs_file_type {
+	FUSE_FAMFS_FILE_REG,
+	FUSE_FAMFS_FILE_SUPERBLOCK,
+	FUSE_FAMFS_FILE_LOG,
+};
+
+enum famfs_ext_type {
+	FUSE_FAMFS_EXT_SIMPLE = 0,
+	FUSE_FAMFS_EXT_INTERLEAVE = 1,
+};
+
+struct fuse_famfs_simple_ext {
+	uint32_t se_devindex;
+	uint32_t reserved;
+	uint64_t se_offset;
+	uint64_t se_len;
+};
+
+struct fuse_famfs_iext { /* Interleaved extent */
+	uint32_t ie_nstrips;
+	uint32_t ie_chunk_size;
+	uint64_t ie_nbytes; /* Total bytes for this interleaved_ext;
+			     * sum of strips may be more
+			     */
+	uint64_t reserved;
+};
+
+struct fuse_famfs_fmap_header {
+	uint8_t file_type; /* enum famfs_file_type */
+	uint8_t reserved;
+	uint16_t fmap_version;
+	uint32_t ext_type; /* enum famfs_log_ext_type */
+	uint32_t nextents;
+	uint32_t reserved0;
+	uint64_t file_size;
+	uint64_t reserved1;
+};
+
+static inline int32_t fmap_msg_min_size(void)
+{
+	/* Smallest fmap message is a header plus one simple extent */
+	return (sizeof(struct fuse_famfs_fmap_header)
+		+ sizeof(struct fuse_famfs_simple_ext));
+}
 
 #endif /* _LINUX_FUSE_H */
-- 
2.53.0



^ permalink raw reply related

* [PATCH V10 03/10] famfs_fuse: Plumb the GET_FMAP message/response
From: John Groves @ 2026-03-31 12:38 UTC (permalink / raw)
  To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
	Alison Schofield
  Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
	Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
	David Hildenbrand, Christian Brauner, Darrick J . Wong,
	Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
	Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
	Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
	Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
	Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
	linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	John Groves
In-Reply-To: <0100019d43e5f632-f5862a3e-361c-4b54-a9a6-96c242a8f17a-000000@email.amazonses.com>

From: John Groves <john@groves.net>

Upon completion of an OPEN, if we're in famfs-mode we do a GET_FMAP to
retrieve and cache up the file-to-dax map in the kernel. If this
succeeds, read/write/mmap are resolved direct-to-dax with no upcalls.

Signed-off-by: John Groves <john@groves.net>
---
 MAINTAINERS               |  8 +++++
 fs/fuse/Makefile          |  1 +
 fs/fuse/famfs.c           | 73 +++++++++++++++++++++++++++++++++++++++
 fs/fuse/file.c            | 14 +++++++-
 fs/fuse/fuse_i.h          | 70 ++++++++++++++++++++++++++++++++++---
 fs/fuse/inode.c           |  8 ++++-
 fs/fuse/iomode.c          |  2 +-
 include/uapi/linux/fuse.h |  7 ++++
 8 files changed, 175 insertions(+), 8 deletions(-)
 create mode 100644 fs/fuse/famfs.c

diff --git a/MAINTAINERS b/MAINTAINERS
index ac49067c64ee..a789394552a2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10523,6 +10523,14 @@ F:	fs/fuse/
 F:	include/uapi/linux/fuse.h
 F:	tools/testing/selftests/filesystems/fuse/
 
+FUSE [FAMFS Fabric-Attached Memory File System]
+M:	John Groves <jgroves@micron.com>
+M:	John Groves <John@Groves.net>
+L:	linux-cxl@vger.kernel.org
+L:	linux-fsdevel@vger.kernel.org
+S:	Supported
+F:	fs/fuse/famfs.c
+
 FUTEX SUBSYSTEM
 M:	Thomas Gleixner <tglx@kernel.org>
 M:	Ingo Molnar <mingo@redhat.com>
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
index 22ad9538dfc4..3f8dcc8cbbd0 100644
--- a/fs/fuse/Makefile
+++ b/fs/fuse/Makefile
@@ -17,5 +17,6 @@ fuse-$(CONFIG_FUSE_DAX) += dax.o
 fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o
 fuse-$(CONFIG_SYSCTL) += sysctl.o
 fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o
+fuse-$(CONFIG_FUSE_FAMFS_DAX) += famfs.o
 
 virtiofs-y := virtio_fs.o
diff --git a/fs/fuse/famfs.c b/fs/fuse/famfs.c
new file mode 100644
index 000000000000..d238d853afa8
--- /dev/null
+++ b/fs/fuse/famfs.c
@@ -0,0 +1,73 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * famfs - dax file system for shared fabric-attached memory
+ *
+ * Copyright 2023-2026 Micron Technology, Inc.
+ *
+ * This file system, originally based on ramfs the dax support from xfs,
+ * is intended to allow multiple host systems to mount a common file system
+ * view of dax files that map to shared memory.
+ */
+
+#include <linux/cleanup.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/dax.h>
+#include <linux/iomap.h>
+#include <linux/path.h>
+#include <linux/namei.h>
+#include <linux/string.h>
+
+#include "fuse_i.h"
+
+
+#define FMAP_BUFSIZE PAGE_SIZE
+
+int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	size_t fmap_bufsize = FMAP_BUFSIZE;
+	u64 nodeid = get_node_id(inode);
+	ssize_t fmap_size;
+	int rc;
+
+	FUSE_ARGS(args);
+
+	/* Don't retrieve if we already have the famfs metadata */
+	if (fi->famfs_meta)
+		return 0;
+
+	void *fmap_buf __free(kfree) = kzalloc(FMAP_BUFSIZE, GFP_KERNEL);
+
+	if (!fmap_buf)
+		return -ENOMEM;
+
+	args.opcode = FUSE_GET_FMAP;
+	args.nodeid = nodeid;
+
+	/* Variable-sized output buffer
+	 * this causes fuse_simple_request() to return the size of the
+	 * output payload
+	 */
+	args.out_argvar = true;
+	args.out_numargs = 1;
+	args.out_args[0].size = fmap_bufsize;
+	args.out_args[0].value = fmap_buf;
+
+	/* Send GET_FMAP command */
+	rc = fuse_simple_request(fm, &args);
+	if (rc < 0) {
+		pr_err("%s: err=%d from fuse_simple_request()\n",
+		       __func__, rc);
+		return rc;
+	}
+	fmap_size = rc;
+
+	/* We retrieved the "fmap" (the file's map to memory), but
+	 * we haven't used it yet. A call to famfs_file_init_dax() will be added
+	 * here in a subsequent patch, when we add the ability to attach
+	 * fmaps to files.
+	 */
+
+	return 0;
+}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 150f2e1d6c2f..605f1c6cc10e 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -277,6 +277,16 @@ static int fuse_open(struct inode *inode, struct file *file)
 	err = fuse_do_open(fm, get_node_id(inode), file, false);
 	if (!err) {
 		ff = file->private_data;
+
+		if ((fm->fc->famfs_iomap) && (S_ISREG(inode->i_mode))) {
+			/* Get the famfs fmap - failure is fatal */
+			err = fuse_get_fmap(fm, inode);
+			if (err) {
+				fuse_sync_release(fi, ff, file->f_flags);
+				goto out_nowrite;
+			}
+		}
+
 		err = fuse_finish_open(inode, file);
 		if (err)
 			fuse_sync_release(fi, ff, file->f_flags);
@@ -284,12 +294,14 @@ static int fuse_open(struct inode *inode, struct file *file)
 			fuse_truncate_update_attr(inode, file);
 	}
 
+out_nowrite:
 	if (is_wb_truncate || dax_truncate)
 		fuse_release_nowrite(inode);
 	if (!err) {
 		if (is_truncate)
 			truncate_pagecache(inode, 0);
-		else if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+		else if (!(ff->open_flags & FOPEN_KEEP_CACHE) &&
+			 !fuse_file_famfs(fi))
 			invalidate_inode_pages2(inode->i_mapping);
 	}
 	if (dax_truncate)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 712038a554d9..b5466743c13f 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -223,6 +223,14 @@ struct fuse_inode {
 	 * so preserve the blocksize specified by the server.
 	 */
 	u8 cached_i_blkbits;
+
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+	/* Pointer to the file's famfs metadata. Primary content is the
+	 * in-memory version of the fmap - the map from file's offset range
+	 * to DAX memory
+	 */
+	void *famfs_meta;
+#endif
 };
 
 /** FUSE inode state bits */
@@ -1511,11 +1519,8 @@ void fuse_free_conn(struct fuse_conn *fc);
 
 /* dax.c */
 
-static inline bool fuse_file_famfs(struct fuse_inode *fuse_inode) /* Will be superseded */
-{
-	(void)fuse_inode;
-	return false;
-}
+static inline int fuse_file_famfs(struct fuse_inode *fi); /* forward */
+
 #define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX)	\
 					&& IS_DAX(&(fuse_inode)->inode)  \
 					&& !fuse_file_famfs(fuse_inode))
@@ -1634,4 +1639,59 @@ extern void fuse_sysctl_unregister(void);
 #define fuse_sysctl_unregister()	do { } while (0)
 #endif /* CONFIG_SYSCTL */
 
+/* famfs.c */
+
+#if IS_ENABLED(CONFIG_FUSE_FAMFS_DAX)
+void __famfs_meta_free(void *map);
+
+/* Set fi->famfs_meta = NULL regardless of prior value */
+static inline void famfs_meta_init(struct fuse_inode *fi)
+{
+	fi->famfs_meta = NULL;
+}
+
+/* Set fi->famfs_meta iff the current value is NULL */
+static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
+						  void *meta)
+{
+	return cmpxchg(&fi->famfs_meta, NULL, meta);
+}
+
+static inline void famfs_meta_free(struct fuse_inode *fi)
+{
+	famfs_meta_set(fi, NULL);
+}
+
+static inline int fuse_file_famfs(struct fuse_inode *fi)
+{
+	return (READ_ONCE(fi->famfs_meta) != NULL);
+}
+
+int fuse_get_fmap(struct fuse_mount *fm, struct inode *inode);
+
+#else /* !CONFIG_FUSE_FAMFS_DAX */
+
+static inline struct fuse_backing *famfs_meta_set(struct fuse_inode *fi,
+						  void *meta)
+{
+	return NULL;
+}
+
+static inline void famfs_meta_free(struct fuse_inode *fi)
+{
+}
+
+static inline int fuse_file_famfs(struct fuse_inode *fi)
+{
+	return 0;
+}
+
+static inline int
+fuse_get_fmap(struct fuse_mount *fm, struct inode *inode)
+{
+	return 0;
+}
+
+#endif /* CONFIG_FUSE_FAMFS_DAX */
+
 #endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f4a265734270..862f4e61a5fb 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -120,6 +120,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
 		fuse_inode_backing_set(fi, NULL);
 
+	if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+		famfs_meta_set(fi, NULL);
+
 	return &fi->inode;
 
 out_free_forget:
@@ -141,6 +144,9 @@ static void fuse_free_inode(struct inode *inode)
 	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
 		fuse_backing_put(fuse_inode_backing(fi));
 
+	if (S_ISREG(inode->i_mode) && fuse_file_famfs(fi))
+		famfs_meta_free(fi);
+
 	kmem_cache_free(fuse_inode_cachep, fi);
 }
 
@@ -162,7 +168,7 @@ static void fuse_evict_inode(struct inode *inode)
 	/* Will write inode on close/munmap and in all other dirtiers */
 	WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE);
 
-	if (FUSE_IS_VIRTIO_DAX(fi))
+	if (FUSE_IS_VIRTIO_DAX(fi) || fuse_file_famfs(fi))
 		dax_break_layout_final(inode);
 
 	truncate_inode_pages_final(&inode->i_data);
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index 31ee7f3304c6..948148316ef0 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -203,7 +203,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode)
 	 * io modes are not relevant with DAX and with server that does not
 	 * implement open.
 	 */
-	if (FUSE_IS_VIRTIO_DAX(fi) || !ff->args)
+	if (FUSE_IS_VIRTIO_DAX(fi) || fuse_file_famfs(fi) || !ff->args)
 		return 0;
 
 	/*
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index 25686f088e6a..9eff9083d3b5 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -669,6 +669,9 @@ enum fuse_opcode {
 	FUSE_STATX		= 52,
 	FUSE_COPY_FILE_RANGE_64	= 53,
 
+	/* Famfs / devdax opcodes */
+	FUSE_GET_FMAP           = 54,
+
 	/* CUSE specific operations */
 	CUSE_INIT		= 4096,
 
@@ -1313,4 +1316,8 @@ struct fuse_uring_cmd_req {
 	uint8_t padding[6];
 };
 
+/* Famfs fmap message components */
+
+#define FAMFS_FMAP_MAX 32768 /* Largest supported fmap message */
+
 #endif /* _LINUX_FUSE_H */
-- 
2.53.0



^ permalink raw reply related

* [PATCH V10 02/10] famfs_fuse: Basic fuse kernel ABI enablement for famfs
From: John Groves @ 2026-03-31 12:38 UTC (permalink / raw)
  To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
	Alison Schofield
  Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
	Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
	David Hildenbrand, Christian Brauner, Darrick J . Wong,
	Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
	Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
	Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
	Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
	Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
	linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	John Groves
In-Reply-To: <0100019d43e5f632-f5862a3e-361c-4b54-a9a6-96c242a8f17a-000000@email.amazonses.com>

From: John Groves <john@groves.net>

This patch starts the kernel ABI enablement of famfs in fuse.

- Kconfig: Add FUSE_FAMFS_DAX config parameter, to control
  compilation of famfs within fuse.
- FUSE_DAX_FMAP flag in INIT request/reply
- fuse_conn->famfs_iomap (enable famfs-mapped files) to denote a
  famfs-enabled connection

Reviewed-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: John Groves <john@groves.net>
---
 fs/fuse/Kconfig           | 13 +++++++++++++
 fs/fuse/fuse_i.h          |  3 +++
 fs/fuse/inode.c           |  6 ++++++
 include/uapi/linux/fuse.h |  5 +++++
 4 files changed, 27 insertions(+)

diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
index 3a4ae632c94a..17fe1f490cbd 100644
--- a/fs/fuse/Kconfig
+++ b/fs/fuse/Kconfig
@@ -76,3 +76,16 @@ config FUSE_IO_URING
 
 	  If you want to allow fuse server/client communication through io-uring,
 	  answer Y
+
+config FUSE_FAMFS_DAX
+	bool "FUSE support for fs-dax filesystems backed by devdax"
+	depends on FUSE_FS
+	depends on DEV_DAX_FSDEV
+	default FUSE_FS
+	help
+	  This enables the fabric-attached memory file system (famfs),
+	  which enables formatting devdax memory as a file system. Famfs
+	  is primarily intended for scale-out shared access to
+	  disaggregated memory.
+
+	  To enable famfs or other fuse/fs-dax file systems, answer Y
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 80bf4438c436..712038a554d9 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -921,6 +921,9 @@ struct fuse_conn {
 	/* Is synchronous FUSE_INIT allowed? */
 	unsigned int sync_init:1;
 
+	/* dev_dax_iomap support for famfs */
+	unsigned int famfs_iomap:1;
+
 	/* Use io_uring for communication */
 	unsigned int io_uring;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index f688c31f7eef..f4a265734270 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1456,6 +1456,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args,
 
 			if (flags & FUSE_REQUEST_TIMEOUT)
 				timeout = arg->request_timeout;
+
+			if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX) &&
+			    flags & FUSE_DAX_FMAP)
+				fc->famfs_iomap = 1;
 		} else {
 			ra_pages = fc->max_read / PAGE_SIZE;
 			fc->no_lock = 1;
@@ -1517,6 +1521,8 @@ static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm)
 		flags |= FUSE_SUBMOUNTS;
 	if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH))
 		flags |= FUSE_PASSTHROUGH;
+	if (IS_ENABLED(CONFIG_FUSE_FAMFS_DAX))
+		flags |= FUSE_DAX_FMAP;
 
 	/*
 	 * This is just an information flag for fuse server. No need to check
diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
index c13e1f9a2f12..25686f088e6a 100644
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -240,6 +240,9 @@
  *  - add FUSE_COPY_FILE_RANGE_64
  *  - add struct fuse_copy_file_range_out
  *  - add FUSE_NOTIFY_PRUNE
+ *
+ *  7.46
+ *  - Add FUSE_DAX_FMAP capability - ability to handle in-kernel fsdax maps
  */
 
 #ifndef _LINUX_FUSE_H
@@ -448,6 +451,7 @@ struct fuse_file_lock {
  * FUSE_OVER_IO_URING: Indicate that client supports io-uring
  * FUSE_REQUEST_TIMEOUT: kernel supports timing out requests.
  *			 init_out.request_timeout contains the timeout (in secs)
+ * FUSE_DAX_FMAP: kernel supports dev_dax_iomap (aka famfs) fmaps
  */
 #define FUSE_ASYNC_READ		(1 << 0)
 #define FUSE_POSIX_LOCKS	(1 << 1)
@@ -495,6 +499,7 @@ struct fuse_file_lock {
 #define FUSE_ALLOW_IDMAP	(1ULL << 40)
 #define FUSE_OVER_IO_URING	(1ULL << 41)
 #define FUSE_REQUEST_TIMEOUT	(1ULL << 42)
+#define FUSE_DAX_FMAP		(1ULL << 43)
 
 /**
  * CUSE INIT request/reply flags
-- 
2.53.0



^ permalink raw reply related

* [PATCH V10 01/10] famfs_fuse: Update macro s/FUSE_IS_DAX/FUSE_IS_VIRTIO_DAX/
From: John Groves @ 2026-03-31 12:38 UTC (permalink / raw)
  To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
	Alison Schofield
  Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
	Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
	David Hildenbrand, Christian Brauner, Darrick J . Wong,
	Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
	Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
	Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
	Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
	Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
	linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	John Groves
In-Reply-To: <0100019d43e5f632-f5862a3e-361c-4b54-a9a6-96c242a8f17a-000000@email.amazonses.com>

From: John Groves <john@groves.net>

Virtio_fs now needs to determine if an inode is DAX && not famfs.
This relaces the FUSE_IS_DAX() macro with FUSE_IS_VIRTIO_DAX(),
in preparation for famfs in later commits. The dummy
fuse_file_famfs() macro will be replaced with a working
function.

Reviewed-by: Joanne Koong <joannelkoong@gmail.com>
Reviewed-by: Dave Jiang <dave.jiang@intel.com>
Signed-off-by: John Groves <john@groves.net>
---
 fs/fuse/dir.c    |  2 +-
 fs/fuse/file.c   | 13 ++++++++-----
 fs/fuse/fuse_i.h |  9 ++++++++-
 fs/fuse/inode.c  |  4 ++--
 fs/fuse/iomode.c |  2 +-
 5 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 7ac6b232ef12..c63f097bc697 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -2161,7 +2161,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 		is_truncate = true;
 	}
 
-	if (FUSE_IS_DAX(inode) && is_truncate) {
+	if (FUSE_IS_VIRTIO_DAX(fi) && is_truncate) {
 		filemap_invalidate_lock(mapping);
 		fault_blocked = true;
 		err = fuse_dax_break_layouts(inode, 0, -1);
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 676fd9856bfb..150f2e1d6c2f 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -252,7 +252,7 @@ static int fuse_open(struct inode *inode, struct file *file)
 	int err;
 	bool is_truncate = (file->f_flags & O_TRUNC) && fc->atomic_o_trunc;
 	bool is_wb_truncate = is_truncate && fc->writeback_cache;
-	bool dax_truncate = is_truncate && FUSE_IS_DAX(inode);
+	bool dax_truncate = is_truncate && FUSE_IS_VIRTIO_DAX(fi);
 
 	if (fuse_is_bad(inode))
 		return -EIO;
@@ -1812,11 +1812,12 @@ static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 	struct file *file = iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
 	struct inode *inode = file_inode(file);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	if (fuse_is_bad(inode))
 		return -EIO;
 
-	if (FUSE_IS_DAX(inode))
+	if (FUSE_IS_VIRTIO_DAX(fi))
 		return fuse_dax_read_iter(iocb, to);
 
 	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
@@ -1833,11 +1834,12 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct file *file = iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
 	struct inode *inode = file_inode(file);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	if (fuse_is_bad(inode))
 		return -EIO;
 
-	if (FUSE_IS_DAX(inode))
+	if (FUSE_IS_VIRTIO_DAX(fi))
 		return fuse_dax_write_iter(iocb, from);
 
 	/* FOPEN_DIRECT_IO overrides FOPEN_PASSTHROUGH */
@@ -2370,10 +2372,11 @@ static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fm->fc;
 	struct inode *inode = file_inode(file);
+	struct fuse_inode *fi = get_fuse_inode(inode);
 	int rc;
 
 	/* DAX mmap is superior to direct_io mmap */
-	if (FUSE_IS_DAX(inode))
+	if (FUSE_IS_VIRTIO_DAX(fi))
 		return fuse_dax_mmap(file, vma);
 
 	/*
@@ -2934,7 +2937,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 		.mode = mode
 	};
 	int err;
-	bool block_faults = FUSE_IS_DAX(inode) &&
+	bool block_faults = FUSE_IS_VIRTIO_DAX(fi) &&
 		(!(mode & FALLOC_FL_KEEP_SIZE) ||
 		 (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE)));
 
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 7f16049387d1..80bf4438c436 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1508,7 +1508,14 @@ void fuse_free_conn(struct fuse_conn *fc);
 
 /* dax.c */
 
-#define FUSE_IS_DAX(inode) (IS_ENABLED(CONFIG_FUSE_DAX) && IS_DAX(inode))
+static inline bool fuse_file_famfs(struct fuse_inode *fuse_inode) /* Will be superseded */
+{
+	(void)fuse_inode;
+	return false;
+}
+#define FUSE_IS_VIRTIO_DAX(fuse_inode) (IS_ENABLED(CONFIG_FUSE_DAX)	\
+					&& IS_DAX(&(fuse_inode)->inode)  \
+					&& !fuse_file_famfs(fuse_inode))
 
 ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to);
 ssize_t fuse_dax_write_iter(struct kiocb *iocb, struct iov_iter *from);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index c795abe47a4f..f688c31f7eef 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -162,7 +162,7 @@ static void fuse_evict_inode(struct inode *inode)
 	/* Will write inode on close/munmap and in all other dirtiers */
 	WARN_ON(inode_state_read_once(inode) & I_DIRTY_INODE);
 
-	if (FUSE_IS_DAX(inode))
+	if (FUSE_IS_VIRTIO_DAX(fi))
 		dax_break_layout_final(inode);
 
 	truncate_inode_pages_final(&inode->i_data);
@@ -170,7 +170,7 @@ static void fuse_evict_inode(struct inode *inode)
 	if (inode->i_sb->s_flags & SB_ACTIVE) {
 		struct fuse_conn *fc = get_fuse_conn(inode);
 
-		if (FUSE_IS_DAX(inode))
+		if (FUSE_IS_VIRTIO_DAX(fi))
 			fuse_dax_inode_cleanup(inode);
 		if (fi->nlookup) {
 			fuse_queue_forget(fc, fi->forget, fi->nodeid,
diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c
index 3728933188f3..31ee7f3304c6 100644
--- a/fs/fuse/iomode.c
+++ b/fs/fuse/iomode.c
@@ -203,7 +203,7 @@ int fuse_file_io_open(struct file *file, struct inode *inode)
 	 * io modes are not relevant with DAX and with server that does not
 	 * implement open.
 	 */
-	if (FUSE_IS_DAX(inode) || !ff->args)
+	if (FUSE_IS_VIRTIO_DAX(fi) || !ff->args)
 		return 0;
 
 	/*
-- 
2.53.0



^ permalink raw reply related

* [PATCH V10 00/10] famfs: port into fuse
From: John Groves @ 2026-03-31 12:37 UTC (permalink / raw)
  To: John Groves, Miklos Szeredi, Dan Williams, Bernd Schubert,
	Alison Schofield
  Cc: John Groves, Jonathan Corbet, Shuah Khan, Vishal Verma,
	Dave Jiang, Matthew Wilcox, Jan Kara, Alexander Viro,
	David Hildenbrand, Christian Brauner, Darrick J . Wong,
	Randy Dunlap, Jeff Layton, Amir Goldstein, Jonathan Cameron,
	Stefan Hajnoczi, Joanne Koong, Josef Bacik, Bagas Sanjaya,
	Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
	Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
	Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
	linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org,
	John Groves
In-Reply-To: <20260331123702.35052-1-john@jagalactic.com>

From: John Groves <john@groves.net>

NOTE: this series depends on the famfs dax series in Ira's for-7.1/dax-famfs
branch [0]

Changes v9 -> v10
- Rebased to Ira's for-7.1/dax-famfs branch [0], which contains the required
  dax patches
- Add parentheses to FUSE_IS_VIRTIO_DAX() macro, in case something bad is
  passed in as fuse_inode (thanks Jonathan's AI)

Description:

This patch series introduces famfs into the fuse file system framework.
Famfs depends on the bundled dax patch set.

The famfs user space code can be found at [1].

Fuse Overview:

Famfs started as a standalone file system, but this series is intended to
permanently supersede that implementation. At a high level, famfs adds
two new fuse server messages:

GET_FMAP   - Retrieves a famfs fmap (the file-to-dax map for a famfs
	     file)
GET_DAXDEV - Retrieves the details of a particular daxdev that was
	     referenced by an fmap

Famfs Overview

Famfs exposes shared memory as a file system. Famfs consumes shared
memory from dax devices, and provides memory-mappable files that map
directly to the memory - no page cache involvement. Famfs differs from
conventional file systems in fs-dax mode, in that it handles in-memory
metadata in a sharable way (which begins with never caching dirty shared
metadata).

Famfs started as a standalone file system [2,3], but the consensus at
LSFMM was that it should be ported into fuse [4,5].

The key performance requirement is that famfs must resolve mapping faults
without upcalls. This is achieved by fully caching the file-to-devdax
metadata for all active files. This is done via two fuse client/server
message/response pairs: GET_FMAP and GET_DAXDEV.

Famfs remains the first fs-dax file system that is backed by devdax
rather than pmem in fs-dax mode (hence the need for the new dax mode).

Notes

- When a file is opened in a famfs mount, the OPEN is followed by a
  GET_FMAP message and response. The "fmap" is the full file-to-dax
  mapping, allowing the fuse/famfs kernel code to handle
  read/write/fault without any upcalls.

- After each GET_FMAP, the fmap is checked for extents that reference
  previously-unknown daxdevs. Each such occurrence is handled with a
  GET_DAXDEV message and response.

- Daxdevs are stored in a table (which might become an xarray at some
  point). When entries are added to the table, we acquire exclusive
  access to the daxdev via the fs_dax_get() call (modeled after how
  fs-dax handles this with pmem devices). Famfs provides
  holder_operations to devdax, providing a notification path in the
  event of memory errors or forced reconfiguration.

- If devdax notifies famfs of memory errors on a dax device, famfs
  currently blocks all subsequent accesses to data on that device. The
  recovery is to re-initialize the memory and file system. Famfs is
  memory, not storage...

- Because famfs uses backing (devdax) devices, only privileged mounts are
  supported (i.e. the fuse server requires CAP_SYS_RAWIO).

- The famfs kernel code never accesses the memory directly - it only
  facilitates read, write and mmap on behalf of user processes, using
  fmap metadata provided by its privileged fuse server. As such, the
  RAS of the shared memory affects applications, but not the kernel.

- Famfs has backing device(s), but they are devdax (char) rather than
  block. Right now there is no way to tell the vfs layer that famfs has a
  char backing device (unless we say it's block, but it's not). Currently
  we use the standard anonymous fuse fs_type - but I'm not sure that's
  ultimately optimal (thoughts?)

Changes v8 -> v9
- Kconfig: fs/fuse/Kconfig:CONFIG_FUSE_FAMFS_DAX now depends on the
  new CONFIG_DEV_DAX_FSDEV (from drivers/dax/Kconfig) rather than
  just CONFIG_DEV_DAX and CONFIG_FS_DAX. (CONFIG_FUSE_FAMFS_DAX
  depends on those...)

Changes v7 -> v8
- Moved to inline __free declaration in fuse_get_fmap() and
  famfs_fuse_meta_alloc(), famfs_teardown()
- Adopted FIELD_PREP() macro rather than manual bitfield manipulation
- Minor doc edits
- I dropped adding magic numbers to include/uapi/linux/magic.h. That
  can be done later if appropriate

Changes v6 -> v7
- Fixed a regression in famfs_interleave_fileofs_to_daxofs() that
  was reported by Intel's kernel test robot
- Added a check in __fsdev_dax_direct_access() for negative return
  from pgoff_to_phys(), which would indicate an out-of-range offset
- Fixed a bug in __famfs_meta_free(), where not all interleaved
  extents were freed
- Added chunksize alignment checks in famfs_fuse_meta_alloc() and
  famfs_interleave_fileofs_to_daxofs() as interleaved chunks must
  be PTE or PMD aligned
- Simplified famfs_file_init_dax() a bit
- Re-ran CM's kernel code review prompts on the entire series and
  fixed several minor issues

Changes v4 -> v5 -> v6
- None. Re-sending due to technical difficulties

Changes v3 [9] -> v4
- The patch "dax: prevent driver unbind while filesystem holds device"
  has been dropped. Dan Williams indicated that the favored behavior is
  for a file system to stop working if an underlying driver is unbound,
  rather than preventing the unbind.
- The patch "famfs_fuse: Famfs mount opt: -o shadow=<shadowpath>" has
  been dropped. Found a way for the famfs user space to do without the
  -o opt (via getxattr).
- Squashed the fs/fuse/Kconfig patch into the first subsequent patch
  that needed the change
  ("famfs_fuse: Basic fuse kernel ABI enablement for famfs")
- Many review comments addressed.
- Addressed minor kerneldoc infractions reported by test robot.

Changes v2 [7] -> v3
- Dax: Completely new fsdev driver (drivers/dax/fsdev.c) replaces the
  dev_dax_iomap modifications to bus.c/device.c. Devdax devices can now
  be switched among 'devdax', 'famfs' and 'system-ram' modes via daxctl
  or sysfs.
- Dax: fsdev uses MEMORY_DEVICE_FS_DAX type and leaves folios at order-0
  (no vmemmap_shift), allowing fs-dax to manage folio lifecycles
  dynamically like pmem does.
- Dax: The "poisoned page" problem is properly fixed via
  fsdev_clear_folio_state(), which clears stale mapping/compound state
  when fsdev binds. The temporary WARN_ON_ONCE workaround in fs/dax.c
  has been removed.
- Dax: Added dax_set_ops() so fsdev can set dax_operations at bind time
  (and clear them on unbind), since the dax_device is created before we
  know which driver will bind.
- Dax: Added custom bind/unbind sysfs handlers; unbind return -EBUSY if a
  filesystem holds the device, preventing unbind while famfs is mounted.
- Fuse: Famfs mounts now require that the fuse server/daemon has
  CAP_SYS_RAWIO because they expose raw memory devices.
- Fuse: Added DAX address_space_operations with noop_dirty_folio since
  famfs is memory-backed with no writeback required.
- Rebased to latest kernels, fully compatible with Alistair Popple
  et. al's recent dax refactoring.
- Ran this series through Chris Mason's code review AI prompts to check
  for issues - several subtle problems found and fixed.
- Dropped RFC status - this version is intended to be mergeable.

Changes v1 [8] -> v2:

- The GET_FMAP message/response has been moved from LOOKUP to OPEN, as
  was the pretty much unanimous consensus.
- Made the response payload to GET_FMAP variable sized (patch 12)
- Dodgy kerneldoc comments cleaned up or removed.
- Fixed memory leak of fc->shadow in patch 11 (thanks Joanne)
- Dropped many pr_debug and pr_notice calls


References

[0] - https://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm.git/
[1] - https://famfs.org (famfs user space)
[2] - https://lore.kernel.org/linux-cxl/cover.1708709155.git.john@groves.net/
[3] - https://lore.kernel.org/linux-cxl/cover.1714409084.git.john@groves.net/
[4] - https://lwn.net/Articles/983105/ (lsfmm 2024)
[5] - https://lwn.net/Articles/1020170/ (lsfmm 2025)
[6] - https://lore.kernel.org/linux-cxl/cover.8068ad144a7eea4a813670301f4d2a86a8e68ec4.1740713401.git-series.apopple@nvidia.com/
[7] - https://lore.kernel.org/linux-fsdevel/20250703185032.46568-1-john@groves.net/ (famfs fuse v2)
[8] - https://lore.kernel.org/linux-fsdevel/20250421013346.32530-1-john@groves.net/ (famfs fuse v1)
[9] - https://lore.kernel.org/linux-fsdevel/20260107153244.64703-1-john@groves.net/T/#mb2c868801be16eca82dab239a1d201628534aea7 (famfs fuse v3)


John Groves (10):
  famfs_fuse: Update macro s/FUSE_IS_DAX/FUSE_IS_VIRTIO_DAX/
  famfs_fuse: Basic fuse kernel ABI enablement for famfs
  famfs_fuse: Plumb the GET_FMAP message/response
  famfs_fuse: Create files with famfs fmaps
  famfs_fuse: GET_DAXDEV message and daxdev_table
  famfs_fuse: Plumb dax iomap and fuse read/write/mmap
  famfs_fuse: Add holder_operations for dax notify_failure()
  famfs_fuse: Add DAX address_space_operations with noop_dirty_folio
  famfs_fuse: Add famfs fmap metadata documentation
  famfs_fuse: Add documentation

 Documentation/filesystems/famfs.rst |  142 ++++
 Documentation/filesystems/index.rst |    1 +
 MAINTAINERS                         |   10 +
 fs/fuse/Kconfig                     |   13 +
 fs/fuse/Makefile                    |    1 +
 fs/fuse/dir.c                       |    2 +-
 fs/fuse/famfs.c                     | 1180 +++++++++++++++++++++++++++
 fs/fuse/famfs_kfmap.h               |  167 ++++
 fs/fuse/file.c                      |   45 +-
 fs/fuse/fuse_i.h                    |  116 ++-
 fs/fuse/inode.c                     |   35 +-
 fs/fuse/iomode.c                    |    2 +-
 fs/namei.c                          |    1 +
 include/uapi/linux/fuse.h           |   88 ++
 14 files changed, 1790 insertions(+), 13 deletions(-)
 create mode 100644 Documentation/filesystems/famfs.rst
 create mode 100644 fs/fuse/famfs.c
 create mode 100644 fs/fuse/famfs_kfmap.h


base-commit: 2ae624d5a555d47a735fb3f4d850402859a4db77
-- 
2.53.0



^ permalink raw reply

* Re: [PATCH v8 12/12] rv: Add nomiss deadline monitor
From: Juri Lelli @ 2026-03-31 12:32 UTC (permalink / raw)
  To: Gabriele Monaco
  Cc: linux-kernel, Steven Rostedt, Nam Cao, Juri Lelli,
	Jonathan Corbet, Masami Hiramatsu, linux-trace-kernel, linux-doc,
	Tomas Glozar, Clark Williams, John Kacur
In-Reply-To: <20260330111010.153663-13-gmonaco@redhat.com>

Hello,

On 30/03/26 13:10, Gabriele Monaco wrote:
> Add the deadline monitors collection to validate the deadline scheduler,
> both for deadline tasks and servers.
> 
> The currently implemented monitors are:
> * nomiss:
>     validate dl entities run to completion before their deadiline
> 
> Reviewed-by: Nam Cao <namcao@linutronix.de>
> Signed-off-by: Gabriele Monaco <gmonaco@redhat.com>

Looks good to me.

Reviewed-by: Juri Lelli <juri.lelli@redhat.com>

Best,
Juri


^ permalink raw reply

* Re: [PATCH net-next V9 04/14] devlink: Decouple rate storage from associated devlink object
From: Cosmin Ratiu @ 2026-03-31 12:28 UTC (permalink / raw)
  To: Tariq Toukan, kuba@kernel.org
  Cc: allison.henderson@oracle.com, Moshe Shemesh, jiri@resnulli.us,
	davem@davemloft.net, daniel.zahka@gmail.com,
	donald.hunter@gmail.com, netdev@vger.kernel.org,
	matttbe@kernel.org, pabeni@redhat.com, horms@kernel.org,
	Parav Pandit, corbet@lwn.net, kees@kernel.org, willemb@google.com,
	Dragos Tatulea, razor@blackwall.org, Adithya Jayachandran,
	Dan Jurgens, leon@kernel.org, vadim.fedorenko@linux.dev,
	linux-kernel@vger.kernel.org, Saeed Mahameed, shuah@kernel.org,
	andrew+netdev@lunn.ch, Mark Bloch, Jiri Pirko, Carolina Jubran,
	Shahar Shitrit, daniel@iogearbox.net, minhquangbui99@gmail.com,
	Nimrod Oren, dw@davidwei.uk, jacob.e.keller@intel.com,
	skhan@linuxfoundation.org, Petr Machata, edumazet@google.com,
	antonio@openvpn.net, mst@redhat.com,
	linux-kselftest@vger.kernel.org, linux-rdma@vger.kernel.org,
	Shay Drori, sdf@fomichev.me, chuck.lever@oracle.com, Gal Pressman,
	joe@dama.to, linux-doc@vger.kernel.org
In-Reply-To: <20260331020810.3524941-1-kuba@kernel.org>

On Mon, 2026-03-30 at 19:08 -0700, Jakub Kicinski wrote:
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> ---
> devlink: Decouple rate storage from associated devlink object
> 
> Devlink rate leafs and nodes were stored in their respective devlink
> objects pointed to by devlink_rate->devlink.
> 
> > diff --git a/net/devlink/rate.c b/net/devlink/rate.c
> > index 4781429109195..9ebbc72130c6a 100644
> > --- a/net/devlink/rate.c
> > +++ b/net/devlink/rate.c
> [ ... ]
> > @@ -648,8 +702,9 @@ int devlink_nl_rate_new_doit(struct sk_buff
> > *skb, struct genl_info *info)
> >  		goto err_rate_set;
> >  
> >  	refcount_set(&rate_node->refcnt, 1);
> > -	list_add(&rate_node->list, &devlink->rate_list);
> > +	list_add(&rate_node->list, &rate_devlink->rate_list);
> >  	devlink_rate_notify(rate_node, DEVLINK_CMD_RATE_NEW);
> > +	devl_rate_unlock(devlink);
> >  	return 0;
> >  
> >  err_rate_set:
> >  	ops->rate_node_del(rate_node, rate_node->priv, info-
> > >extack);
> > @@ -658,6 +713,8 @@ int devlink_nl_rate_new_doit(struct sk_buff
> > *skb, struct genl_info *info)
> >  	kfree(rate_node->name);
> >  err_strdup:
> >  	kfree(rate_node);
> > +unlock:
> > +	devl_rate_unlock(devlink);
> >  	return err;
> >  }
> 
> If devlink_nl_rate_set() fails after it successfully sets the parent
> node
> and increments the parent's refcount, does this error path
> permanently
> leak that reference since it frees the node without decrementing it?

This is a preexisting bug caused by the fact that devlink_nl_rate_set
doesn't rollback what it did on failure, leaving various side effects.
Most are setting of various rate params, but this parent refcount is
more serious. It will have to be addressed, but I don't think this
series is the right place for it. (I'll send a separate patch soon).

> 
> [ ... ]
> > @@ -716,14 +785,20 @@ devl_rate_node_create(struct devlink
> > *devlink, void *priv, char *node_name,
> >  		      struct devlink_rate *parent)
> >  {
> >  	struct devlink_rate *rate_node;
> > +	struct devlink *rate_devlink;
> >  
> > +	rate_devlink = devl_rate_lock(devlink);
> >  	rate_node = devlink_rate_node_get_by_name(devlink,
> > node_name);
> > -	if (!IS_ERR(rate_node))
> > -		return ERR_PTR(-EEXIST);
> > +	if (!IS_ERR(rate_node)) {
> > +		rate_node = ERR_PTR(-EEXIST);
> > +		goto unlock;
> > +	}
> >  
> >  	rate_node = kzalloc_obj(*rate_node);
> > -	if (!rate_node)
> > -		return ERR_PTR(-ENOMEM);
> > +	if (!rate_node) {
> > +		rate_node = ERR_PTR(-ENOMEM);
> > +		goto unlock;
> > +	}
> >  
> >  	if (parent) {
> >  		rate_node->parent = parent;
> > @@ -737,12 +812,15 @@ devl_rate_node_create(struct devlink
> > *devlink, void *priv, char *node_name,
> >  	rate_node->name = kstrdup(node_name, GFP_KERNEL);
> >  	if (!rate_node->name) {
> >  		kfree(rate_node);
> > -		return ERR_PTR(-ENOMEM);
> > +		rate_node = ERR_PTR(-ENOMEM);
> > +		goto unlock;
> >  	}
> 
> Similarly, if kstrdup() fails here, does this leak the parent's
> refcount
> since it was already incremented earlier in the function?

Another preexisting issue. These code changes brought this section to
Argus' Eyes attention. I made a note to send a patch to fix this.

Cosmin.

^ permalink raw reply

* [PATCH] Documentation: gpio: update the preferred method for using software node lookup
From: Bartosz Golaszewski @ 2026-03-31 12:28 UTC (permalink / raw)
  To: Linus Walleij, Bartosz Golaszewski, Jonathan Corbet, Shuah Khan,
	Dmitry Torokhov
  Cc: linux-gpio, linux-doc, linux-kernel, Bartosz Golaszewski

In its current version, the manual for converting of board files from
using GPIO lookup tables to software nodes recommends leaving the
software nodes representing GPIO controllers as "free-floating", not
attached objects and relying on the matching of their names against the
GPIO controller's name. This is an abuse of the software node API and
makes it impossible to create fw_devlinks between GPIO suppliers and
consumers in this case. We want to remove this behavior from GPIOLIB and
to this end, work on converting all existing drivers to using "attached"
software nodes.

Except for a few corner-cases where board files define consumers
depending on GPIO controllers described in firmware - where we need to
reference a real firmware node from a software node - which requires a
more complex approach, most board files can easily be converted to using
propert firmware node lookup.

Update the documentation to recommend attaching the GPIO chip's software
nodes to the actual platform devices and show how to do it.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 Documentation/driver-api/gpio/board.rst         | 15 +++++++++---
 Documentation/driver-api/gpio/legacy-boards.rst | 32 ++++++++++++++++++-------
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/Documentation/driver-api/gpio/board.rst b/Documentation/driver-api/gpio/board.rst
index 0993cac891fb5e4887a1aee6deae273197c6aae1..c2880533742b1b55108f28853a3903cb273fe791 100644
--- a/Documentation/driver-api/gpio/board.rst
+++ b/Documentation/driver-api/gpio/board.rst
@@ -108,9 +108,8 @@ macro, which ties a software node representing the GPIO controller with
 consumer device. It allows consumers to use regular gpiolib APIs, such as
 gpiod_get(), gpiod_get_optional().
 
-The software node representing a GPIO controller need not be attached to the
-GPIO controller device. The only requirement is that the node must be
-registered and its name must match the GPIO controller's label.
+The software node representing a GPIO controller must be attached to the
+GPIO controller device - either as the primary or the secondary firmware node.
 
 For example, here is how to describe a single GPIO-connected LED. This is an
 alternative to using platform_data on legacy systems.
@@ -153,6 +152,16 @@ alternative to using platform_data on legacy systems.
 	};
 	software_node_register_node_group(swnodes);
 
+	/*
+	 * 5. Attach the GPIO controller's software node to the device and
+	 *    register it.
+	 */
+	 static void gpio_foo_register(void)
+	 {
+		gpio_foo_pdev.dev.fwnode = software_node_fwnode(&gpio_controller_node);
+		platform_device_register(&gpio_foo_pdev);
+	 }
+
 	// Then register a platform_device for "leds-gpio" and associate
 	// it with &led_device_swnode via .fwnode.
 
diff --git a/Documentation/driver-api/gpio/legacy-boards.rst b/Documentation/driver-api/gpio/legacy-boards.rst
index 46e3a26dba772e5e5117866b5d202e76c8e2adf2..fac63dd38d5b71c3bf43b5286a432f6449f422d0 100644
--- a/Documentation/driver-api/gpio/legacy-boards.rst
+++ b/Documentation/driver-api/gpio/legacy-boards.rst
@@ -36,12 +36,10 @@ Requirements for GPIO Properties
 When using software nodes to describe GPIO connections, the following
 requirements must be met for the GPIO core to correctly resolve the reference:
 
-1.  **The GPIO controller's software node "name" must match the controller's
-    "label".** The gpiolib core uses this name to find the corresponding
-    struct gpio_chip at runtime.
-    This software node has to be registered, but need not be attached to the
-    device representing the GPIO controller that is providing the GPIO in
-    question. It may be left as a "free floating" node.
+1.  **The GPIO controller's software node must be registered and attached to
+    the controller's ``struct device`` either as its primary or secondary
+    firmware node.** The gpiolib core uses the address of the firmware node to
+    find the corresponding ``struct gpio_chip`` at runtime.
 
 2.  **The GPIO property must be a reference.** The ``PROPERTY_ENTRY_GPIO()``
     macro handles this as it is an alias for ``PROPERTY_ENTRY_REF()``.
@@ -75,6 +73,11 @@ A typical legacy board file might look like this:
 
   #define MYBOARD_GPIO_CONTROLLER "gpio-foo"
 
+  static struct platform_device myboard_gpio = {
+        .name = MYBOARD_GPIO_CONTROLLER,
+        .id = PLATFORM_DEVID_NONE,
+  };
+
   /* LED setup */
   static const struct gpio_led myboard_leds[] = {
   	{
@@ -124,6 +127,7 @@ A typical legacy board file might look like this:
   	gpiod_add_lookup_table(&myboard_leds_gpios);
   	gpiod_add_lookup_table(&myboard_buttons_gpios);
 
+        platform_device_register(&myboard_gpio);
   	platform_device_register_data(NULL, "leds-gpio", -1,
   				      &myboard_leds_pdata, sizeof(myboard_leds_pdata));
   	platform_device_register_data(NULL, "gpio-keys", -1,
@@ -141,8 +145,7 @@ Step 1: Define the GPIO Controller Node
 ***************************************
 
 First, define a software node that represents the GPIO controller that the
-LEDs and buttons are connected to. The ``name`` of this node must match the
-name of the driver for the GPIO controller (e.g., "gpio-foo").
+LEDs and buttons are connected to. The ``name`` of this node is optional.
 
 .. code-block:: c
 
@@ -257,6 +260,16 @@ software nodes using the ``fwnode`` field in struct platform_device_info.
   	if (error)
   		return error;
 
+  	memset(&pdev_info, 0, sizeof(pdev_info));
+  	pdev_info.name = MYBOARD_GPIO_CONTROLLER;
+  	pdev_info.id = PLATFORM_DEVID_NONE;
+  	pdev_info.fwnode = software_node_fwnode(&myboard_gpio_controller_node);
+  	gpio_pdev = platform_device_register_full(&pdev_info);
+  	if (IS_ERR(gpio_pdev)) {
+  		error = PTR_ERR(gpio_pdev);
+  		goto err_unregister_nodes;
+  	}
+
   	memset(&pdev_info, 0, sizeof(pdev_info));
   	pdev_info.name = "leds-gpio";
   	pdev_info.id = PLATFORM_DEVID_NONE;
@@ -264,6 +277,7 @@ software nodes using the ``fwnode`` field in struct platform_device_info.
   	leds_pdev = platform_device_register_full(&pdev_info);
   	if (IS_ERR(leds_pdev)) {
   		error = PTR_ERR(leds_pdev);
+  		platform_device_unregister(gpio_pdev);
   		goto err_unregister_nodes;
   	}
 
@@ -274,6 +288,7 @@ software nodes using the ``fwnode`` field in struct platform_device_info.
   	keys_pdev = platform_device_register_full(&pdev_info);
   	if (IS_ERR(keys_pdev)) {
   		error = PTR_ERR(keys_pdev);
+  		platform_device_unregister(gpio_pdev);
   		platform_device_unregister(leds_pdev);
   		goto err_unregister_nodes;
   	}
@@ -289,6 +304,7 @@ software nodes using the ``fwnode`` field in struct platform_device_info.
   {
   	platform_device_unregister(keys_pdev);
   	platform_device_unregister(leds_pdev);
+	platform_device_unregister(gpio_pdev);
   	software_node_unregister_node_group(myboard_swnodes);
   }
 

---
base-commit: 3b058d1aeeeff27a7289529c4944291613b364e9
change-id: 20260331-doc-gpio-swnodes-fc3ddf59b8dc

Best regards,
-- 
Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>


^ permalink raw reply related

* Re: [PATCH v10 0/8] ACPI: Unify CPU UID interface and fix ARM64 TPH steer-tag issue
From: Rafael J. Wysocki @ 2026-03-31 12:24 UTC (permalink / raw)
  To: Chengwen Feng
  Cc: Bjorn Helgaas, Catalin Marinas, Will Deacon, Jonathan Corbet,
	Palmer Dabbelt, Borislav Petkov, H . Peter Anvin, Juergen Gross,
	Boris Ostrovsky, Sunil V L, Mark Rutland, Jonathan Cameron,
	Kees Cook, Yanteng Si, Sean Christopherson, Kai Huang,
	Tom Lendacky, Thomas Huth, Thorsten Blum, Kevin Loughlin,
	Zheyun Shen, Peter Zijlstra, Pawan Gupta, Xin Li,
	Ahmed S . Darwish, Sohil Mehta, Ilkka Koskinen, Robin Murphy,
	James Clark, Besar Wicaksono, Ma Ke, Wei Huang, Andy Gospodarek,
	Somnath Kotur, punit.agrawal, guohanjun, suzuki.poulose,
	ryan.roberts, chenl311, masahiroy, wangyuquan1236,
	anshuman.khandual, heinrich.schuchardt, Eric.VanTassell,
	wangzhou1, wanghuiqiang, liuyonglong, linux-pci, linux-doc,
	linux-kernel, linux-arm-kernel, loongarch, linux-riscv, xen-devel,
	linux-acpi, linux-perf-users
In-Reply-To: <20260320031737.35048-1-fengchengwen@huawei.com>

On Fri, Mar 20, 2026 at 4:17 AM Chengwen Feng <fengchengwen@huawei.com> wrote:
>
> This patchset unifies ACPI Processor UID retrieval across
> arm64/loongarch/riscv/x86 via acpi_get_cpu_uid() (with input validation)
> and fixes ARM64 CPU steer-tag retrieval failure in PCI/TPH:
>
> 1-4: Add acpi_get_cpu_uid() for arm64/loongarch/riscv/x86 (update
>      respective users)
> 5: Centralize acpi_get_cpu_uid() declaration in include/linux/acpi.h
> 6: Clean up perf/arm_cspmu
> 7: Clean up ACPI/PPTT and remove unused get_acpi_id_for_cpu()
> 8: Pass ACPI Processor UID to Cache Locality _DSM
>
> The interface refactor ensures consistent CPU UID retrieval across
> architectures (no functional changes for valid inputs) and provides the
> unified interface required for the ARM64 TPH fix.
>
> ---
> Changes in v10:
> - Refine commit header&log according to Punit's and Bjorn's review
> - Split perf/arm_cspmu as a separate commit which address Punit's
>   review
>
> Changes in v9:
> - Address Bjorn's review: split commits to each platform so that make
>   them easy to review
>
> Changes in v8:
> - Moving arm64's get_cpu_for_acpi_id() to kernel/acpi.c which address
>   Jeremy's review
>
> Chengwen Feng (8):
>   arm64: acpi: Add acpi_get_cpu_uid() for unified ACPI CPU UID retrieval
>   LoongArch: Add acpi_get_cpu_uid() for unified ACPI CPU UID retrieval
>   RISC-V: ACPI: Add acpi_get_cpu_uid() for unified ACPI CPU UID
>     retrieval
>   x86/acpi: Add acpi_get_cpu_uid() for unified ACPI CPU UID retrieval
>   ACPI: Centralize acpi_get_cpu_uid() declaration in
>     include/linux/acpi.h
>   perf: arm_cspmu: Switch to acpi_get_cpu_uid() from
>     get_acpi_id_for_cpu()
>   ACPI: PPTT: Use acpi_get_cpu_uid() and remove get_acpi_id_for_cpu()
>   PCI/TPH: Pass ACPI Processor UID to Cache Locality _DSM
>
>  Documentation/PCI/tph.rst          |  4 +--
>  arch/arm64/include/asm/acpi.h      | 17 +---------
>  arch/arm64/kernel/acpi.c           | 30 ++++++++++++++++++
>  arch/loongarch/include/asm/acpi.h  |  5 ---
>  arch/loongarch/kernel/acpi.c       |  9 ++++++
>  arch/riscv/include/asm/acpi.h      |  4 ---
>  arch/riscv/kernel/acpi.c           | 16 ++++++++++
>  arch/riscv/kernel/acpi_numa.c      |  9 ++++--
>  arch/x86/include/asm/cpu.h         |  1 -
>  arch/x86/include/asm/smp.h         |  1 -
>  arch/x86/kernel/acpi/boot.c        | 20 ++++++++++++
>  arch/x86/xen/enlighten_hvm.c       |  5 +--
>  drivers/acpi/pptt.c                | 50 ++++++++++++++++++++++--------
>  drivers/acpi/riscv/rhct.c          |  7 ++++-
>  drivers/pci/tph.c                  | 16 +++++++---
>  drivers/perf/arm_cspmu/arm_cspmu.c |  6 ++--
>  include/linux/acpi.h               | 11 +++++++
>  include/linux/pci-tph.h            |  4 +--
>  18 files changed, 158 insertions(+), 57 deletions(-)
>
> --

It doesn't look like anyone has a particular heartburn related to this
series, so I could apply it in principle, but I'd appreciate some ACKs
from arch maintainers.

Why don't you resend it with all of the tags collected so far (and
please add x86@kernel.org to the CC list)?

^ permalink raw reply

* Re: (sashiko status) [PATCH 0/2] Docs/admin-guide/mm/damon: warn commit_inputs vs other params race
From: Theodore Tso @ 2026-03-31 12:19 UTC (permalink / raw)
  To: Andrew Morton
  Cc: SeongJae Park, Greg KH, Liam R. Howlett, # 5 . 19 . x,
	David Hildenbrand, Jonathan Corbet, Lorenzo Stoakes, Michal Hocko,
	Mike Rapoport, Shuah Khan, Suren Baghdasaryan, Vlastimil Babka,
	damon, linux-doc, linux-kernel, linux-mm, Roman Gushchin
In-Reply-To: <20260330142205.e7c7d7b47ec15a634f6eebf4@linux-foundation.org>

On Mon, Mar 30, 2026 at 02:22:05PM -0700, Andrew Morton wrote:
> 
> I view Sashiko as primarily an author tool.  Sometimes I call it
> checkpatch++.  In a better world, author would be able to sort out
> Sashiko issues before ever sending out the patchset.  But in this
> world, a public send is needed to obtain that review.

Note that Sashiko is fully open source and the prompts are available
in third_party/prompts in the git repo:

	https://github.com/sashiko-dev/sashiko

So people can run it privately, although they will need to provide
their own LLM credits.  This also means that you can use some other
LLM besides Gemini 3.1 Pro.

Cheers,

						- Ted

^ permalink raw reply

* Re: [PATCH net-next V9 02/14] devlink: Add helpers to lock nested-in instances
From: Cosmin Ratiu @ 2026-03-31 12:20 UTC (permalink / raw)
  To: Tariq Toukan, kuba@kernel.org
  Cc: allison.henderson@oracle.com, Moshe Shemesh, jiri@resnulli.us,
	davem@davemloft.net, daniel.zahka@gmail.com,
	donald.hunter@gmail.com, netdev@vger.kernel.org,
	matttbe@kernel.org, pabeni@redhat.com, horms@kernel.org,
	Parav Pandit, corbet@lwn.net, kees@kernel.org, willemb@google.com,
	Dragos Tatulea, razor@blackwall.org, Adithya Jayachandran,
	Dan Jurgens, leon@kernel.org, vadim.fedorenko@linux.dev,
	linux-kernel@vger.kernel.org, Saeed Mahameed, shuah@kernel.org,
	andrew+netdev@lunn.ch, Mark Bloch, Jiri Pirko, Carolina Jubran,
	Shahar Shitrit, daniel@iogearbox.net, minhquangbui99@gmail.com,
	Nimrod Oren, dw@davidwei.uk, jacob.e.keller@intel.com,
	skhan@linuxfoundation.org, Petr Machata, edumazet@google.com,
	antonio@openvpn.net, mst@redhat.com,
	linux-kselftest@vger.kernel.org, linux-rdma@vger.kernel.org,
	Shay Drori, sdf@fomichev.me, chuck.lever@oracle.com, Gal Pressman,
	joe@dama.to, linux-doc@vger.kernel.org
In-Reply-To: <20260331020807.3524811-1-kuba@kernel.org>

On Mon, 2026-03-30 at 19:08 -0700, Jakub Kicinski wrote:
> This is an AI-generated review of your patch. The human sending this
> email has considered the AI review valid, or at least plausible.
> ---
> devlink: Add helpers to lock nested-in instances
> 
> Upcoming code will need to obtain a reference to locked nested-in
> devlink instances. Add helpers to lock, obtain an already locked
> reference and unlock/unref the nested-in instance.
> 
> > diff --git a/net/devlink/core.c b/net/devlink/core.c
> > index eeb6a71f5f56e..db11248df7120 100644
> > --- a/net/devlink/core.c
> > +++ b/net/devlink/core.c
> 
> [ ... ]
> 
> > +/* Returns the nested in devlink object and validates its lock is
> > held. */
> > +struct devlink *devlink_nested_in_get_locked(struct devlink_rel
> > *rel)
> > +{
> > +	struct devlink *devlink;
> > +	unsigned long index;
> > +
> > +	if (!rel)
> > +		return NULL;
> > +	index = rel->nested_in.devlink_index;
> > +	devlink = xa_find(&devlinks, &index, index,
> > DEVLINK_REGISTERED);
> > +	if (devlink)
> > +		devl_assert_locked(devlink);
> 
> Can this dereference a freed pointer if the lock is not actually
> held?
> 
> xa_find() acquires and releases the RCU read lock internally. When it
> returns,
> the devlink pointer is no longer protected by RCU or a reference
> count.
> 
> If a caller invokes this helper without holding the devlink lock
> (which is the
> exact scenario the assertion is designed to detect), another thread
> could
> concurrently unregister and free the devlink.
> 
> If the RCU grace period expires before the assertion executes,
> devl_assert_locked() would dereference freed memory. Should the
> lookup and
> assertion be enclosed within an explicit rcu_read_lock() block to
> safely
> assert the lock?

This function is meant to be invoked by callers already holding all
locks from a given instance up the nesting tree. Calling it outside
this context could indeed lead to a race as described, where another
entity unregisters a devlink about-to-be-asserted on.

All current callers hold the locks, but to make it more robust, I will
add the rcu_read_lock/unlock section around the find+assert.

> 
> > +	return devlink;
> > +}
> > +
> > +void devlink_nested_in_put_unlock(struct devlink_rel *rel)
> > +{
> > +	struct devlink *devlink =
> > devlink_nested_in_get_locked(rel);
> 
> Could this corrupt the mutex state and cause a refcount underflow?
> 
> This helper takes a devlink_rel pointer instead of the devlink
> pointer
> acquired by devlink_nested_in_get_lock(), and performs a secondary
> global
> lookup to find the devlink.
> 
> If a caller mistakenly calls this in an error cleanup path where they
> did not
> actually acquire the lock, the global xa_find() will still locate the
> registered devlink. This would execute devl_unlock() and
> devlink_put() on a
> devlink the current thread does not own.
> 
> Would it be safer for unlock/put helpers to take the exact pointer
> returned by
> the lock/get helper to ensure safe resource cleanup?

2 issues here:
1) Mistakenly calling this without having acquired the lock. This is
akin to saying mutex_unlock is dangerous if the lock isn't held.
Technically true, but moot.
2) The rel argument: It is intentional, so that all 3 functions are
symmetrical.

> 
> > +
> > +	if (devlink) {
> > +		devl_unlock(devlink);
> > +		devlink_put(devlink);
> > +	}
> > +}


^ permalink raw reply

* Re: [PATCH] Documentation: amd-pstate: fix dead links in the reference section
From: Mario Limonciello @ 2026-03-31 11:58 UTC (permalink / raw)
  To: Ninad Naik, ray.huang, gautham.shenoy, mario.limonciello,
	perry.yuan, corbet, skhan
  Cc: linux-pm, linux-doc, linux-kernel, me
In-Reply-To: <20260330190855.1115304-1-ninadnaik07@gmail.com>



On 3/30/26 2:08 PM, Ninad Naik wrote:
> The links for AMD64 Architecture Programmer's Manual and PPR for AMD
> Family 19h Model 51h, Revision A1 Processors redirect to a generic page.
> Update the links to the working ones.
> 
> Signed-off-by: Ninad Naik <ninadnaik07@gmail.com>
Acked-by: Mario Limonciello (AMD) <superm1@kernel.org>

Applied.

> ---
>   Documentation/admin-guide/pm/amd-pstate.rst | 4 ++--
>   1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst
> index e1771f2225d5..13d6580894bc 100644
> --- a/Documentation/admin-guide/pm/amd-pstate.rst
> +++ b/Documentation/admin-guide/pm/amd-pstate.rst
> @@ -790,13 +790,13 @@ Reference
>   ===========
>   
>   .. [1] AMD64 Architecture Programmer's Manual Volume 2: System Programming,
> -       https://www.amd.com/system/files/TechDocs/24593.pdf
> +       https://docs.amd.com/v/u/en-US/24593_3.44_APM_Vol2
>   
>   .. [2] Advanced Configuration and Power Interface Specification,
>          https://uefi.org/sites/default/files/resources/ACPI_Spec_6_4_Jan22.pdf
>   
>   .. [3] Processor Programming Reference (PPR) for AMD Family 19h Model 51h, Revision A1 Processors
> -       https://www.amd.com/system/files/TechDocs/56569-A1-PUB.zip
> +       https://docs.amd.com/v/u/en-US/56569-A1-PUB_3.03
>   
>   .. [4] Linux Kernel Selftests,
>          https://www.kernel.org/doc/html/latest/dev-tools/kselftest.html


^ permalink raw reply

* Re: [PATCH net-next v2 2/3] dpll: add frequency monitoring callback ops
From: Vadim Fedorenko @ 2026-03-31 11:45 UTC (permalink / raw)
  To: Ivan Vecera, netdev
  Cc: Arkadiusz Kubalewski, Jiri Pirko, Jonathan Corbet, Shuah Khan,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Donald Hunter, Prathosh Satish, Petr Oros,
	linux-doc, linux-kernel
In-Reply-To: <20260330105505.715099-3-ivecera@redhat.com>

On 30/03/2026 11:55, Ivan Vecera wrote:
> Add new callback operations for a dpll device:
> - freq_monitor_get(..) - to obtain current state of frequency monitor
>    feature from dpll device,
> - freq_monitor_set(..) - to allow feature configuration.
> 
> Add new callback operation for a dpll pin:
> - measured_freq_get(..) - to obtain the measured frequency in Hz.
> 
> Obtain the feature state value using the get callback and provide it to
> the user if the device driver implements callbacks. The measured_freq_get
> pin callback is only invoked when the frequency monitor is enabled.
> 
> Execute the set callback upon user requests.
> 
> Signed-off-by: Ivan Vecera <ivecera@redhat.com>

Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>

^ permalink raw reply

* Re: [PATCH v11 03/22] drm: Add new general DRM property "color format"
From: Ville Syrjälä @ 2026-03-31 11:44 UTC (permalink / raw)
  To: Nicolas Frattaroli
  Cc: Maxime Ripard, Harry Wentland, Leo Li, Rodrigo Siqueira,
	Alex Deucher, Christian König, David Airlie, Simona Vetter,
	Maarten Lankhorst, Thomas Zimmermann, Andrzej Hajda,
	Neil Armstrong, Robert Foss, Laurent Pinchart, Jonas Karlman,
	Jernej Skrabec, Sandy Huang, Heiko Stübner, Andy Yan,
	Jani Nikula, Rodrigo Vivi, Joonas Lahtinen, Tvrtko Ursulin,
	Dmitry Baryshkov, Sascha Hauer, Rob Herring, Jonathan Corbet,
	Shuah Khan, kernel, amd-gfx, dri-devel, linux-kernel,
	linux-arm-kernel, linux-rockchip, intel-gfx, intel-xe, linux-doc,
	Werner Sembach, Andri Yngvason, Marius Vlad, Pekka Paalanen,
	Simon Ser, Sebastian Wick, Jonas Ådahl, Xaver Hugl
In-Reply-To: <5583906.GXAFRqVoOG@workhorse>

On Tue, Mar 31, 2026 at 12:33:00PM +0200, Nicolas Frattaroli wrote:
> On Tuesday, 31 March 2026 01:56:16 Central European Summer Time Ville Syrjälä wrote:
> > On Sat, Mar 28, 2026 at 02:49:04AM +0200, Ville Syrjälä wrote:
> > > On Fri, Mar 27, 2026 at 01:56:06PM +0100, Nicolas Frattaroli wrote:
> > > > On Thursday, 26 March 2026 18:58:25 Central European Standard Time Ville Syrjälä wrote:
> > > > > On Thu, Mar 26, 2026 at 06:02:47PM +0100, Maxime Ripard wrote:
> > > > > > On Wed, Mar 25, 2026 at 08:43:15PM +0200, Ville Syrjälä wrote:
> > > > > > > On Wed, Mar 25, 2026 at 03:56:58PM +0100, Maxime Ripard wrote:
> > > > > > > > On Wed, Mar 25, 2026 at 01:03:07PM +0200, Ville Syrjälä wrote:
> > > > > > > > > On Wed, Mar 25, 2026 at 09:24:27AM +0100, Maxime Ripard wrote:
> > > > > > > > > > On Tue, Mar 24, 2026 at 09:53:35PM +0200, Ville Syrjälä wrote:
> > > > > > > > > > > On Tue, Mar 24, 2026 at 08:10:11PM +0100, Nicolas Frattaroli wrote:
> > > > > > > > > > > > On Tuesday, 24 March 2026 18:00:45 Central European Standard Time Ville Syrjälä wrote:
> > > > > > > > > > > > > On Tue, Mar 24, 2026 at 05:01:07PM +0100, Nicolas Frattaroli wrote:
> > > > > > > > > > > > > > +enum drm_connector_color_format {
> > > > > > > > > > > > > > +	/**
> > > > > > > > > > > > > > +	 * @DRM_CONNECTOR_COLOR_FORMAT_AUTO: The driver or display protocol
> > > > > > > > > > > > > > +	 * helpers should pick a suitable color format. All implementations of a
> > > > > > > > > > > > > > +	 * specific display protocol must behave the same way with "AUTO", but
> > > > > > > > > > > > > > +	 * different display protocols do not necessarily have the same "AUTO"
> > > > > > > > > > > > > > +	 * semantics.
> > > > > > > > > > > > > > +	 *
> > > > > > > > > > > > > > +	 * For HDMI, "AUTO" picks RGB, but falls back to YCbCr 4:2:0 if the
> > > > > > > > > > > > > > +	 * bandwidth required for full-scale RGB is not available, or the mode
> > > > > > > > > > > > > > +	 * is YCbCr 4:2:0-only, as long as the mode and output both support
> > > > > > > > > > > > > > +	 * YCbCr 4:2:0.
> > > > > > > > > > > > > > +	 *
> > > > > > > > > > > > > > +	 * For display protocols other than HDMI, the recursive bridge chain
> > > > > > > > > > > > > > +	 * format selection picks the first chain of bridge formats that works,
> > > > > > > > > > > > > > +	 * as has already been the case before the introduction of the "color
> > > > > > > > > > > > > > +	 * format" property. Non-HDMI bridges should therefore either sort their
> > > > > > > > > > > > > > +	 * bus output formats by preference, or agree on a unified auto format
> > > > > > > > > > > > > > +	 * selection logic that's implemented in a common state helper (like
> > > > > > > > > > > > > > +	 * how HDMI does it).
> > > > > > > > > > > > > > +	 */
> > > > > > > > > > > > > > +	DRM_CONNECTOR_COLOR_FORMAT_AUTO = 0,
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	/**
> > > > > > > > > > > > > > +	 * @DRM_CONNECTOR_COLOR_FORMAT_RGB444: RGB output format
> > > > > > > > > > > > > > +	 */
> > > > > > > > > > > > > > +	DRM_CONNECTOR_COLOR_FORMAT_RGB444,
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	/**
> > > > > > > > > > > > > > +	 * @DRM_CONNECTOR_COLOR_FORMAT_YCBCR444: YCbCr 4:4:4 output format (ie.
> > > > > > > > > > > > > > +	 * not subsampled)
> > > > > > > > > > > > > > +	 */
> > > > > > > > > > > > > > +	DRM_CONNECTOR_COLOR_FORMAT_YCBCR444,
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	/**
> > > > > > > > > > > > > > +	 * @DRM_CONNECTOR_COLOR_FORMAT_YCBCR422: YCbCr 4:2:2 output format (ie.
> > > > > > > > > > > > > > +	 * with horizontal subsampling)
> > > > > > > > > > > > > > +	 */
> > > > > > > > > > > > > > +	DRM_CONNECTOR_COLOR_FORMAT_YCBCR422,
> > > > > > > > > > > > > > +
> > > > > > > > > > > > > > +	/**
> > > > > > > > > > > > > > +	 * @DRM_CONNECTOR_COLOR_FORMAT_YCBCR420: YCbCr 4:2:0 output format (ie.
> > > > > > > > > > > > > > +	 * with horizontal and vertical subsampling)
> > > > > > > > > > > > > > +	 */
> > > > > > > > > > > > > > +	DRM_CONNECTOR_COLOR_FORMAT_YCBCR420,
> > > > > > > > > > > > > 
> > > > > > > > > > > > > Seems like this should document what the quantization range
> > > > > > > > > > > > > should be for each format.
> > > > > > > > > > > > > 
> > > > > > > > > > > > 
> > > > > > > > > > > > I don't think so? If you want per-component bit depth values,
> > > > > > > > > > > > DRM_FORMAT_* defines would be the appropriate values to use. This
> > > > > > > > > > > > enum is more abstract than that, and is there to communicate
> > > > > > > > > > > > YUV vs. RGB and chroma subsampling, with bit depth being handled
> > > > > > > > > > > > by other properties.
> > > > > > > > > > > > 
> > > > > > > > > > > > If you mean the factor used for subsampling, then that'd only be
> > > > > > > > > > > > relevant if YCBCR410 was supported where one chroma plane isn't
> > > > > > > > > > > > halved but quartered in resolution. I suspect 4:1:0 will never
> > > > > > > > > > > > be added; no digital display protocol standard supports it to my
> > > > > > > > > > > > knowledge, and hopefully none ever will.
> > > > > > > > > > > 
> > > > > > > > > > > No, I mean the quantization range (16-235 vs. 0-255 etc).
> > > > > > > > > > > 
> > > > > > > > > > > The i915 behaviour is that YCbCr is always limited range,
> > > > > > > > > > > RGB can either be full or limited range depending on the 
> > > > > > > > > > > "Broadcast RGB" property and other related factors.
> > > > > > > > > > 
> > > > > > > > > > So far the HDMI state has both the format and quantization range as
> > > > > > > > > > different fields. I'm not sure we need to document the range in the
> > > > > > > > > > format field, maybe only mention it's not part of the format but has a
> > > > > > > > > > field of its own?
> > > > > > > > > 
> > > > > > > > > I think we only have it for RGB (on some drivers only?). For YCbCr
> > > > > > > > > I think the assumption is limited range everywhere.
> > > > > > > > > 
> > > > > > > > > But I'm not really concerned about documenting struct members.
> > > > > > > > > What I'm talking about is the *uapi* docs. Surely userspace
> > > > > > > > > will want to know what the new property actually does so the
> > > > > > > > > uapi needs to be documented properly. And down the line some
> > > > > > > > > new driver might also implement the wrong behaviour if there
> > > > > > > > > is no clear specification.
> > > > > > > > 
> > > > > > > > Ack
> > > > > > > > 
> > > > > > > > > So I'm thinking (or perhaps hoping) the rule might be something like:
> > > > > > > > > - YCbCr limited range 
> > > > > > > > > - RGB full range if "Broadcast RGB" property is not present
> > > > > > > > 
> > > > > > > > Isn't it much more complicated than that for HDMI though? My
> > > > > > > > recollection was that any VIC but VIC1 would be limited range, and
> > > > > > > > anything else full range?
> > > > > > > 
> > > > > > > Do we have some driver that implements the CTA-861 CE vs. IT mode
> > > > > > > logic but doesn't expose the "Broadcast RGB" property? I was hoping
> > > > > > > those would always go hand in hand now.
> > > > > > 
> > > > > > I'm not sure. i915 and the HDMI state helpers handle it properly (I
> > > > > > think?) but it looks like only vc4 registers the Broadcast RGB property
> > > > > > and uses the HDMI state helpers.
> > > > > > 
> > > > > > And it looks like amdgpu registers Broadcast RGB but doesn't use
> > > > > > drm_default_rgb_quant_range() which seems suspicious?
> > > > > 
> > > > > If they want just manual full vs. limited then they should
> > > > > limit the property to not expose the "auto" option at all.
> > > > > 
> > > > > amdgpu also ties this in with the "colorspace" property, which
> > > > > originally in i915 only controlled the infoframes/etc. But on
> > > > > amdgpu it now controls various aspects of output color
> > > > > transformation. The end result is that the property is a complete
> > > > > mess with most of the values making no sense. And for whatever
> > > > > reason everyone involved refused to remove/deprecate the
> > > > > nonsensical values :/
> > > > > 
> > > > > Looks like this series should make sure the documentation for
> > > > > the "colorspace" property is in sync with the new property
> > > > > as well. Currently now it's giving conflicting information.
> > > > > 
> > > > 
> > > > I take it the problematic information is in
> > > > 
> > > >     * DOC: standard connector properties
> > > >     *
> > > >     * Colorspace:
> > > > 
> > > > and probably specifically BT2020_YCC's (and BT2020_RGB's?) insistence
> > > > that they "produce RGB content".
> > > > 
> > > > I think we probably just have to change the statement "The variants
> > > > BT2020_RGB and BT2020_YCC are equivalent and the driver chooses between
> > > > RGB and YCbCr on its own."
> > > > 
> > > > The "on its own" here would get turned into "based on the color format
> > > > property".
> > > > 
> > > > Speaking of i915, that patch is one of the very few (5) patches in
> > > > this series still lacking a review (hint hint nudge nudge). I'd like
> > > > to get some more feedback on the remaining patches before I send out
> > > > another revision, so that it's hopefully not just docs changes (I
> > > > know better than to think those patches must be perfect and won't
> > > > need revision.)
> > > 
> > > The i915 code around this is already a big mess, and I don't really
> > > adding to that mess. So I think we'll need to do some refactoring before
> > > we add anything there. I already started typing something and so far
> > > it looks fairly straightforward, so I should have something soon.
> > 
> > OK, posted something
> > https://lore.kernel.org/intel-gfx/20260330235339.29479-1-ville.syrjala@linux.intel.com/T/#m7c349478ca6c856fbc68d5e2178f1aa31678a05f
> 
> Thanks! I'll take a look at this today to get a more solid idea of
> where the pain points you highlighted are.
> 
> I'll also rebase/reimplement my i915 color format implementation
> (sans the DP-MST part, as discussed) on top of this on the next
> revision. I was never fully happy with the current one due to the
> logic being shoehorned into the already existing i915 fallback
> format logic, so I'm quite happy to have another opportunity to
> implement it with less historic baggage.
> 
> > Are the wayland/compositor/color management folks on board with
> > these new properties? I don't think I see the usual suspects on
> > the cc list.
> 
> I don't know which precise group of people you refer to,

Off the top of my head, Pekka,Simon,Sebastian,Jonas,Xaver might be
relevant here. Added to Cc...

> but at
> least from the Collabora side of things, the userspace Wayland
> people are on board with these new properties. In Weston, we use
> it to implement the Weston frontend's "color-format" option in a
> WIP branch at
> 
> https://gitlab.freedesktop.org/wayland/weston/-/merge_requests/1859
> 
> I've also been made aware that LibreELEC is aware, and will look
> into making use of it rather than their own kernel patches.
> 
> Kind regards,
> Nicolas Frattaroli
> 
> > > 
> > > While doing that several questions came to my mind though:
> > > 
> > > * More interactions with the colorspace property, but I sent
> > >   a separate mail already about that
> > > 
> > > * Which conversion matrix to use, and the answer I suspect
> > >   should be "ask the colorspace property", as mentioned in the
> > >   other mail
> > > 
> > > * Should we flat out reject color formats (and I suppose also
> > >   colorspace prop values) the sink doesn't claim to support?
> > > 
> > >   If yes, then I think we'll have to forget about adding anything 
> > >   to i915 MST code. The way the MST stuff works is that if one
> > >   stream needs a modeset then all the related streams get modeset
> > >   as well. Thus if the user replaces a monitor getting fed with a
> > >   YCbCr stream just as another stream is being modeset, then the
> > >   entire atomic commit could fail due to the YCbCr stream getting
> > >   rejected.
> > > 
> > >   I think eventually we might have to invent some mechanism where
> > >   all the input into the modeset computation is cached somehow,
> > >   and said cache updated only on explicit userspace modesets.
> > >   Either that or we have to come up  with a way to skip some of
> > >   the calculations that depend on external factors. Either way
> > >   it's going to be a pain.
> > > 
> > >   OTOH if we don't mind feeding the sink with stuff it can't
> > >   understand, then I suppose we might add YCbCr 4:4:4 support
> > >   for MST. It shouldn't be any different from RGB apart from
> > >   the RGB->YCbCr conversion, which is handled elsewhere. But
> > >   YCbCr 4:2:0 is definitely out either way, the MST code has
> > >   no support for that currently.
> > > 
> > 
> > 
> 
> 
> 

-- 
Ville Syrjälä
Intel

^ permalink raw reply

* Re: [PATCH net-next v2 1/3] dpll: add frequency monitoring to netlink spec
From: Vadim Fedorenko @ 2026-03-31 11:43 UTC (permalink / raw)
  To: Ivan Vecera, netdev
  Cc: Arkadiusz Kubalewski, Jiri Pirko, Jonathan Corbet, Shuah Khan,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Donald Hunter, Prathosh Satish, Petr Oros,
	linux-doc, linux-kernel
In-Reply-To: <20260330105505.715099-2-ivecera@redhat.com>

On 30/03/2026 11:55, Ivan Vecera wrote:
> Add DPLL_A_FREQUENCY_MONITOR device attribute to allow control over
> the frequency monitor feature. The attribute uses the existing
> dpll_feature_state enum (enable/disable) and is present in both
> device-get reply and device-set request.
> 
> Add DPLL_A_PIN_MEASURED_FREQUENCY pin attribute to expose the measured
> input frequency in Hz. The attribute is present in the pin-get reply.
> 
> Signed-off-by: Ivan Vecera <ivecera@redhat.com>

Reviewed-by: Vadim Fedorenko <vadim.fedorenko@linux.dev>

^ permalink raw reply

* [PATCH v2 3/3] Documentation: document panic_on_unrecoverable_memory_failure sysctl
From: Breno Leitao @ 2026-03-31 11:00 UTC (permalink / raw)
  To: Miaohe Lin, Naoya Horiguchi, Andrew Morton, Jonathan Corbet,
	Shuah Khan
  Cc: linux-mm, linux-kernel, linux-doc, Breno Leitao, kernel-team
In-Reply-To: <20260331-ecc_panic-v2-0-9e40d0f64f7a@debian.org>

Document the new vm.panic_on_unrecoverable_memory_failure sysctl in the
admin guide, following the same format as panic_on_unrecovered_nmi.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 Documentation/admin-guide/sysctl/vm.rst | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst
index 97e12359775c..a811f503bca6 100644
--- a/Documentation/admin-guide/sysctl/vm.rst
+++ b/Documentation/admin-guide/sysctl/vm.rst
@@ -67,6 +67,7 @@ Currently, these files are in /proc/sys/vm:
 - page-cluster
 - page_lock_unfairness
 - panic_on_oom
+- panic_on_unrecoverable_memory_failure
 - percpu_pagelist_high_fraction
 - stat_interval
 - stat_refresh
@@ -925,6 +926,32 @@ panic_on_oom=2+kdump gives you very strong tool to investigate
 why oom happens. You can get snapshot.
 
 
+panic_on_unrecoverable_memory_failure
+======================================
+
+When a hardware memory error (e.g. multi-bit ECC) hits an in-use kernel
+page that cannot be recovered by the memory failure handler, the default
+behaviour is to ignore the error and continue operation.  This is
+dangerous because the corrupted data remains accessible to the kernel,
+risking silent data corruption or a delayed crash when the poisoned
+memory is next accessed.
+
+Pages that reach this path include slab objects (dentry cache, inode
+cache, etc.), page tables, kernel stacks, and other kernel allocations
+that lack the reverse mapping needed to isolate all references.
+
+For many environments it is preferable to panic immediately with a clean
+crash dump that captures the original error context, rather than to
+continue and face a random crash later whose cause is difficult to
+diagnose.
+
+= =====================================================================
+0 Try to continue operation (default).
+1 Panic immediately.  If the ``panic`` sysctl is also non-zero then the
+  machine will be rebooted.
+= =====================================================================
+
+
 percpu_pagelist_high_fraction
 =============================
 

-- 
2.52.0


^ permalink raw reply related

* [PATCH v2 2/3] mm/memory-failure: add panic_on_unrecoverable_memory_failure sysctl
From: Breno Leitao @ 2026-03-31 11:00 UTC (permalink / raw)
  To: Miaohe Lin, Naoya Horiguchi, Andrew Morton, Jonathan Corbet,
	Shuah Khan
  Cc: linux-mm, linux-kernel, linux-doc, Breno Leitao, kernel-team
In-Reply-To: <20260331-ecc_panic-v2-0-9e40d0f64f7a@debian.org>

Add a sysctl that allows the system to panic when an unrecoverable
memory failure is detected. This covers kernel pages, high-order
kernel pages, and unknown page types that cannot be recovered.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 mm/memory-failure.c | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6ff80e01b91a..d0d911c54ff1 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -74,6 +74,8 @@ static int sysctl_memory_failure_recovery __read_mostly = 1;
 
 static int sysctl_enable_soft_offline __read_mostly = 1;
 
+static int sysctl_panic_on_unrecoverable_mf __read_mostly;
+
 atomic_long_t num_poisoned_pages __read_mostly = ATOMIC_LONG_INIT(0);
 
 static bool hw_memory_failure __read_mostly = false;
@@ -155,6 +157,15 @@ static const struct ctl_table memory_failure_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
+	},
+	{
+		.procname	= "panic_on_unrecoverable_memory_failure",
+		.data		= &sysctl_panic_on_unrecoverable_mf,
+		.maxlen		= sizeof(sysctl_panic_on_unrecoverable_mf),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
 	}
 };
 
@@ -1298,6 +1309,11 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
 	pr_err("%#lx: recovery action for %s: %s\n",
 		pfn, action_page_types[type], action_name[result]);
 
+	if (sysctl_panic_on_unrecoverable_mf && result == MF_IGNORED &&
+	    (type == MF_MSG_KERNEL || type == MF_MSG_KERNEL_HIGH_ORDER ||
+	     type == MF_MSG_UNKNOWN))
+		panic("Memory failure: %#lx: unrecoverable page", pfn);
+
 	return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
 }
 

-- 
2.52.0


^ permalink raw reply related

* [PATCH v2 0/3] mm/memory-failure: add panic option for unrecoverable pages
From: Breno Leitao @ 2026-03-31 11:00 UTC (permalink / raw)
  To: Miaohe Lin, Naoya Horiguchi, Andrew Morton, Jonathan Corbet,
	Shuah Khan
  Cc: linux-mm, linux-kernel, linux-doc, Breno Leitao, kernel-team

When the memory failure handler encounters an in-use kernel page that it
cannot recover (slab, page tables, kernel stacks, vmalloc, etc.), it
currently logs the error as "Ignored" and continues operation.

This leaves corrupted data accessible to the kernel, which will inevitably
cause either silent data corruption or a delayed crash when the poisoned memory
is next accessed.

This is a common problem on large fleets. We frequently observe multi-bit ECC
errors hitting kernel slab pages, where memory_failure() fails to recover them
and the system crashes later at an unrelated code path, making root cause
analysis unnecessarily difficult.

Here is one specific example from production on an arm64 server: a multi-bit
ECC error hit a dentry cache slab page, memory_failure() failed to recover it
(slab pages are not supported by the hwpoison recovery mechanism), and 67
seconds later d_lookup() accessed the poisoned cache line causing a synchronous
external abort:

    [88690.479680] [Hardware Error]: error_type: 3, multi-bit ECC
    [88690.498473] Memory failure: 0x40272d: unhandlable page.
    [88690.498619] Memory failure: 0x40272d: recovery action for
                   get hwpoison page: Ignored
    ...
    [88757.847126] Internal error: synchronous external abort:
                   0000000096000410 [#1] SMP
    [88758.061075] pc : d_lookup+0x5c/0x220

This series adds a new sysctl vm.panic_on_unrecoverable_memory_failure
(default 0) that, when enabled, panics immediately on unrecoverable
memory failures. This provides a clean crash dump at the time of the
error, which is far more useful for diagnosis than a random crash later
at an unrelated code path.

This also categorizes reserved pages as MF_MSG_KERNEL, and panics on
unknown page types (MF_MSG_UNKNOWN), so all unrecoverable failure cases
are covered.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
Changes in v2:
- Panic on MF_MSG_KERNEL, MF_MSG_KERNEL_HIGH_ORDER and MF_MSG_UNKNOWN
  instead of MF_MSG_GET_HWPOISON.
- Report MF_MSG_KERNEL for reserved pages when get_hwpoison_page() fails
  instead of MF_MSG_GET_HWPOISON.
- Link to v1: https://patch.msgid.link/20260323-ecc_panic-v1-0-72a1921726c5@debian.org

---
Breno Leitao (3):
      mm/memory-failure: report MF_MSG_KERNEL for reserved pages
      mm/memory-failure: add panic_on_unrecoverable_memory_failure sysctl
      Documentation: document panic_on_unrecoverable_memory_failure sysctl

 Documentation/admin-guide/sysctl/vm.rst | 27 +++++++++++++++++++++++++++
 mm/memory-failure.c                     | 22 +++++++++++++++++++++-
 2 files changed, 48 insertions(+), 1 deletion(-)
---
base-commit: c369299895a591d96745d6492d4888259b004a9e
change-id: 20260323-ecc_panic-4e473b83087c

Best regards,
--  
Breno Leitao <leitao@debian.org>


^ permalink raw reply

* [PATCH v2 1/3] mm/memory-failure: report MF_MSG_KERNEL for reserved pages
From: Breno Leitao @ 2026-03-31 11:00 UTC (permalink / raw)
  To: Miaohe Lin, Naoya Horiguchi, Andrew Morton, Jonathan Corbet,
	Shuah Khan
  Cc: linux-mm, linux-kernel, linux-doc, Breno Leitao, kernel-team
In-Reply-To: <20260331-ecc_panic-v2-0-9e40d0f64f7a@debian.org>

When get_hwpoison_page() returns a negative value, distinguish
reserved pages from other failure cases by reporting MF_MSG_KERNEL
instead of MF_MSG_GET_HWPOISON. Reserved pages belong to the kernel
and should be classified accordingly for proper handling by the
panic_on_unrecoverable_memory_failure mechanism.

Signed-off-by: Breno Leitao <leitao@debian.org>
---
 mm/memory-failure.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index ee42d4361309..6ff80e01b91a 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -2432,7 +2432,11 @@ int memory_failure(unsigned long pfn, int flags)
 		}
 		goto unlock_mutex;
 	} else if (res < 0) {
-		res = action_result(pfn, MF_MSG_GET_HWPOISON, MF_IGNORED);
+		if (PageReserved(p))
+			res = action_result(pfn, MF_MSG_KERNEL, MF_IGNORED);
+		else
+			res = action_result(pfn, MF_MSG_GET_HWPOISON,
+					    MF_IGNORED);
 		goto unlock_mutex;
 	}
 

-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH v1 6/6] docs: misc: amd-sbi: Document SBTSI userspace interface
From: Gupta, Akshay @ 2026-03-31 10:46 UTC (permalink / raw)
  To: Greg KH
  Cc: linux-kernel, corbet, skhan, linux, arnd, Prathima.Lk,
	naveenkrishna.chatradhi, Anand.Umarji, linux-doc, linux-hwmon,
	kunyi
In-Reply-To: <2026032753-contently-overfeed-5872@gregkh>


On 3/27/2026 12:55 PM, Greg KH wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> On Mon, Mar 23, 2026 at 04:38:11PM +0530, Akshay Gupta wrote:
>> From: Prathima <Prathima.Lk@amd.com>
>>
>> - Document AMD sideband IOCTL description defined
>>    for SBTSI and its usage.
>>    User space C-APIs are made available by esmi_oob_library [1],
>>    which is provided by the E-SMS project [2].
>>
>>    Link: https://github.com/amd/esmi_oob_library [1]
>>    Link: https://www.amd.com/en/developer/e-sms.html [2]
> Ok, nevermind, here's the documentation :)
>
> But it's very tiny, it's not saying what the api actually is.
>
> thanks,
>
> greg k-h

Hi Greg,

Thank you for the feedback, will update the doc in next version with 
details.


^ permalink raw reply

* Re: [PATCH v1 1/6] hwmon/misc: amd-sbi: Move core SBTSI support from hwmon to misc
From: Gupta, Akshay @ 2026-03-31 10:46 UTC (permalink / raw)
  To: gregkh@linuxfoundation.org, Guenter Roeck
  Cc: linux-kernel@vger.kernel.org, corbet@lwn.net,
	skhan@linuxfoundation.org, arnd@arndb.de, L k, Prathima,
	Chatradhi, Naveen Krishna, Umarji, Anand,
	linux-doc@vger.kernel.org, linux-hwmon@vger.kernel.org,
	kunyi@google.com
In-Reply-To: <2026032744-humble-festival-0943@gregkh>


On 3/27/2026 12:53 PM, gregkh@linuxfoundation.org wrote:
> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>
>
> On Thu, Mar 26, 2026 at 10:52:29PM -0700, Guenter Roeck wrote:
>> On 3/26/26 22:07, Gupta, Akshay wrote:
>>> On 3/24/2026 5:03 PM, Guenter Roeck wrote:
>>>> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>>>>
>>>>
>>>> On 3/24/26 03:36, Gupta, Akshay wrote:
>>>>> On 3/23/2026 7:45 PM, Guenter Roeck wrote:
>>>>>> Caution: This message originated from an External Source. Use proper caution when opening attachments, clicking links, or responding.
>>>>>>
>>>>>>
>>>>>> On 3/23/26 04:08, Akshay Gupta wrote:
>>>>>>> From: Prathima <Prathima.Lk@amd.com>
>>>>>>>
>>>>>>> Move SBTSI core functionality out of the hwmon-only path and into
>>>>>>> drivers/misc/amd-sbi so it can be reused by non-hwmon consumers.
>>>>>>>
>>>>>>> This split prepares the driver for additional interfaces while keeping
>>>>>>> hwmon support as an optional layer on top of common SBTSI core logic.
>>>>>>>
>>>>>> This moves the driver out of hwmon space into misc/amd-sbi which,
>>>>>> in my opinion, is completely unnecessary to accomplish the stated goals.
>>>>>>
>>>>>> I assume this is to be able to make changes which do not follow
>>>>>> the hwmon ABI and/or to bypass hwmon subsystem review, similar
>>>>>> to what has been done by others.
>>>>>>
>>>>>> Obviously, I think this is a bad idea. I won't give it a NACK,
>>>>>> but I won't approve (nor review) it either.
>>>>>>
>>>>>> Guenter
>>>>> Hi Guenter,
>>>>>
>>>>> Thank you for your quick response.
>>>>>
>>>>> At present, TSI supports a range of functionalities that cannot be exposed through hwmon. Additionally, a new protocol leveraging the TSI endpoint in hardware has been introduced, which, to our understanding, cannot be accommodated within the hwmon subsystem.
>>>>>
>>>>> Since we already support the RMI interface via misc/amd-sbi, we believe this remains the appropriate place to continue AMD's out-of-band support.
>>>>>
>>>>> I will update the commit message and cover letter to clearly articulate the rationale behind this change.
>>>>>
>>>>> Thank you
>>>>>
>>>> That is neither a reason or an argument for moving _hwmon_ part of the code
>>>> out of the hwmon subsystem.
>>> Following feedback from the Greg and MFD subsystem maintainers, we introduced an sb-rmi driver under misc/ that calls devm_hwmon_device_register_with_info(). We are considering the same approach for the sb-tsi driver. Would you recommend a more suitable alternative?
>> I would have suggested to use an auxiliary driver, similar to PECI,
>> but who am I to argue if senior maintainers suggest otherwise.
> Sounds like an aux driver makes sense to me too, I don't remember saying
> that you HAD to call devm_hwmon_device_register_with_info(), where was
> that stated in previous reviews?
>
> thanks,
>
> greg k-h

Thank you Greg and Guenter for the feedback. I will explore auxiliary 
driver and submit the changes as part of next version.

This way hwmon sensor will remain in hwmon subsytem and new 
implementation can be part of misc.

Following the discussions at 
https://lore.kernel.org/all/01c95139-dfeb-6983-77d2-4382ffb50896@amd.com/, 
https://lore.kernel.org/all/8aa17f38-a6f6-4ba9-b38c-767ed39c1c92@roeck-us.net/, 
we plan to adopt the same approach for the TSI driver.


^ permalink raw reply

* Re: [PATCH 1/1] leds: Introduce the multi_max_intensity sysfs attribute
From: Lee Jones @ 2026-03-31 10:38 UTC (permalink / raw)
  To: Armin Wolf
  Cc: pavel, linux-kernel, corbet, skhan, linux-leds, linux-doc, wse,
	jacek.anaszewski, pobrn, m.tretter
In-Reply-To: <20260324202751.6486-2-W_Armin@gmx.de>

On Tue, 24 Mar 2026, Armin Wolf wrote:

> Some multicolor LEDs support global brightness control in hardware,
> meaning that the maximum intensity of the color components is not
> connected to the maximum global brightness. Such LEDs cannot be
> described properly by the current multicolor LED class interface,
> because it assumes that the maximum intensity of each color component
> is described by the maximum global brightness of the LED.
> 
> Fix this by introducing a new sysfs attribute called
> "multi_max_intensity" holding the maximum intensity values for the
> color components of a multicolor LED class device. Drivers can use
> the new max_intensity field inside struct mc_subled to tell the
> multicolor LED class code about those values. Intensity values written
> by userspace applications will be limited to this maximum value.
> 
> Drivers for multicolor LEDs that do not support global brightness
> control in hardware might still want to use the maximum global LED
> brightness supplied via devicetree as the maximum intensity of each
> individual color component. Such drivers should set max_intensity
> to 0 so that the multicolor LED core can act accordingly.
> 
> The lp50xx and ncp5623 LED drivers already use hardware-based control
> for the global LED brightness. Modify those drivers to correctly
> initalize .max_intensity to avoid being limited to the maximum global
> brightness supplied via devicetree.
> 
> Signed-off-by: Armin Wolf <W_Armin@gmx.de>
> ---
>  .../ABI/testing/sysfs-class-led-multicolor    | 19 ++++++--
>  Documentation/leds/leds-class-multicolor.rst  | 21 ++++++++-
>  drivers/leds/led-class-multicolor.c           | 47 ++++++++++++++++++-
>  drivers/leds/leds-lp50xx.c                    |  1 +
>  drivers/leds/rgb/leds-ncp5623.c               |  4 +-
>  include/linux/led-class-multicolor.h          | 30 +++++++++++-
>  6 files changed, 113 insertions(+), 9 deletions(-)
> 
> diff --git a/Documentation/ABI/testing/sysfs-class-led-multicolor b/Documentation/ABI/testing/sysfs-class-led-multicolor
> index 16fc827b10cb..197da3e775b4 100644
> --- a/Documentation/ABI/testing/sysfs-class-led-multicolor
> +++ b/Documentation/ABI/testing/sysfs-class-led-multicolor
> @@ -16,9 +16,22 @@ Date:		March 2020
>  KernelVersion:	5.9
>  Contact:	Dan Murphy <dmurphy@ti.com>
>  Description:	read/write
> -		This file contains array of integers. Order of components is
> -		described by the multi_index array. The maximum intensity should
> -		not exceed /sys/class/leds/<led>/max_brightness.
> +		This file contains an array of integers. The order of components
> +		is described by the multi_index array. The maximum intensity value
> +		supported by each color component is described by the multi_max_intensity
> +		file. Writing intensity values larger than the maximum value of a
> +		given color component will result in those values being clamped.
> +
> +		For additional details please refer to
> +		Documentation/leds/leds-class-multicolor.rst.
> +
> +What:		/sys/class/leds/<led>/multi_max_intensity
> +Date:		March 2026
> +KernelVersion:	7.1
> +Contact:	Armin Wolf <W_Armin@gmx.de>
> +Description:	read
> +		This file contains an array of integers describing the maximum
> +		intensity value for each intensity component.
>  		For additional details please refer to
>  		Documentation/leds/leds-class-multicolor.rst.
> diff --git a/Documentation/leds/leds-class-multicolor.rst b/Documentation/leds/leds-class-multicolor.rst
> index c6b47b4093c4..8f42f10078ad 100644
> --- a/Documentation/leds/leds-class-multicolor.rst
> +++ b/Documentation/leds/leds-class-multicolor.rst
> @@ -25,10 +25,14 @@ color name to indexed value.
>  The ``multi_index`` file is an array that contains the string list of the colors as
>  they are defined in each ``multi_*`` array file.
>  
> -The ``multi_intensity`` is an array that can be read or written to for the
> +The ``multi_intensity`` file is an array that can be read or written to for the
>  individual color intensities.  All elements within this array must be written in
>  order for the color LED intensities to be updated.
>  
> +The ``multi_max_intensity`` file is an array that contains the maximum intensity
> +value supported by each color intensity. Intensity values above this will be
> +automatically clamped into the supported range.
> +
>  Directory Layout Example
>  ========================
>  .. code-block:: console
> @@ -38,6 +42,7 @@ Directory Layout Example
>      -r--r--r--    1 root     root          4096 Oct 19 16:16 max_brightness
>      -r--r--r--    1 root     root          4096 Oct 19 16:16 multi_index
>      -rw-r--r--    1 root     root          4096 Oct 19 16:16 multi_intensity
> +    -r--r--r--    1 root     root          4096 OCt 19 16:16 multi_max_intensity

Nit: Oct

>  
>  ..
>  
> @@ -104,3 +109,17 @@ the color LED group.
>      128
>  
>  ..
> +
> +Writing intensity values larger than the maximum specified in ``multi_max_intensity``
> +will result in those values being clamped into the supported range.
> +
> +.. code-block:: console
> +
> +   # cat /sys/class/leds/multicolor:status/multi_max_intensity
> +   255 255 255
> +
> +   # echo 512 512 512 > /sys/class/leds/multicolor:status/multi_intensity
> +   # cat /sys/class/leds/multicolor:status/multi_intensity
> +   255 255 255
> +
> +..
> diff --git a/drivers/leds/led-class-multicolor.c b/drivers/leds/led-class-multicolor.c
> index 6b671f3f9c61..13a35e6a28df 100644
> --- a/drivers/leds/led-class-multicolor.c
> +++ b/drivers/leds/led-class-multicolor.c
> @@ -7,10 +7,28 @@
>  #include <linux/init.h>
>  #include <linux/led-class-multicolor.h>
>  #include <linux/math.h>
> +#include <linux/minmax.h>
>  #include <linux/module.h>
>  #include <linux/slab.h>
>  #include <linux/uaccess.h>
>  
> +static unsigned int led_mc_get_max_intensity(struct led_classdev_mc *mcled_cdev, size_t index)
> +{
> +	unsigned int max_intensity;
> +
> +	/* The maximum global brightness value might still be changed by
> +	 * led_classdev_register_ext() using devicetree properties. This
> +	 * prevents us from changing subled_info[X].max_intensity when
> +	 * registering a multicolor LED class device, so we have to do
> +	 * this during runtime.
> +	 */
> +	max_intensity = mcled_cdev->subled_info[index].max_intensity;
> +	if (max_intensity)
> +		return max_intensity;
> +
> +	return mcled_cdev->led_cdev.max_brightness;
> +}
> +
>  int led_mc_calc_color_components(struct led_classdev_mc *mcled_cdev,
>  				 enum led_brightness brightness)
>  {
> @@ -27,6 +45,27 @@ int led_mc_calc_color_components(struct led_classdev_mc *mcled_cdev,
>  }
>  EXPORT_SYMBOL_GPL(led_mc_calc_color_components);
>  
> +static ssize_t multi_max_intensity_show(struct device *dev,
> +					struct device_attribute *intensity_attr, char *buf)
> +{
> +	struct led_classdev *led_cdev = dev_get_drvdata(dev);
> +	struct led_classdev_mc *mcled_cdev = lcdev_to_mccdev(led_cdev);
> +	unsigned int max_intensity;
> +	int len = 0;
> +	int i;
> +
> +	for (i = 0; i < mcled_cdev->num_colors; i++) {
> +		max_intensity = led_mc_get_max_intensity(mcled_cdev, i);
> +		len += sprintf(buf + len, "%u", max_intensity);
> +		if (i < mcled_cdev->num_colors - 1)
> +			len += sprintf(buf + len, " ");
> +	}

This should be 'sysfs_emit_at()'.

> +
> +	buf[len++] = '\n';
> +	return len;
> +}
> +static DEVICE_ATTR_RO(multi_max_intensity);
> +
>  static ssize_t multi_intensity_store(struct device *dev,
>  				struct device_attribute *intensity_attr,
>  				const char *buf, size_t size)
> @@ -35,6 +74,7 @@ static ssize_t multi_intensity_store(struct device *dev,
>  	struct led_classdev_mc *mcled_cdev = lcdev_to_mccdev(led_cdev);
>  	int nrchars, offset = 0;
>  	unsigned int intensity_value[LED_COLOR_ID_MAX];
> +	unsigned int max_intensity;
>  	int i;
>  	ssize_t ret;
>  
> @@ -56,8 +96,10 @@ static ssize_t multi_intensity_store(struct device *dev,
>  		goto err_out;
>  	}
>  
> -	for (i = 0; i < mcled_cdev->num_colors; i++)
> -		mcled_cdev->subled_info[i].intensity = intensity_value[i];
> +	for (i = 0; i < mcled_cdev->num_colors; i++) {
> +		max_intensity = led_mc_get_max_intensity(mcled_cdev, i);
> +		mcled_cdev->subled_info[i].intensity = min(intensity_value[i], max_intensity);
> +	}
>  
>  	if (!test_bit(LED_BLINK_SW, &led_cdev->work_flags))
>  		led_set_brightness(led_cdev, led_cdev->brightness);
> @@ -111,6 +153,7 @@ static ssize_t multi_index_show(struct device *dev,
>  static DEVICE_ATTR_RO(multi_index);
>  
>  static struct attribute *led_multicolor_attrs[] = {
> +	&dev_attr_multi_max_intensity.attr,
>  	&dev_attr_multi_intensity.attr,
>  	&dev_attr_multi_index.attr,
>  	NULL,
> diff --git a/drivers/leds/leds-lp50xx.c b/drivers/leds/leds-lp50xx.c
> index e2a9c8592953..69c3550f1a31 100644
> --- a/drivers/leds/leds-lp50xx.c
> +++ b/drivers/leds/leds-lp50xx.c
> @@ -525,6 +525,7 @@ static int lp50xx_probe_dt(struct lp50xx *priv)
>  			}
>  
>  			mc_led_info[multi_index].color_index = color_id;
> +			mc_led_info[multi_index].max_intensity = 255;
>  			num_colors++;
>  		}
>  
> diff --git a/drivers/leds/rgb/leds-ncp5623.c b/drivers/leds/rgb/leds-ncp5623.c
> index 85d6be6fff2b..f2528f06507d 100644
> --- a/drivers/leds/rgb/leds-ncp5623.c
> +++ b/drivers/leds/rgb/leds-ncp5623.c
> @@ -56,8 +56,7 @@ static int ncp5623_brightness_set(struct led_classdev *cdev,
>  	for (int i = 0; i < mc_cdev->num_colors; i++) {
>  		ret = ncp5623_write(ncp->client,
>  				    NCP5623_PWM_REG(mc_cdev->subled_info[i].channel),
> -				    min(mc_cdev->subled_info[i].intensity,
> -					NCP5623_MAX_BRIGHTNESS));
> +				    mc_cdev->subled_info[i].intensity);
>  		if (ret)
>  			return ret;
>  	}
> @@ -190,6 +189,7 @@ static int ncp5623_probe(struct i2c_client *client)
>  			goto release_led_node;
>  
>  		subled_info[ncp->mc_dev.num_colors].channel = reg;
> +		subled_info[ncp->mc_dev.num_colors].max_intensity = NCP5623_MAX_BRIGHTNESS;
>  		subled_info[ncp->mc_dev.num_colors++].color_index = color_index;
>  	}
>  
> diff --git a/include/linux/led-class-multicolor.h b/include/linux/led-class-multicolor.h
> index db9f34c6736e..26f6d20b887d 100644
> --- a/include/linux/led-class-multicolor.h
> +++ b/include/linux/led-class-multicolor.h
> @@ -9,10 +9,31 @@
>  #include <linux/leds.h>
>  #include <dt-bindings/leds/common.h>
>  
> +/**
> + * struct mc_subled - Color component description.
> + * @color_index: Color ID.
> + * @brightness: Scaled intensity.
> + * @intensity: Current intensity.
> + * @max_intensity: Maximum supported intensity value.
> + * @channel: Channel index.
> + *
> + * Describes a color component of a multicolor LED. Many multicolor LEDs
> + * do no support gobal brightness control in hardware, so they use
> + * the brightness field in connection with led_mc_calc_color_components()
> + * to perform the intensity scaling in software.
> + * Such drivers should set max_intensity to 0 to signal the multicolor LED core
> + * that the maximum global brightness of the LED class device should be used for
> + * limiting incoming intensity values.
> + *
> + * Multicolor LEDs that do support global brightness control in hardware
> + * should instead set max_intensity to the maximum intensity value supported
> + * by the hardware for a given color component.
> + */
>  struct mc_subled {
>  	unsigned int color_index;
>  	unsigned int brightness;
>  	unsigned int intensity;
> +	unsigned int max_intensity;
>  	unsigned int channel;
>  };
>  
> @@ -53,7 +74,14 @@ int led_classdev_multicolor_register_ext(struct device *parent,
>   */
>  void led_classdev_multicolor_unregister(struct led_classdev_mc *mcled_cdev);
>  
> -/* Calculate brightness for the monochrome LED cluster */
> +/**
> + * led_mc_calc_color_components() - Calculates component brightness values of a LED cluster.
> + * @mcled_cdev - Multicolor LED class device of the LED cluster.
> + * @led_brightness - Global brightness of the LED cluster.

The header comment does not match the parameters.

Make sure you compile with W=1 to catch kernel-doc issues.

> + * Calculates the brightness values for each color component of a monochrome LED cluster,
> + * see Documentation/leds/leds-class-multicolor.rst for details.
> + */
>  int led_mc_calc_color_components(struct led_classdev_mc *mcled_cdev,
>  				 enum led_brightness brightness);
>  
> -- 
> 2.39.5
> 
> 

-- 
Lee Jones [李琼斯]

^ permalink raw reply

* Re: [PATCH v11 03/22] drm: Add new general DRM property "color format"
From: Nicolas Frattaroli @ 2026-03-31 10:33 UTC (permalink / raw)
  To: Ville Syrjälä
  Cc: Maxime Ripard, Harry Wentland, Leo Li, Rodrigo Siqueira,
	Alex Deucher, Christian König, David Airlie, Simona Vetter,
	Maarten Lankhorst, Thomas Zimmermann, Andrzej Hajda,
	Neil Armstrong, Robert Foss, Laurent Pinchart, Jonas Karlman,
	Jernej Skrabec, Sandy Huang, Heiko Stübner, Andy Yan,
	Jani Nikula, Rodrigo Vivi, Joonas Lahtinen, Tvrtko Ursulin,
	Dmitry Baryshkov, Sascha Hauer, Rob Herring, Jonathan Corbet,
	Shuah Khan, kernel, amd-gfx, dri-devel, linux-kernel,
	linux-arm-kernel, linux-rockchip, intel-gfx, intel-xe, linux-doc,
	Werner Sembach, Andri Yngvason, Marius Vlad
In-Reply-To: <acsNoCDsPtEhtkRn@intel.com>

On Tuesday, 31 March 2026 01:56:16 Central European Summer Time Ville Syrjälä wrote:
> On Sat, Mar 28, 2026 at 02:49:04AM +0200, Ville Syrjälä wrote:
> > On Fri, Mar 27, 2026 at 01:56:06PM +0100, Nicolas Frattaroli wrote:
> > > On Thursday, 26 March 2026 18:58:25 Central European Standard Time Ville Syrjälä wrote:
> > > > On Thu, Mar 26, 2026 at 06:02:47PM +0100, Maxime Ripard wrote:
> > > > > On Wed, Mar 25, 2026 at 08:43:15PM +0200, Ville Syrjälä wrote:
> > > > > > On Wed, Mar 25, 2026 at 03:56:58PM +0100, Maxime Ripard wrote:
> > > > > > > On Wed, Mar 25, 2026 at 01:03:07PM +0200, Ville Syrjälä wrote:
> > > > > > > > On Wed, Mar 25, 2026 at 09:24:27AM +0100, Maxime Ripard wrote:
> > > > > > > > > On Tue, Mar 24, 2026 at 09:53:35PM +0200, Ville Syrjälä wrote:
> > > > > > > > > > On Tue, Mar 24, 2026 at 08:10:11PM +0100, Nicolas Frattaroli wrote:
> > > > > > > > > > > On Tuesday, 24 March 2026 18:00:45 Central European Standard Time Ville Syrjälä wrote:
> > > > > > > > > > > > On Tue, Mar 24, 2026 at 05:01:07PM +0100, Nicolas Frattaroli wrote:
> > > > > > > > > > > > > +enum drm_connector_color_format {
> > > > > > > > > > > > > +	/**
> > > > > > > > > > > > > +	 * @DRM_CONNECTOR_COLOR_FORMAT_AUTO: The driver or display protocol
> > > > > > > > > > > > > +	 * helpers should pick a suitable color format. All implementations of a
> > > > > > > > > > > > > +	 * specific display protocol must behave the same way with "AUTO", but
> > > > > > > > > > > > > +	 * different display protocols do not necessarily have the same "AUTO"
> > > > > > > > > > > > > +	 * semantics.
> > > > > > > > > > > > > +	 *
> > > > > > > > > > > > > +	 * For HDMI, "AUTO" picks RGB, but falls back to YCbCr 4:2:0 if the
> > > > > > > > > > > > > +	 * bandwidth required for full-scale RGB is not available, or the mode
> > > > > > > > > > > > > +	 * is YCbCr 4:2:0-only, as long as the mode and output both support
> > > > > > > > > > > > > +	 * YCbCr 4:2:0.
> > > > > > > > > > > > > +	 *
> > > > > > > > > > > > > +	 * For display protocols other than HDMI, the recursive bridge chain
> > > > > > > > > > > > > +	 * format selection picks the first chain of bridge formats that works,
> > > > > > > > > > > > > +	 * as has already been the case before the introduction of the "color
> > > > > > > > > > > > > +	 * format" property. Non-HDMI bridges should therefore either sort their
> > > > > > > > > > > > > +	 * bus output formats by preference, or agree on a unified auto format
> > > > > > > > > > > > > +	 * selection logic that's implemented in a common state helper (like
> > > > > > > > > > > > > +	 * how HDMI does it).
> > > > > > > > > > > > > +	 */
> > > > > > > > > > > > > +	DRM_CONNECTOR_COLOR_FORMAT_AUTO = 0,
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	/**
> > > > > > > > > > > > > +	 * @DRM_CONNECTOR_COLOR_FORMAT_RGB444: RGB output format
> > > > > > > > > > > > > +	 */
> > > > > > > > > > > > > +	DRM_CONNECTOR_COLOR_FORMAT_RGB444,
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	/**
> > > > > > > > > > > > > +	 * @DRM_CONNECTOR_COLOR_FORMAT_YCBCR444: YCbCr 4:4:4 output format (ie.
> > > > > > > > > > > > > +	 * not subsampled)
> > > > > > > > > > > > > +	 */
> > > > > > > > > > > > > +	DRM_CONNECTOR_COLOR_FORMAT_YCBCR444,
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	/**
> > > > > > > > > > > > > +	 * @DRM_CONNECTOR_COLOR_FORMAT_YCBCR422: YCbCr 4:2:2 output format (ie.
> > > > > > > > > > > > > +	 * with horizontal subsampling)
> > > > > > > > > > > > > +	 */
> > > > > > > > > > > > > +	DRM_CONNECTOR_COLOR_FORMAT_YCBCR422,
> > > > > > > > > > > > > +
> > > > > > > > > > > > > +	/**
> > > > > > > > > > > > > +	 * @DRM_CONNECTOR_COLOR_FORMAT_YCBCR420: YCbCr 4:2:0 output format (ie.
> > > > > > > > > > > > > +	 * with horizontal and vertical subsampling)
> > > > > > > > > > > > > +	 */
> > > > > > > > > > > > > +	DRM_CONNECTOR_COLOR_FORMAT_YCBCR420,
> > > > > > > > > > > > 
> > > > > > > > > > > > Seems like this should document what the quantization range
> > > > > > > > > > > > should be for each format.
> > > > > > > > > > > > 
> > > > > > > > > > > 
> > > > > > > > > > > I don't think so? If you want per-component bit depth values,
> > > > > > > > > > > DRM_FORMAT_* defines would be the appropriate values to use. This
> > > > > > > > > > > enum is more abstract than that, and is there to communicate
> > > > > > > > > > > YUV vs. RGB and chroma subsampling, with bit depth being handled
> > > > > > > > > > > by other properties.
> > > > > > > > > > > 
> > > > > > > > > > > If you mean the factor used for subsampling, then that'd only be
> > > > > > > > > > > relevant if YCBCR410 was supported where one chroma plane isn't
> > > > > > > > > > > halved but quartered in resolution. I suspect 4:1:0 will never
> > > > > > > > > > > be added; no digital display protocol standard supports it to my
> > > > > > > > > > > knowledge, and hopefully none ever will.
> > > > > > > > > > 
> > > > > > > > > > No, I mean the quantization range (16-235 vs. 0-255 etc).
> > > > > > > > > > 
> > > > > > > > > > The i915 behaviour is that YCbCr is always limited range,
> > > > > > > > > > RGB can either be full or limited range depending on the 
> > > > > > > > > > "Broadcast RGB" property and other related factors.
> > > > > > > > > 
> > > > > > > > > So far the HDMI state has both the format and quantization range as
> > > > > > > > > different fields. I'm not sure we need to document the range in the
> > > > > > > > > format field, maybe only mention it's not part of the format but has a
> > > > > > > > > field of its own?
> > > > > > > > 
> > > > > > > > I think we only have it for RGB (on some drivers only?). For YCbCr
> > > > > > > > I think the assumption is limited range everywhere.
> > > > > > > > 
> > > > > > > > But I'm not really concerned about documenting struct members.
> > > > > > > > What I'm talking about is the *uapi* docs. Surely userspace
> > > > > > > > will want to know what the new property actually does so the
> > > > > > > > uapi needs to be documented properly. And down the line some
> > > > > > > > new driver might also implement the wrong behaviour if there
> > > > > > > > is no clear specification.
> > > > > > > 
> > > > > > > Ack
> > > > > > > 
> > > > > > > > So I'm thinking (or perhaps hoping) the rule might be something like:
> > > > > > > > - YCbCr limited range 
> > > > > > > > - RGB full range if "Broadcast RGB" property is not present
> > > > > > > 
> > > > > > > Isn't it much more complicated than that for HDMI though? My
> > > > > > > recollection was that any VIC but VIC1 would be limited range, and
> > > > > > > anything else full range?
> > > > > > 
> > > > > > Do we have some driver that implements the CTA-861 CE vs. IT mode
> > > > > > logic but doesn't expose the "Broadcast RGB" property? I was hoping
> > > > > > those would always go hand in hand now.
> > > > > 
> > > > > I'm not sure. i915 and the HDMI state helpers handle it properly (I
> > > > > think?) but it looks like only vc4 registers the Broadcast RGB property
> > > > > and uses the HDMI state helpers.
> > > > > 
> > > > > And it looks like amdgpu registers Broadcast RGB but doesn't use
> > > > > drm_default_rgb_quant_range() which seems suspicious?
> > > > 
> > > > If they want just manual full vs. limited then they should
> > > > limit the property to not expose the "auto" option at all.
> > > > 
> > > > amdgpu also ties this in with the "colorspace" property, which
> > > > originally in i915 only controlled the infoframes/etc. But on
> > > > amdgpu it now controls various aspects of output color
> > > > transformation. The end result is that the property is a complete
> > > > mess with most of the values making no sense. And for whatever
> > > > reason everyone involved refused to remove/deprecate the
> > > > nonsensical values :/
> > > > 
> > > > Looks like this series should make sure the documentation for
> > > > the "colorspace" property is in sync with the new property
> > > > as well. Currently now it's giving conflicting information.
> > > > 
> > > 
> > > I take it the problematic information is in
> > > 
> > >     * DOC: standard connector properties
> > >     *
> > >     * Colorspace:
> > > 
> > > and probably specifically BT2020_YCC's (and BT2020_RGB's?) insistence
> > > that they "produce RGB content".
> > > 
> > > I think we probably just have to change the statement "The variants
> > > BT2020_RGB and BT2020_YCC are equivalent and the driver chooses between
> > > RGB and YCbCr on its own."
> > > 
> > > The "on its own" here would get turned into "based on the color format
> > > property".
> > > 
> > > Speaking of i915, that patch is one of the very few (5) patches in
> > > this series still lacking a review (hint hint nudge nudge). I'd like
> > > to get some more feedback on the remaining patches before I send out
> > > another revision, so that it's hopefully not just docs changes (I
> > > know better than to think those patches must be perfect and won't
> > > need revision.)
> > 
> > The i915 code around this is already a big mess, and I don't really
> > adding to that mess. So I think we'll need to do some refactoring before
> > we add anything there. I already started typing something and so far
> > it looks fairly straightforward, so I should have something soon.
> 
> OK, posted something
> https://lore.kernel.org/intel-gfx/20260330235339.29479-1-ville.syrjala@linux.intel.com/T/#m7c349478ca6c856fbc68d5e2178f1aa31678a05f

Thanks! I'll take a look at this today to get a more solid idea of
where the pain points you highlighted are.

I'll also rebase/reimplement my i915 color format implementation
(sans the DP-MST part, as discussed) on top of this on the next
revision. I was never fully happy with the current one due to the
logic being shoehorned into the already existing i915 fallback
format logic, so I'm quite happy to have another opportunity to
implement it with less historic baggage.

> Are the wayland/compositor/color management folks on board with
> these new properties? I don't think I see the usual suspects on
> the cc list.

I don't know which precise group of people you refer to, but at
least from the Collabora side of things, the userspace Wayland
people are on board with these new properties. In Weston, we use
it to implement the Weston frontend's "color-format" option in a
WIP branch at

https://gitlab.freedesktop.org/wayland/weston/-/merge_requests/1859

I've also been made aware that LibreELEC is aware, and will look
into making use of it rather than their own kernel patches.

Kind regards,
Nicolas Frattaroli

> > 
> > While doing that several questions came to my mind though:
> > 
> > * More interactions with the colorspace property, but I sent
> >   a separate mail already about that
> > 
> > * Which conversion matrix to use, and the answer I suspect
> >   should be "ask the colorspace property", as mentioned in the
> >   other mail
> > 
> > * Should we flat out reject color formats (and I suppose also
> >   colorspace prop values) the sink doesn't claim to support?
> > 
> >   If yes, then I think we'll have to forget about adding anything 
> >   to i915 MST code. The way the MST stuff works is that if one
> >   stream needs a modeset then all the related streams get modeset
> >   as well. Thus if the user replaces a monitor getting fed with a
> >   YCbCr stream just as another stream is being modeset, then the
> >   entire atomic commit could fail due to the YCbCr stream getting
> >   rejected.
> > 
> >   I think eventually we might have to invent some mechanism where
> >   all the input into the modeset computation is cached somehow,
> >   and said cache updated only on explicit userspace modesets.
> >   Either that or we have to come up  with a way to skip some of
> >   the calculations that depend on external factors. Either way
> >   it's going to be a pain.
> > 
> >   OTOH if we don't mind feeding the sink with stuff it can't
> >   understand, then I suppose we might add YCbCr 4:4:4 support
> >   for MST. It shouldn't be any different from RGB apart from
> >   the RGB->YCbCr conversion, which is handled elsewhere. But
> >   YCbCr 4:2:0 is definitely out either way, the MST code has
> >   no support for that currently.
> > 
> 
> 





^ permalink raw reply

* Re: [PATCH 1/2] mm/memory-failure: add panic_on_unrecoverable_memory_failure sysctl
From: Breno Leitao @ 2026-03-31 10:25 UTC (permalink / raw)
  To: Miaohe Lin
  Cc: linux-mm, linux-kernel, linux-doc, kernel-team, Naoya Horiguchi,
	Andrew Morton, Jonathan Corbet, Shuah Khan
In-Reply-To: <d8d2a5ad-9b8a-f0e2-3eb0-ee820eb7a148@huawei.com>

Hi Miaohe,

On Tue, Mar 31, 2026 at 10:27:33AM +0800, Miaohe Lin wrote:
> On 2026/3/30 21:45, Breno Leitao wrote:
> > On Mon, Mar 30, 2026 at 03:55:00PM +0800, Miaohe Lin wrote:
> >> On 2026/3/23 23:29, Breno Leitao wrote:
> >>
> >>> @@ -1298,6 +1309,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
> >>>  	pr_err("%#lx: recovery action for %s: %s\n",
> >>>  		pfn, action_page_types[type], action_name[result]);
> >>>
> >>> +	if (sysctl_panic_on_unrecoverable_mf &&
> >>> +	    type == MF_MSG_GET_HWPOISON && result == MF_IGNORED)
> >>> +		panic("Memory failure: %#lx: unrecoverable page", pfn);
> >>
> >> MF_MSG_GET_HWPOISON contains some other scenarios. For example, an isolated folio will
> >> make get_hwpoison_page return -EIO so we will see MF_MSG_GET_HWPOISON and MF_IGNORED in
> >> action_result. But that's recoverable if folio is used by userspace thus panic will be
> >> unacceptable.
> >> Will it better to check type against MF_MSG_KERNEL_HIGH_ORDER?
> >
> > Yes, I was discussing this with akpm, and maybe the better
> > approach would be to panic for types MF_MSG_KERNEL_HIGH_ORDER and MF_MSG_KERNEL.
> >
> > In both cases, it seems that, the page would not be able to migrate. What do
> > you think about a change like this:
> >
> >
> > @@ -1298,6 +1309,10 @@ static int action_result(unsigned long pfn, enum mf_action_page_type type,
> >         pr_err("%#lx: recovery action for %s: %s\n",
> >                 pfn, action_page_types[type], action_name[result]);
> >
> > +       if (sysctl_panic_on_unrecoverable_mf && result == MF_IGNORED &&
> > +           (type == MF_MSG_KERNEL || type == MF_MSG_KERNEL_HIGH_ORDER))
> > +               panic("Memory failure: %#lx: unrecoverable page", pfn);
> > +
> >         return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
> >  }
> >
>
> Maybe MF_MSG_UNKNOWN can also be considered? Kernel can't do anything further
> for those folios.

Agreed, I'll incorporate that change.

> BTW I think current code can't reach to MF_MSG_KERNEL and MF_MSG_UNKNOWN cases
> bacause there is always a (PageHuge() || HWPoisonHandlable()) check before calling
> identify_page_state.

You're absolutely right. I'd like to address this observation as well in the
updated patch.

Thanks,
--breno

^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox