Linux Documentation

Linux Documentation
 help / color / mirror / Atom feed

* [PATCH v6 12/20] nfsd: add helper to marshal a fattr4 from completed args
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

Break the loop that encodes the actual attr_vals field into a separate
function.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4xdr.c | 44 +++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c6f92ddeb449..7d162e5fb6ec 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3895,6 +3895,22 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
 #endif
 };
 
+static __be32
+nfsd4_encode_attr_vals(struct xdr_stream *xdr, u32 *attrmask, struct nfsd4_fattr_args *args)
+{
+	DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
+	unsigned long bit;
+	__be32 status;
+
+	bitmap_from_arr32(attr_bitmap, attrmask, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
+	for_each_set_bit(bit, attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) {
+		status = nfsd4_enc_fattr4_encode_ops[bit](xdr, args);
+		if (status != nfs_ok)
+			return status;
+	}
+	return nfs_ok;
+}
+
 /*
  * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
  * ourselves. @case_cache is NULL for callers that encode a single dentry
@@ -3908,7 +3924,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 		    int ignore_crossmnt,
 		    struct nfsd_case_attrs_cache *case_cache)
 {
-	DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
 	struct nfs4_delegation *dp = NULL;
 	struct nfsd4_fattr_args args;
 	struct svc_fh *tempfh = NULL;
@@ -3923,7 +3938,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 		.mnt	= exp->ex_path.mnt,
 		.dentry	= dentry,
 	};
-	unsigned long bit;
 
 	WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1);
 	WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval));
@@ -4137,27 +4151,22 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 #endif /* CONFIG_NFSD_V4_POSIX_ACLS */
 
 	/* attrmask */
-	status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1],
-				      attrmask[2]);
+	status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1], attrmask[2]);
 	if (status)
 		goto out;
 
 	/* attr_vals */
 	attrlen_offset = xdr->buf->len;
-	if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT)))
-		goto out_resource;
-	bitmap_from_arr32(attr_bitmap, attrmask,
-			  ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
-	for_each_set_bit(bit, attr_bitmap,
-			 ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) {
-		status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args);
-		if (status != nfs_ok)
-			goto out;
+	if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT))) {
+		status = nfserr_resource;
+		goto out;
 	}
-	attrlen = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);
-	write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, XDR_UNIT);
-	status = nfs_ok;
 
+	status = nfsd4_encode_attr_vals(xdr, attrmask, &args);
+	if (status == nfs_ok) {
+		attrlen = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);
+		write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, XDR_UNIT);
+	}
 out:
 #ifdef CONFIG_NFSD_V4_POSIX_ACLS
 	if (args.dpacl)
@@ -4180,9 +4189,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 out_nfserr:
 	status = nfserrno(err);
 	goto out;
-out_resource:
-	status = nfserr_resource;
-	goto out;
 }
 
 static bool

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 13/20] nfsd: allow nfsd4_encode_fattr4_change() to work with no export
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

In the context of a CB_NOTIFY callback, we may not have easy access to
a svc_export. nfsd will not currently grant a delegation on a the V4 root
however, so this should be safe.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 7d162e5fb6ec..18adab1d7ca2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3273,7 +3273,7 @@ static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr,
 {
 	const struct svc_export *exp = args->exp;
 
-	if (unlikely(exp->ex_flags & NFSEXP_V4ROOT)) {
+	if (exp && unlikely(exp->ex_flags & NFSEXP_V4ROOT)) {
 		u32 flush_time = convert_to_wallclock(exp->cd->flush_time);
 
 		if (xdr_stream_encode_u32(xdr, flush_time) != XDR_UNIT)

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 14/20] nfsd: send basic file attributes in CB_NOTIFY
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

In addition to the filename, send attributes about the inode in a
CB_NOTIFY event. This patch just adds a the basic inode information that
can be acquired via GETATTR.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4xdr.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 18adab1d7ca2..4fb61d05a4a7 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4191,12 +4191,21 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	goto out;
 }
 
+#define CB_NOTIFY_STATX_REQUEST_MASK (STATX_BASIC_STATS   | \
+				      STATX_BTIME	  | \
+				      STATX_CHANGE_COOKIE)
+
 static bool
 nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 			  struct dentry *dentry, struct nfs4_delegation *dp,
 			  struct nfsd_file *nf, char *name, u32 namelen)
 {
+	struct path path =  { .mnt = nf->nf_file->f_path.mnt,
+			      .dentry = dentry };
+	struct nfsd4_fattr_args args = { };
 	uint32_t *attrmask;
+	__be32 status;
+	int ret;
 
 	/* Reserve space for attrmask */
 	attrmask = xdr_reserve_space(xdr, 3 * sizeof(uint32_t));
@@ -4207,6 +4216,41 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 	ne->ne_file.len = namelen;
 	ne->ne_attrs.attrmask.element = attrmask;
 
+	/* FIXME: d_find_alias for inode ? */
+	if (!path.dentry || !d_inode(path.dentry))
+		goto noattrs;
+
+	/*
+	 * It is possible that the client was granted a delegation when a file
+	 * was created. Note that we don't issue a CB_GETATTR here since stale
+	 * attributes are presumably ok.
+	 */
+	ret = vfs_getattr(&path, &args.stat, CB_NOTIFY_STATX_REQUEST_MASK, AT_STATX_SYNC_AS_STAT);
+	if (ret)
+		goto noattrs;
+
+	args.change_attr = nfsd4_change_attribute(&args.stat);
+
+	attrmask[0] = FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE |
+		      FATTR4_WORD0_SIZE | FATTR4_WORD0_FILEID;
+	attrmask[1] = FATTR4_WORD1_MODE | FATTR4_WORD1_NUMLINKS | FATTR4_WORD1_RAWDEV |
+		      FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS |
+		      FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY;
+	attrmask[2] = 0;
+
+	if (args.stat.result_mask & STATX_BTIME)
+		attrmask[1] |= FATTR4_WORD1_TIME_CREATE;
+
+	ne->ne_attrs.attrmask.count = 2;
+	ne->ne_attrs.attr_vals.data = (u8 *)xdr->p;
+
+	status = nfsd4_encode_attr_vals(xdr, attrmask, &args);
+	if (status != nfs_ok)
+		goto noattrs;
+
+	ne->ne_attrs.attr_vals.len = (u8 *)xdr->p - ne->ne_attrs.attr_vals.data;
+	return true;
+noattrs:
 	attrmask[0] = 0;
 	attrmask[1] = 0;
 	attrmask[2] = 0;

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 15/20] nfsd: allow encoding a filehandle into fattr4 without a svc_fh
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

The current fattr4 encoder requires a svc_fh in order to encode the
filehandle. This is not available in a CB_NOTIFY callback. Add a a new
"fhandle" field to struct nfsd4_fattr_args and copy the filehandle into
there from the svc_fh. CB_NOTIFY will populate it via other means.

A filehandle composed this way may still need a MAC appended on signed
exports, so generalize fh_append_mac() to operate on a bare knfsd_fh
(plus its maximum size and net) rather than a svc_fh.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4xdr.c | 36 +++++++++++++++++++++---------------
 fs/nfsd/nfsfh.c   | 10 +++++-----
 fs/nfsd/nfsfh.h   |  1 +
 3 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 4fb61d05a4a7..7b19248b1503 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2715,7 +2715,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 }
 
 static __be32 nfsd4_encode_nfs_fh4(struct xdr_stream *xdr,
-				   struct knfsd_fh *fh_handle)
+				   const struct knfsd_fh *fh_handle)
 {
 	return nfsd4_encode_opaque(xdr, fh_handle->fh_raw, fh_handle->fh_size);
 }
@@ -3158,6 +3158,7 @@ struct nfsd4_fattr_args {
 	struct svc_fh		*fhp;
 	struct svc_export	*exp;
 	struct dentry		*dentry;
+	struct knfsd_fh		fhandle;
 	struct kstat		stat;
 	struct kstatfs		statfs;
 	struct nfs4_acl		*acl;
@@ -3402,7 +3403,7 @@ static __be32 nfsd4_encode_fattr4_homogeneous(struct xdr_stream *xdr,
 static __be32 nfsd4_encode_fattr4_filehandle(struct xdr_stream *xdr,
 					     const struct nfsd4_fattr_args *args)
 {
-	return nfsd4_encode_nfs_fh4(xdr, &args->fhp->fh_handle);
+	return nfsd4_encode_nfs_fh4(xdr, &args->fhandle);
 }
 
 static __be32 nfsd4_encode_fattr4_fileid(struct xdr_stream *xdr,
@@ -4015,19 +4016,24 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 		if (err)
 			goto out_nfserr;
 	}
-	if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
-	    !fhp) {
-		tempfh = kmalloc_obj(struct svc_fh);
-		status = nfserr_jukebox;
-		if (!tempfh)
-			goto out;
-		fh_init(tempfh, NFS4_FHSIZE);
-		status = fh_compose(tempfh, exp, dentry, NULL);
-		if (status)
-			goto out;
-		args.fhp = tempfh;
-	} else
-		args.fhp = fhp;
+
+	args.fhp = fhp;
+	if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID))) {
+		if (!args.fhp) {
+			tempfh = kmalloc_obj(struct svc_fh);
+			status = nfserr_jukebox;
+			if (!tempfh)
+				goto out;
+			fh_init(tempfh, NFS4_FHSIZE);
+			status = fh_compose(tempfh, exp, dentry, NULL);
+			if (status)
+				goto out;
+			args.fhp = tempfh;
+		}
+		if (args.fhp)
+			fh_copy_shallow(&args.fhandle, &args.fhp->fh_handle);
+	}
+
 	if (attrmask[0] & (FATTR4_WORD0_CASE_INSENSITIVE |
 			   FATTR4_WORD0_CASE_PRESERVING)) {
 		/*
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index b36915401758..3b29cd70d4a1 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -144,16 +144,15 @@ static inline __be32 check_pseudo_root(struct dentry *dentry,
 /* Size of a file handle MAC, in 4-octet words */
 #define FH_MAC_WORDS (sizeof(__le64) / 4)
 
-static bool fh_append_mac(struct svc_fh *fhp, struct net *net)
+bool fh_append_mac(struct knfsd_fh *fh, int fh_maxsize, struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-	struct knfsd_fh *fh = &fhp->fh_handle;
 	siphash_key_t *fh_key = nn->fh_key;
 	__le64 hash;
 
 	if (!fh_key)
 		goto out_no_key;
-	if (fh->fh_size + sizeof(hash) > fhp->fh_maxsize)
+	if (fh->fh_size + sizeof(hash) > fh_maxsize)
 		goto out_no_space;
 
 	hash = cpu_to_le64(siphash(&fh->fh_raw, fh->fh_size, fh_key));
@@ -167,7 +166,7 @@ static bool fh_append_mac(struct svc_fh *fhp, struct net *net)
 
 out_no_space:
 	pr_warn_ratelimited("NFSD: unable to sign filehandles, fh_size %zu would be greater than fh_maxsize %d.\n",
-			    fh->fh_size + sizeof(hash), fhp->fh_maxsize);
+			    fh->fh_size + sizeof(hash), fh_maxsize);
 	return false;
 }
 
@@ -566,7 +565,8 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
 		fhp->fh_handle.fh_size += maxsize * 4;
 
 		if (exp->ex_flags & NFSEXP_SIGN_FH)
-			if (!fh_append_mac(fhp, exp->cd->net))
+			if (!fh_append_mac(&fhp->fh_handle, fhp->fh_maxsize,
+					   exp->cd->net))
 				fhp->fh_handle.fh_fileid_type = FILEID_INVALID;
 	} else {
 		fhp->fh_handle.fh_fileid_type = FILEID_ROOT;
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 5ef7191f8ad8..5dc10b442d6c 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -226,6 +226,7 @@ __be32	fh_getattr(const struct svc_fh *fhp, struct kstat *stat);
 __be32	fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *);
 __be32	fh_update(struct svc_fh *);
 void	fh_put(struct svc_fh *);
+bool	fh_append_mac(struct knfsd_fh *fh, int fh_maxsize, struct net *net);
 
 static __inline__ struct svc_fh *
 fh_copy(struct svc_fh *dst, const struct svc_fh *src)

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 16/20] nfsd: add a fi_connectable flag to struct nfs4_file
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

When encoding a filehandle for a CB_NOTIFY, there is no svc_export
available, but the server needs to know whether to encode a connectable
filehandle. Add a flag to the nfs4_file that tells whether the
svc_export under which a directory delegation was acquired has subtree
checking enabled, in which case it needs connectable filehandles.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 1 +
 fs/nfsd/state.h     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 513cbc1a583f..aa99783ce901 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5231,6 +5231,7 @@ static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp)
 	memset(fp->fi_access, 0, sizeof(fp->fi_access));
 	fp->fi_aliased = false;
 	fp->fi_inode = d_inode(fh->fh_dentry);
+	fp->fi_connectable = !(fh->fh_export->ex_flags & NFSEXP_NOSUBTREECHECK);
 #ifdef CONFIG_NFSD_PNFS
 	INIT_LIST_HEAD(&fp->fi_lo_states);
 	atomic_set(&fp->fi_lo_recalls, 0);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index f8457e0f2b57..d912e3d04dd7 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -761,6 +761,7 @@ struct nfs4_file {
 	int			fi_delegees;
 	struct knfsd_fh		fi_fhandle;
 	bool			fi_had_conflict;
+	bool			fi_connectable;
 #ifdef CONFIG_NFSD_PNFS
 	struct list_head	fi_lo_states;
 	atomic_t		fi_lo_recalls;

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 17/20] nfsd: add the filehandle to returned attributes in CB_NOTIFY
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

nfsd's usual fh_compose routine requires a svc_export and fills out a
svc_fh. In the context of a CB_NOTIFY there is no such export to
consult.

Add a new routine that composes a filehandle with only a parent
filehandle and nfs4_file. Use that to fill out the fhandle field in the
nfsd4_fattr_args.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4xdr.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 7b19248b1503..15ccd54ffdb6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4197,6 +4197,39 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	goto out;
 }
 
+static bool
+setup_notify_fhandle(struct dentry *dentry, struct nfs4_file *fi,
+		     struct nfsd_file *nf, struct nfsd4_fattr_args *args)
+{
+	int fileid_type, fsid_len, maxsize, flags = 0;
+	struct knfsd_fh *fhp = &args->fhandle;
+	struct inode *inode = d_inode(dentry);
+	struct inode *parent = NULL;
+	struct fid *fid;
+
+	fsid_len = key_len(fi->fi_fhandle.fh_fsid_type);
+	fhp->fh_size = 4 + fsid_len;
+
+	/* Copy first 4 bytes + fsid */
+	memcpy(&fhp->fh_raw, &fi->fi_fhandle.fh_raw, fhp->fh_size);
+
+	fid = (struct fid *)(fh_fsid(fhp) + fsid_len/4);
+	maxsize = (NFS4_FHSIZE - fhp->fh_size)/4;
+
+	if (fi->fi_connectable && !S_ISDIR(inode->i_mode)) {
+		parent = d_inode(nf->nf_file->f_path.dentry);
+		flags = EXPORT_FH_CONNECTABLE;
+	}
+
+	fileid_type = exportfs_encode_inode_fh(inode, fid, &maxsize, parent, flags);
+	if (fileid_type < 0 || fileid_type == FILEID_INVALID)
+		return false;
+
+	fhp->fh_fileid_type = fileid_type;
+	fhp->fh_size += maxsize * 4;
+	return true;
+}
+
 #define CB_NOTIFY_STATX_REQUEST_MASK (STATX_BASIC_STATS   | \
 				      STATX_BTIME	  | \
 				      STATX_CHANGE_COOKIE)
@@ -4206,6 +4239,7 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 			  struct dentry *dentry, struct nfs4_delegation *dp,
 			  struct nfsd_file *nf, char *name, u32 namelen)
 {
+	struct nfs4_file *fi = dp->dl_stid.sc_file;
 	struct path path =  { .mnt = nf->nf_file->f_path.mnt,
 			      .dentry = dentry };
 	struct nfsd4_fattr_args args = { };
@@ -4244,6 +4278,9 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 		      FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY;
 	attrmask[2] = 0;
 
+	if (setup_notify_fhandle(dentry, fi, nf, &args))
+		attrmask[0] |= FATTR4_WORD0_FILEHANDLE;
+
 	if (args.stat.result_mask & STATX_BTIME)
 		attrmask[1] |= FATTR4_WORD1_TIME_CREATE;
 

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 18/20] nfsd: properly track requested child attributes
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

Track the union of requested and supported child attributes in the
delegation, and only encode the attributes in that union when sending
add/remove/rename updates.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4proc.c  |  2 ++
 fs/nfsd/nfs4state.c | 18 ++++++++++++++++++
 fs/nfsd/nfs4xdr.c   | 15 ++++++---------
 fs/nfsd/state.h     |  3 +++
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 29f7339dc220..caec82e77081 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2577,6 +2577,8 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
 
 	gdd->gddrnf_status = GDD4_OK;
 	memcpy(&gdd->gddr_stateid, &dd->dl_stid.sc_stateid, sizeof(gdd->gddr_stateid));
+	gdd->gddr_child_attributes[0] = dd->dl_child_attrs[0];
+	gdd->gddr_child_attributes[1] = dd->dl_child_attrs[1];
 	nfs4_put_stid(&dd->dl_stid);
 	return nfs_ok;
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index aa99783ce901..0e6e008c121e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -9930,6 +9930,21 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry,
 	return status;
 }
 
+#define GDD_WORD0_CHILD_ATTRS	(FATTR4_WORD0_TYPE |		\
+				 FATTR4_WORD0_CHANGE |		\
+				 FATTR4_WORD0_SIZE |		\
+				 FATTR4_WORD0_FILEID |		\
+				 FATTR4_WORD0_FILEHANDLE)
+
+#define GDD_WORD1_CHILD_ATTRS	(FATTR4_WORD1_MODE |		\
+				 FATTR4_WORD1_NUMLINKS |	\
+				 FATTR4_WORD1_RAWDEV |		\
+				 FATTR4_WORD1_SPACE_USED |	\
+				 FATTR4_WORD1_TIME_ACCESS |	\
+				 FATTR4_WORD1_TIME_METADATA |	\
+				 FATTR4_WORD1_TIME_MODIFY |	\
+				 FATTR4_WORD1_TIME_CREATE)
+
 /**
  * nfsd_get_dir_deleg - attempt to get a directory delegation
  * @cstate: compound state
@@ -9998,6 +10013,9 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
 		dp->dl_stid.sc_export =
 			exp_get(cstate->current_fh.fh_export);
 
+	dp->dl_child_attrs[0] = gdd->gdda_child_attributes[0] & GDD_WORD0_CHILD_ATTRS;
+	dp->dl_child_attrs[1] = gdd->gdda_child_attributes[1] & GDD_WORD1_CHILD_ATTRS;
+
 	/*
 	 * NB: gddr_notification[0] represents the notifications that
 	 * will be granted to the client
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15ccd54ffdb6..1e3c360c06cd 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4271,18 +4271,15 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 
 	args.change_attr = nfsd4_change_attribute(&args.stat);
 
-	attrmask[0] = FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE |
-		      FATTR4_WORD0_SIZE | FATTR4_WORD0_FILEID;
-	attrmask[1] = FATTR4_WORD1_MODE | FATTR4_WORD1_NUMLINKS | FATTR4_WORD1_RAWDEV |
-		      FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS |
-		      FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY;
+	attrmask[0] = dp->dl_child_attrs[0];
+	attrmask[1] = dp->dl_child_attrs[1];
 	attrmask[2] = 0;
 
-	if (setup_notify_fhandle(dentry, fi, nf, &args))
-		attrmask[0] |= FATTR4_WORD0_FILEHANDLE;
+	if (!setup_notify_fhandle(dentry, fi, nf, &args))
+		attrmask[0] &= ~FATTR4_WORD0_FILEHANDLE;
 
-	if (args.stat.result_mask & STATX_BTIME)
-		attrmask[1] |= FATTR4_WORD1_TIME_CREATE;
+	if (!(args.stat.result_mask & STATX_BTIME))
+		attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
 
 	ne->ne_attrs.attrmask.count = 2;
 	ne->ne_attrs.attr_vals.data = (u8 *)xdr->p;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index d912e3d04dd7..0763893bfd48 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -297,6 +297,9 @@ struct nfs4_delegation {
 	struct timespec64	dl_atime;
 	struct timespec64	dl_mtime;
 	struct timespec64	dl_ctime;
+
+	/* For dir delegations */
+	uint32_t		dl_child_attrs[2];
 };
 
 static inline bool deleg_is_read(u32 dl_type)

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 19/20] nfsd: track requested dir attributes
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

Track the union of requested and supported dir attributes in the
delegation. In a later patch this will be used to ensure that we
only encode the attributes in that union when sending
add/remove/rename updates.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4proc.c  |  9 ++++++---
 fs/nfsd/nfs4state.c | 20 ++++++++++++++++----
 fs/nfsd/state.h     |  2 ++
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index caec82e77081..9e86f5907f06 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2530,9 +2530,10 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return status == nfserr_same ? nfs_ok : status;
 }
 
-#define SUPPORTED_NOTIFY_MASK	(BIT(NOTIFY4_REMOVE_ENTRY) |	\
-				 BIT(NOTIFY4_ADD_ENTRY) |	\
-				 BIT(NOTIFY4_RENAME_ENTRY) |	\
+#define SUPPORTED_NOTIFY_MASK	(BIT(NOTIFY4_CHANGE_DIR_ATTRS) |	\
+				 BIT(NOTIFY4_REMOVE_ENTRY) |		\
+				 BIT(NOTIFY4_ADD_ENTRY) |		\
+				 BIT(NOTIFY4_RENAME_ENTRY) |		\
 				 BIT(NOTIFY4_GFLAG_EXTEND))
 
 static __be32
@@ -2579,6 +2580,8 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
 	memcpy(&gdd->gddr_stateid, &dd->dl_stid.sc_stateid, sizeof(gdd->gddr_stateid));
 	gdd->gddr_child_attributes[0] = dd->dl_child_attrs[0];
 	gdd->gddr_child_attributes[1] = dd->dl_child_attrs[1];
+	gdd->gddr_dir_attributes[0] = dd->dl_dir_attrs[0];
+	gdd->gddr_dir_attributes[1] = dd->dl_dir_attrs[1];
 	nfs4_put_stid(&dd->dl_stid);
 	return nfs_ok;
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0e6e008c121e..12627afb604f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -9945,6 +9945,15 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry,
 				 FATTR4_WORD1_TIME_MODIFY |	\
 				 FATTR4_WORD1_TIME_CREATE)
 
+#define GDD_WORD0_DIR_ATTRS	(FATTR4_WORD0_CHANGE |		\
+				 FATTR4_WORD0_SIZE)
+
+#define GDD_WORD1_DIR_ATTRS	(FATTR4_WORD1_NUMLINKS |	\
+				 FATTR4_WORD1_SPACE_USED |	\
+				 FATTR4_WORD1_TIME_ACCESS |	\
+				 FATTR4_WORD1_TIME_METADATA |	\
+				 FATTR4_WORD1_TIME_MODIFY)
+
 /**
  * nfsd_get_dir_deleg - attempt to get a directory delegation
  * @cstate: compound state
@@ -10013,14 +10022,17 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
 		dp->dl_stid.sc_export =
 			exp_get(cstate->current_fh.fh_export);
 
-	dp->dl_child_attrs[0] = gdd->gdda_child_attributes[0] & GDD_WORD0_CHILD_ATTRS;
-	dp->dl_child_attrs[1] = gdd->gdda_child_attributes[1] & GDD_WORD1_CHILD_ATTRS;
-
 	/*
 	 * NB: gddr_notification[0] represents the notifications that
 	 * will be granted to the client
 	 */
-	fl = nfs4_alloc_init_lease(dp, gdd->gddr_notification[0]);
+	dp->dl_notify_mask = gdd->gddr_notification[0];
+	dp->dl_child_attrs[0] = gdd->gdda_child_attributes[0] & GDD_WORD0_CHILD_ATTRS;
+	dp->dl_child_attrs[1] = gdd->gdda_child_attributes[1] & GDD_WORD1_CHILD_ATTRS;
+	dp->dl_dir_attrs[0] = gdd->gdda_dir_attributes[0] & GDD_WORD0_DIR_ATTRS;
+	dp->dl_dir_attrs[1] = gdd->gdda_dir_attributes[1] & GDD_WORD1_DIR_ATTRS;
+
+	fl = nfs4_alloc_init_lease(dp, dp->dl_notify_mask);
 	if (!fl)
 		goto out_put_stid;
 
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 0763893bfd48..17be4011740d 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -299,7 +299,9 @@ struct nfs4_delegation {
 	struct timespec64	dl_ctime;
 
 	/* For dir delegations */
+	uint32_t		dl_notify_mask;
 	uint32_t		dl_child_attrs[2];
+	uint32_t		dl_dir_attrs[2];
 };
 
 static inline bool deleg_is_read(u32 dl_type)

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 20/20] nfsd: add support to CB_NOTIFY for dir attribute changes
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

If the client requested dir attribute change notifications, send those
alongside any set of add/remove/rename events. Note that the server will
still recall the delegation on a SETATTR, so these are only sent for
changes to child dirents.

The child filehandle returned in these notifications is composed by
setup_notify_fhandle() without going through fh_compose(), so it does
not get a MAC appended. On exports configured with NFSEXP_SIGN_FH the
client would then get back an unsigned filehandle that fh_verify()
rejects as stale. Pass the delegation's export down to
setup_notify_fhandle() and append the MAC with fh_append_mac() when the
export requires signed filehandles; if signing fails, drop the
filehandle attribute rather than handing out an unusable one.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 25 ++++++++++++++++--
 fs/nfsd/nfs4xdr.c   | 73 +++++++++++++++++++++++++++++++++++++++++++++--------
 fs/nfsd/xdr4.h      |  2 ++
 3 files changed, 88 insertions(+), 12 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 12627afb604f..e394278fb92e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3503,10 +3503,15 @@ nfsd4_cb_notify_prepare(struct nfsd4_callback *cb)
 	struct nfsd_notify_event *events[NOTIFY4_EVENT_QUEUE_SIZE];
 	struct xdr_buf xdr = { .buflen = PAGE_SIZE * NOTIFY4_PAGE_ARRAY_SIZE,
 			       .pages  = ncn->ncn_pages };
+	int limit = NOTIFY4_EVENT_QUEUE_SIZE;
 	struct xdr_stream stream;
 	struct nfsd_file *nf;
-	int count, i;
 	bool error = false;
+	int count, i;
+
+	/* Save a slot for dir attr update if requested */
+	if (dp->dl_notify_mask & BIT(NOTIFY4_CHANGE_DIR_ATTRS))
+		--limit;
 
 	xdr_init_encode_pages(&stream, &xdr);
 
@@ -3520,7 +3525,7 @@ nfsd4_cb_notify_prepare(struct nfsd4_callback *cb)
 	}
 
 	/* we can't keep up! */
-	if (count > NOTIFY4_EVENT_QUEUE_SIZE) {
+	if (count > limit) {
 		spin_unlock(&ncn->ncn_lock);
 		goto out_recall;
 	}
@@ -3567,6 +3572,22 @@ nfsd4_cb_notify_prepare(struct nfsd4_callback *cb)
 		nfsd_notify_event_put(nne);
 	}
 	if (!error) {
+		if (dp->dl_notify_mask & BIT(NOTIFY4_CHANGE_DIR_ATTRS)) {
+			u32 *maskp = (u32 *)xdr_reserve_space(&stream, sizeof(*maskp));
+
+			if (maskp) {
+				u8 *p = nfsd4_encode_dir_attr_change(&stream, dp, nf);
+
+				if (p) {
+					*maskp = BIT(NOTIFY4_CHANGE_DIR_ATTRS);
+					ncn->ncn_nf[count].notify_mask.count = 1;
+					ncn->ncn_nf[count].notify_mask.element = maskp;
+					ncn->ncn_nf[count].notify_vals.data = p;
+					ncn->ncn_nf[count].notify_vals.len = (u8 *)stream.p - p;
+					++count;
+				}
+			}
+		}
 		ncn->ncn_nf_cnt = count;
 		nfsd_file_put(nf);
 		return true;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1e3c360c06cd..7dd8476028d6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4199,7 +4199,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 
 static bool
 setup_notify_fhandle(struct dentry *dentry, struct nfs4_file *fi,
-		     struct nfsd_file *nf, struct nfsd4_fattr_args *args)
+		     struct nfsd_file *nf, struct svc_export *exp,
+		     struct nfsd4_fattr_args *args)
 {
 	int fileid_type, fsid_len, maxsize, flags = 0;
 	struct knfsd_fh *fhp = &args->fhandle;
@@ -4227,6 +4228,17 @@ setup_notify_fhandle(struct dentry *dentry, struct nfs4_file *fi,
 
 	fhp->fh_fileid_type = fileid_type;
 	fhp->fh_size += maxsize * 4;
+
+	/*
+	 * fh_compose() appends a MAC to filehandles on signed exports; this
+	 * hand-rolled filehandle must do the same or the client will get back
+	 * an unsigned filehandle that fh_verify() later rejects as stale.
+	 * If we can't sign it, don't hand it out at all.
+	 */
+	if (exp && (exp->ex_flags & NFSEXP_SIGN_FH))
+		if (!fh_append_mac(fhp, NFS4_FHSIZE, exp->cd->net))
+			return false;
+
 	return true;
 }
 
@@ -4240,11 +4252,11 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 			  struct nfsd_file *nf, char *name, u32 namelen)
 {
 	struct nfs4_file *fi = dp->dl_stid.sc_file;
-	struct path path =  { .mnt = nf->nf_file->f_path.mnt,
-			      .dentry = dentry };
+	struct path path = nf->nf_file->f_path;
 	struct nfsd4_fattr_args args = { };
 	uint32_t *attrmask;
 	__be32 status;
+	bool parent;
 	int ret;
 
 	/* Reserve space for attrmask */
@@ -4256,6 +4268,9 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 	ne->ne_file.len = namelen;
 	ne->ne_attrs.attrmask.element = attrmask;
 
+	parent = (dentry == path.dentry);
+	path.dentry = dentry;
+
 	/* FIXME: d_find_alias for inode ? */
 	if (!path.dentry || !d_inode(path.dentry))
 		goto noattrs;
@@ -4271,15 +4286,21 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 
 	args.change_attr = nfsd4_change_attribute(&args.stat);
 
-	attrmask[0] = dp->dl_child_attrs[0];
-	attrmask[1] = dp->dl_child_attrs[1];
-	attrmask[2] = 0;
+	if (parent) {
+		attrmask[0] = dp->dl_dir_attrs[0];
+		attrmask[1] = dp->dl_dir_attrs[1];
+	} else {
+		attrmask[0] = dp->dl_child_attrs[0];
+		attrmask[1] = dp->dl_child_attrs[1];
 
-	if (!setup_notify_fhandle(dentry, fi, nf, &args))
-		attrmask[0] &= ~FATTR4_WORD0_FILEHANDLE;
+		if (!setup_notify_fhandle(dentry, fi, nf,
+					  dp->dl_stid.sc_export, &args))
+			attrmask[0] &= ~FATTR4_WORD0_FILEHANDLE;
 
-	if (!(args.stat.result_mask & STATX_BTIME))
-		attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
+		if (!(args.stat.result_mask & STATX_BTIME))
+			attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
+	}
+	attrmask[2] = 0;
 
 	ne->ne_attrs.attrmask.count = 2;
 	ne->ne_attrs.attr_vals.data = (u8 *)xdr->p;
@@ -4392,6 +4413,38 @@ u8 *nfsd4_encode_notify_event(struct xdr_stream *xdr, struct nfsd_notify_event *
 	return NULL;
 }
 
+/**
+ * nfsd4_encode_dir_attr_change
+ * @xdr: stream to which to encode the fattr4
+ * @dp: delegation where the event occurred
+ * @nf: nfsd_file opened on the directory
+ *
+ * Encode a dir attr change event.
+ */
+u8 *nfsd4_encode_dir_attr_change(struct xdr_stream *xdr, struct nfs4_delegation *dp,
+				 struct nfsd_file *nf)
+{
+	struct dentry *dentry = nf->nf_file->f_path.dentry;
+	struct notify_attr4 na = { };
+	bool ret;
+	u8 *p = NULL;
+
+	if (!(dp->dl_notify_mask & BIT(NOTIFY4_CHANGE_DIR_ATTRS)))
+		return NULL;
+
+	/* RFC 8881 s10.4.3: ne_file must be a zero-length string for dir attrs */
+	ret = nfsd4_setup_notify_entry4(&na.na_changed_entry, xdr,
+					dentry, dp, nf, "", 0);
+
+	/* Don't bother with the event if we're not encoding attrs */
+	if (ret && na.na_changed_entry.ne_attrs.attr_vals.len) {
+		p = (u8 *)xdr->p;
+		if (!xdrgen_encode_notify_attr4(xdr, &na))
+			p = NULL;
+	}
+	return p;
+}
+
 static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr,
 				struct xdr_buf *buf, __be32 *p, int bytes)
 {
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 62ac790428be..805c7122eb93 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -973,6 +973,8 @@ __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
 u8 *nfsd4_encode_notify_event(struct xdr_stream *xdr, struct nfsd_notify_event *nne,
 			      struct nfs4_delegation *dd, struct nfsd_file *nf,
 			      u32 *notify_mask);
+u8 *nfsd4_encode_dir_attr_change(struct xdr_stream *xdr, struct nfs4_delegation *dp,
+				 struct nfsd_file *nf);
 extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, union nfsd4_op_u *u);
 extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,

-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH net-next 2/3] docs: net: tls-offload: document tls_dev_del, tls_dev_resync, and rekey
From: Sabrina Dubroca @ 2026-06-11 17:55 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: davem, netdev, edumazet, pabeni, andrew+netdev, horms, corbet,
	linux-doc, bpf, john.fastabend, skhan
In-Reply-To: <20260611101817.18964bd9@kernel.org>

2026-06-11, 10:18:17 -0700, Jakub Kicinski wrote:
> On Wed, 10 Jun 2026 23:06:44 +0200 Sabrina Dubroca wrote:
> > > +The third TLS device callback is :c:member:`tls_dev_resync`, called by the core
> > > +to synchronize the TCP stream with the record boundaries:
> > > +
> > > +.. code-block:: c
> > > +
> > > +	int (*tls_dev_resync)(struct net_device *netdev,
> > > +			      struct sock *sk, u32 seq, u8 *rcd_sn,
> > > +			      enum tls_offload_ctx_dir direction);
> > > +
> > > +See the `Resync handling`_ section for details.  
> > 
> > Hmm, this callback is not mentioned at all in the "Resync handling"
> > section. I think it'd be good to add at least a quick note there about
> > how/when it's invoked, and what the arguments mean (at least the two
> > types of sequence numbers, since the rest is identical to the other
> > driver CBs).
> 
> Something like this, you mean?

Yeah, exactly.

-- 
Sabrina

^ permalink raw reply

* Re: [PATCH net-next 2/3] docs: net: tls-offload: document tls_dev_del, tls_dev_resync, and rekey
From: Randy Dunlap @ 2026-06-11 18:13 UTC (permalink / raw)
  To: Jakub Kicinski, Sabrina Dubroca
  Cc: davem, netdev, edumazet, pabeni, andrew+netdev, horms, corbet,
	linux-doc, bpf, john.fastabend, skhan
In-Reply-To: <20260611101817.18964bd9@kernel.org>



On 6/11/26 10:18 AM, Jakub Kicinski wrote:
> On Wed, 10 Jun 2026 23:06:44 +0200 Sabrina Dubroca wrote:
>>> +The third TLS device callback is :c:member:`tls_dev_resync`, called by the core
>>> +to synchronize the TCP stream with the record boundaries:
>>> +
>>> +.. code-block:: c
>>> +
>>> +	int (*tls_dev_resync)(struct net_device *netdev,
>>> +			      struct sock *sk, u32 seq, u8 *rcd_sn,
>>> +			      enum tls_offload_ctx_dir direction);
>>> +
>>> +See the `Resync handling`_ section for details.  
>>
>> Hmm, this callback is not mentioned at all in the "Resync handling"
>> section. I think it'd be good to add at least a quick note there about
>> how/when it's invoked, and what the arguments mean (at least the two
>> types of sequence numbers, since the rest is identical to the other
>> driver CBs).
> 
> Something like this, you mean?
> 
> --- a/Documentation/networking/tls-offload.rst
> +++ b/Documentation/networking/tls-offload.rst
> @@ -278,9 +278,9 @@ sequence number (as it will be updated from a different context).
>    bool tls_offload_tx_resync_pending(struct sock *sk)
>  
>  Next time ``ktls`` pushes a record it will first send its TCP sequence number
> -and TLS record number to the driver. Stack will also make sure that
> -the new record will start on a segment boundary (like it does when
> -the connection is initially added).
> +and TLS record number to the driver via the ``tls_dev_resync`` callback.
> +Stack will also make sure that the new record will start on a segment boundary

Preferably "The stack ..."

> +(like it does when the connection is initially added).
>  
>  RX
>  --


-- 
~Randy


^ permalink raw reply

* Re: [PATCH v6 01/11] x86/virt/tdx: Simplify tdmr_get_pamt_sz()
From: Vishal Annapurve @ 2026-06-11 18:25 UTC (permalink / raw)
  To: Rick Edgecombe
  Cc: bp, dave.hansen, hpa, kas, kvm, linux-coco, linux-doc,
	linux-kernel, mingo, nik.borisov, pbonzini, seanjc, tglx, x86,
	chao.gao, yan.y.zhao, kai.huang, Binbin Wu
In-Reply-To: <20260526023515.288829-2-rick.p.edgecombe@intel.com>

On Mon, May 25, 2026 at 7:35 PM Rick Edgecombe
<rick.p.edgecombe@intel.com> wrote:
>
> For each memory region that the TDX module might use (called TDMR), three
> separate traditional PAMT allocations are needed. One for each supported
> page size (1GB, 2MB, 4KB). These store information on each page in the
> TDMR. In Linux, they are allocated out of one physically contiguous block,
> in order to more efficiently use some internal TDX module book keeping
> resources. So some simple math is needed to break the single large
> allocation into three smaller allocations for each page size.
>
> There are some commonalities in the math needed to calculate the base and
> size for each smaller allocation, and so an effort was made to share logic
> across the three. Unfortunately doing this turned out unnaturally tortured,
> with a loop iterating over the three page sizes, only to call into a
> function with cases statement for each page size. In the future Dynamic
> PAMT will add more logic that is special to the 4KB page size, making the
> benefit of the math sharing even more questionable.
>
> Three is not a very high number, so get rid of the loop and just duplicate
> the small calculation three times. In doing so, setup for future Dynamic
> PAMT changes.
>
> Since the loop that iterates over it is gone, further simplify the code by
> dropping the array of intermediate size and base storage. Just store the
> values to their final locations. Accept the small complication of having
> to clear tdmr->pamt_4k_base in the error path, so that tdmr_do_pamt_func()
> will not try to operate on the TDMR struct when attempting to free it.
>
> Assisted-by: GitHub Copilot:claude-opus-4-6 Claude:claude-opus-4-7
> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>

Reviewed-by: Vishal Annapurve <vannapurve@google.com>

^ permalink raw reply

* [PATCH] Documentation: arch: fix brackets
From: Manuel Ebner @ 2026-06-11 18:35 UTC (permalink / raw)
  To: Vineet Gupta, Jonathan Corbet, Shuah Khan, Krzysztof Kozlowski,
	Peter Griffin, Alim Akhtar, Catalin Marinas, Will Deacon,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy, open list:SYNOPSYS ARC ARCHITECTURE,
	open list:DOCUMENTATION, open list,
	moderated list:ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES,
	open list:ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT)
  Cc: Manuel Ebner

Add missing and remove needless parentheses, brackets and curly braces.

Signed-off-by: Manuel Ebner <manuelebner@mailbox.org>
---
 Documentation/arch/arc/arc.rst                 |  2 +-
 .../arm/samsung/clksrc-change-registers.awk    |  2 +-
 Documentation/arch/arm/vlocks.rst              |  4 ++--
 .../arch/arm64/memory-tagging-extension.rst    |  2 +-
 Documentation/arch/powerpc/vas-api.rst         |  2 +-
 Documentation/arch/sparc/oradax/dax-hv-api.txt | 18 +++++++++---------
 Documentation/arch/sparc/oradax/oracle-dax.rst |  3 ++-
 7 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/Documentation/arch/arc/arc.rst b/Documentation/arch/arc/arc.rst
index 6c4d978f3f4e..553851f43be7 100644
--- a/Documentation/arch/arc/arc.rst
+++ b/Documentation/arch/arc/arc.rst
@@ -36,7 +36,7 @@ Important note on ARC processors configurability
 
 ARC processors are highly configurable and several configurable options
 are supported in Linux. Some options are transparent to software
-(i.e cache geometries, some can be detected at runtime and configured
+(i.e cache geometries), some can be detected at runtime and configured
 and used accordingly, while some need to be explicitly selected or configured
 in the kernel's configuration utility (AKA "make menuconfig").
 
diff --git a/Documentation/arch/arm/samsung/clksrc-change-registers.awk b/Documentation/arch/arm/samsung/clksrc-change-registers.awk
index 7be1b8aa7cd9..48464397088c 100755
--- a/Documentation/arch/arm/samsung/clksrc-change-registers.awk
+++ b/Documentation/arch/arm/samsung/clksrc-change-registers.awk
@@ -163,4 +163,4 @@ BEGIN {
     }
 }
 
-// && ! /clksrc_clk.*=.*{/ { print $0 }
+// && ! /clksrc_clk.*=.*{/ { print $0 }}
diff --git a/Documentation/arch/arm/vlocks.rst b/Documentation/arch/arm/vlocks.rst
index 737aa8661a21..b0ac33263086 100644
--- a/Documentation/arch/arm/vlocks.rst
+++ b/Documentation/arch/arm/vlocks.rst
@@ -102,10 +102,10 @@ Features and limitations
 	if (I_won) {
 		/* we won the town election, let's go for the state */
 		my_state = states[(this_cpu >> 8) & 0xf];
-		I_won = vlock_lock(my_state, this_cpu & 0xf));
+		I_won = vlock_lock(my_state, this_cpu & 0xf);
 		if (I_won) {
 			/* and so on */
-			I_won = vlock_lock(the_whole_country, this_cpu & 0xf];
+			I_won = vlock_lock(the_whole_country, this_cpu & 0xf);
 			if (I_won) {
 				/* ... */
 			}
diff --git a/Documentation/arch/arm64/memory-tagging-extension.rst b/Documentation/arch/arm64/memory-tagging-extension.rst
index 679725030731..e6fe428f0e2a 100644
--- a/Documentation/arch/arm64/memory-tagging-extension.rst
+++ b/Documentation/arch/arm64/memory-tagging-extension.rst
@@ -222,7 +222,7 @@ programs should not retry in case of a non-zero system call return.
 address ABI control and MTE configuration of a process as per the
 ``prctl()`` options described in
 Documentation/arch/arm64/tagged-address-abi.rst and above. The corresponding
-``regset`` is 1 element of 8 bytes (``sizeof(long))``).
+``regset`` is 1 element of 8 bytes (``sizeof(long)``).
 
 Core dump support
 -----------------
diff --git a/Documentation/arch/powerpc/vas-api.rst b/Documentation/arch/powerpc/vas-api.rst
index a9625a2fa0c6..1d0d055356e3 100644
--- a/Documentation/arch/powerpc/vas-api.rst
+++ b/Documentation/arch/powerpc/vas-api.rst
@@ -293,7 +293,7 @@ Simple example
 				//Format CRB request with compression or
 				//uncompression
 				// Refer tests for vas_copy/vas_paste
-				vas_copy((&crb, 0, 1);
+				vas_copy(&crb, 0, 1);
 				vas_paste(addr, 0, 1);
 				// Poll on csb.flags with timeout
 				// csb address is listed in CRB
diff --git a/Documentation/arch/sparc/oradax/dax-hv-api.txt b/Documentation/arch/sparc/oradax/dax-hv-api.txt
index ef1a4c2bf08b..ef6088aeaa66 100644
--- a/Documentation/arch/sparc/oradax/dax-hv-api.txt
+++ b/Documentation/arch/sparc/oradax/dax-hv-api.txt
@@ -457,7 +457,7 @@ bits set, and terminate at a CCB that has the Conditional bit set, but not the P
 Offset   Size   Field Description
                 Bits         Field Description
                 [15:14]      Secondary Input Element Size (see Section 36.2.1.1.4,
-                             “Secondary Input Element Size”
+                             “Secondary Input Element Size”)
                 [13:10]      Output Format (see Section 36.2.1.1.6, “Output Format”)
                 [9]          Padding Direction selector: A value of 1 causes padding bytes
                              to be added to the left side of output elements. A value of 0
@@ -656,7 +656,7 @@ Offset         Size            Field Description
                                [18:16]      Secondary Input Starting Offset (see Section 36.2.1.1.5, “Input
                                             Element Offsets”)
                                [15:14]      Secondary Input Element Size (see Section 36.2.1.1.4,
-                                            “Secondary Input Element Size”
+                                            “Secondary Input Element Size”)
                                [13:10]      Output Format (see Section 36.2.1.1.6, “Output Format”)
                                [9:5]        Operand size for first scan criteria value. In a scan value
                                             operation, this is one of two potential exact match values.
@@ -793,13 +793,13 @@ Offset   Size   Field Description
                 [18:16]      Secondary Input Starting Offset (see Section 36.2.1.1.5, “Input
                              Element Offsets”)
                 [15:14]      Secondary Input Element Size (see Section 36.2.1.1.4,
-                             “Secondary Input Element Size”
+                             “Secondary Input Element Size”)
                 [13:10]      Output Format (see Section 36.2.1.1.6, “Output Format”)
                 [9]          Reserved
                 [8:0]        Test value used for comparison against the most significant bits
                              in the input values, when using 2 or 3 byte input elements.
-8        8      Completion (same fields as Section 36.2.1.2, “Extract command”
-16       8      Primary Input (same fields as Section 36.2.1.2, “Extract command”
+8        8      Completion (same fields as Section 36.2.1.2, “Extract command)”
+16       8      Primary Input (same fields as Section 36.2.1.2, “Extract command”)
 24       8      Data Access Control (same fields as Section 36.2.1.2, “Extract command”,
                 except Primary Input Length Format may not use the 0x0 value)
 32       8      Secondary Input, if used by Primary Input Format. Same fields as Primary
@@ -880,7 +880,7 @@ Offset   Size   Field Description
                                        [18:16]     Secondary Input Starting Offset (see Section 36.2.1.1.5, “Input
                                                    Element Offsets”)
                                        [15:14]     Secondary Input Element Size (see Section 36.2.1.1.4,
-                                                   “Secondary Input Element Size”
+                                                   “Secondary Input Element Size”)
 
 
                                                       524
@@ -895,8 +895,8 @@ Offset   Size   Field Description
                                                     causes padding bytes to be added to the right side of output
                                                     elements.
                                        [8:0]        Reserved
-        8              8               Completion (same fields as Section 36.2.1.2, “Extract command”
-        16             8               Primary Input (same fields as Section 36.2.1.2, “Extract command”
+        8              8               Completion (same fields as Section 36.2.1.2, “Extract command”)
+        16             8               Primary Input (same fields as Section 36.2.1.2, “Extract command”)
         24             8               Data Access Control (same fields as Section 36.2.1.2, “Extract command”)
         32             8               Secondary Bit Vector Input. Same fields as Primary Input.
         40             8               Reserved
@@ -949,7 +949,7 @@ Offset   Size   Field Description
                                    [31]        If set, this CCB functions as a Sync command. If clear, this
                                                CCB functions as a No-op command.
                                    [30:0]      Reserved
-       8             8             Completion (same fields as Section 36.2.1.2, “Extract command”
+       8             8             Completion (same fields as Section 36.2.1.2, “Extract command”)
        16            46            Reserved
 
 36.2.2. CCB Completion Area
diff --git a/Documentation/arch/sparc/oradax/oracle-dax.rst b/Documentation/arch/sparc/oradax/oracle-dax.rst
index d1e14d572918..67867ea7be40 100644
--- a/Documentation/arch/sparc/oradax/oracle-dax.rst
+++ b/Documentation/arch/sparc/oradax/oracle-dax.rst
@@ -438,7 +438,8 @@ that in user land::
 The output bitmap is ready for consumption immediately after the
 completion status indicates success.
 
-Excer[t from UltraSPARC Virtual Machine Specification
+Excer?t from UltraSPARC Virtual Machine Specification
+i guess this is wrong, but i don't know what's correct
 =====================================================
 
  .. include:: dax-hv-api.txt
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH v6 02/11] x86/virt/tdx: Allocate page bitmap for Dynamic PAMT
From: Vishal Annapurve @ 2026-06-11 18:47 UTC (permalink / raw)
  To: Rick Edgecombe
  Cc: bp, dave.hansen, hpa, kas, kvm, linux-coco, linux-doc,
	linux-kernel, mingo, nik.borisov, pbonzini, seanjc, tglx, x86,
	chao.gao, yan.y.zhao, kai.huang, Kirill A. Shutemov, Binbin Wu
In-Reply-To: <20260526023515.288829-3-rick.p.edgecombe@intel.com>

On Mon, May 25, 2026 at 7:35 PM Rick Edgecombe
<rick.p.edgecombe@intel.com> wrote:
>
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>
> The TDX Physical Address Metadata Table (PAMT) holds data about the
> physical memory used by TDX, and must be allocated by the kernel during
> TDX module initialization.
>
> The exact size of the required PAMT memory is determined by the TDX module
> and may vary between TDX module versions. Currently it is approximately
> 0.4% of the system memory. This is a significant commitment, especially if
> it is not known upfront whether the machine will run any TDX guests.
>
> Each memory region that the TDX module might use needs three separate PAMT
> allocations. One for each supported page size (1GB, 2MB, 4KB). The
> TDX module supports a new feature designed to reduce PAMT overhead called
> Dynamic PAMT. At a high level, Dynamic PAMT still has the 1GB and 2MB
> levels allocated on TDX module initialization, but the 4KB level is
> allocated dynamically during runtime.
>
> However, in the details, Dynamic PAMT still needs some smaller per 4KB
> page scoped data (currently it is 1 bit per page). The TDX module exposes
> the number of bits as a separate piece of metadata than the 4KB static
> allocation for regular PAMT. Although the size is enumerated differently,
> it is handed to the TDX module in the same way the 4KB page size PAMT
> allocation is for regular, non-dynamic PAMT.
>
> Begin to implement Dynamic PAMT in the kernel by reading the bits-per-page
> needed for Dynamic PAMT. Calculate the size needed for the bitmap,
> and use it instead of the 4KB size determined for normal PAMT, in the case
> of Dynamic PAMT.
>
> Unlike the existing metadata reading code, this code is not generated by a
> script. So adjust the comment to be more generic. Also, start to adopt a
> more normal kernel code style without the tenary statements and if
> conditionals assignments that the auto generated code has.
>
> Assisted-by: Sashiko:claude-opus-4-6
> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>

Kirill's comment make sense to me.

Reviewed-by: Vishal Annapurve <vannapurve@google.com>

^ permalink raw reply

* Re: [PATCH iproute2-next 7/7] devlink: add scope filter to resource show
From: David Ahern @ 2026-06-11 18:53 UTC (permalink / raw)
  To: Tariq Toukan, Stephen Hemminger, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Andrew Lunn, David S. Miller
  Cc: Donald Hunter, Simon Horman, Jiri Pirko, Jonathan Corbet,
	Shuah Khan, Saeed Mahameed, Leon Romanovsky, Mark Bloch,
	Shuah Khan, Matthieu Baerts (NGI0), Chuck Lever, Or Har-Toov,
	Carolina Jubran, Moshe Shemesh, Shay Drori, Dragos Tatulea,
	Daniel Zahka, Shahar Shitrit, Jacob Keller, Cosmin Ratiu,
	Parav Pandit, Kees Cook, Adithya Jayachandran, Daniel Jurgens,
	netdev, linux-kernel, linux-doc, linux-rdma, linux-kselftest,
	Gal Pressman, Ido Schimmel, Jiri Pirko, Petr Machata
In-Reply-To: <20260609053953.487152-8-tariqt@nvidia.com>

On 6/8/26 11:39 PM, Tariq Toukan wrote:
> @@ -9010,13 +9029,29 @@ static int cmd_resource_show(struct dl *dl)
>  	uint16_t flags = NLM_F_REQUEST | NLM_F_ACK;
>  	struct nlmsghdr *nlh;
>  	struct resource_ctx resource_ctx = {};
> +	struct dl_opts *opts = &dl->opts;
>  	int err;
>  
> -	err = dl_argv_parse_with_selector(dl, &flags, DEVLINK_CMD_RESOURCE_DUMP,
> -					  DL_OPT_HANDLE | DL_OPT_HANDLEP,
> -					  0, 0, 0);
> -	if (err)
> -		return err;
> +	if (dl_argv_match(dl, "scope")) {
> +		const char *scopestr;
> +
> +		dl_arg_inc(dl);
> +		err = dl_argv_str(dl, &scopestr);
> +		if (err)
> +			return err;
> +		err = resource_scope_get(scopestr, &opts->resource_scope_mask);
> +		if (err)
> +			return err;
> +		opts->present |= DL_OPT_RESOURCE_SCOPE;

Comment from Claude that seems legit:

Issue found: In cmd_resource_show, the scope path sets opts->present |=
DL_OPT_RESOURCE_SCOPE without first clearing opts->present. In batch
mode, dl->opts is shared across commands, and the non-scope path
correctly resets opts->present via dl_argv_parse(). But the scope path
bypasses dl_argv_parse(), so stale bits (e.g. DL_OPT_HANDLE from a
previous dev show) remain. When dl_opts_put() runs, it writes the stale
DEVLINK_ATTR_BUS_NAME/DEV_NAME attributes into the dump request,
silently filtering to a single device instead of all devices. Fix: use =
instead of |=

Are you ok with the suggested resolution?


^ permalink raw reply

* Re: [RFC PATCH v1 00/13] exec: add spawn templates for repeated executable startup
From: John Ericson @ 2026-06-11 18:53 UTC (permalink / raw)
  To: Mateusz Guzik, Li Chen
  Cc: Andy Lutomirski, Christian Brauner, Kees Cook, Al Viro,
	linux-fsdevel, linux-api, LKML, linux-mm, linux-arch, linux-doc,
	linux-kselftest, x86, Arnd Bergmann, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin, Jan Kara,
	Jonathan Corbet, Shuah Khan
In-Reply-To: <hd3i6pxxohsjesyid7nhuic6ppp6nyoxxpwa4mny6riqvpyqec@mylfprni2yaw>

On Wed, Jun 10, 2026, at 7:40 PM, Mateusz Guzik wrote:
> [...]
>
> As I tried to explain in my previous e-mail this approach does not cut
> it because of NUMA.
>
> Suppose you have a machine with 2 nodes. The parent-to-be is running
> on node 0 and the child is intended to exec something on node 1.
>
> When the parent-to-be allocates and populates stuff, it takes place with
> memory backed by node 0. If you allocate task_struct, the file table and
> other frequently used (and modified!) objs in this way, you are
> guaranteeing performance loss due to interconnect traffic to access it.
>
> Trying to add plumbing so that all allocations respect numa placement is
> probably too cumbersome.

Are we sure that last part is true?

Let's also assume when this stuff was initially implemented, we didn't
have it. If the basic thrust of this work is to replace functions that
previously only worked on the current thread with those that worked on
either arbitrary (not yet started) threads or the current thread, would
that not prepare us for slowly migrating the allocation choice to
reflect the node of the target task (new parameter) rather than the node
of the current task over time?

(This assumes the task is pre-placed on a node before it is actually run
there, and that pre-placement happens as early in the allocation process
as possible, so subsequent allocations can read off the
partially-initialized task's node.)

"Slowly migrating" is good here! It doesn't need to be the fastest thing
out of the gate, but if this new proper spawning API gets popular as I
think it would, and there is a clear path to optimizing it per the
above, then I am confident that over the years it will happen.

> The primary example for that is looking up the binary to exec in the
> first place.
>
> userspace likes to pass paths which don't exist, meaning checking for
> the binary before any hard work is a useful optimization. Suppose the
> binary to be executed is in a container bound with a taskset using
> node 1 and the content of the fs part of the container is currently
> fully uncached.
>
> When you perform the lookup on node 0, you are populating a bunch of
> metadata (inode, dentry) using memory from that domain. But the intended
> user will only execute on node 1, again resulting in a performance loss.
>
> In order to not do it you would need to convince VFS to allocate memory
> elsewhere.

One thing I don't get about this is that isn't the cost doing a bunch of
work searching the PATH for the directories where the executable
*doesn't* exist? In the case of something like a shell that is going to
spawn a lot of processes, I would think it is *good* to keep all that
PATH crawling VFS filling to be on the shell's node, rather than the
child processes' nodes.

It is only the executable itself, the final step of the VFS crawl, that
should be loaded into the other NUMA nodes. Insofar as (unless I am
missing something) creating the process means finding the inode for the
executable but not loading those pages, aren't we OK here? Only when the
new process is actually scheduled and run must the ELF be paged into
memory, and then that will happen on the correct node.

> So I stand by my previous claim that ultimately a pristine child has to
> be created (like in this patch), but which also has to do the work on
> its own.

I have not been a kernel dev, so my apologies if I am missing things.
But in conclusion for me, the FS and other resource access patterns of
*creating a process* vs *that process itself running* do not seem
necessarily coincident to me. What you are describing as for sure a
problem might possibly be a *good thing*, if they are in fact quite
different.

> Suppose there is no explicit placement requested anywhere. Even in that
> case there are legitimate workloads which will eventually be forced to
> exec stuff on another node. Even these have a better chance retaining
> full locality if the child process does all the work.
>
> Per my previous message I don't see a clean interface to do it.
> something quasi-posix_spawn is probably the least bad way out, it will
> also allow userspace to easily wrap the new thing with posix_spawn
> itself.
>
> Also note there is another issue with the fd-based approach: the fd will
> get inherited on fork and will hang out in the child afterwards unless
> explicitly closed. Suppose you have a multithreaded program which likes
> to both fork(+no exec) and fork+exec. With the fd-based approach you
> have no means of stopping another thread from grabbing your state thanks
> to unix defaulting to copying everything. There was an attempt to fix
> this aspect with O_CLOFORK, but this got rejected.

I would think we don't need to worry about clone/fork very much, right?
I think the premise of your emails, and just about everyone else's in
this thread too, is that we agree fork+exec is bad, and the problem of
unnecessarily sharing resources is inherent to fork. Furthermore, I
think we all agree that while `O_CLOEXEC` and `O_CLOFORK` may help, both
are unsatisfying solutions because they are opt-out not opt-in, and
global to the parent process / preexec state (respectively) rather than
local to the specific fork / exec in question.

pidfds encounter these problems no more than any other
file-descriptor-based UAPI, right? And I don't think it is good to blame
any such file-descriptor-based UAPI when fork/exec are at fault.

Maybe during the transition, when some things use fork and some things
use this new API, stuff will be awkward, but I would rather that just be
an incentive to complete the transition away from fork, not a reason to
second-guess the plan.

Once the transition is complete, and everyone is diligently assembling
their child processes from scratch as is proposed, `O_CLOEXEC` and
`O_CLOFORK` are both unneeded, and oversharing privileges will be much
less common simply because "lazy coding"/"minimal typing" will only
share what is needed --- anything else is more code/keystrokes!

> Whatever exactly happens, NUMA is a sad fact of computing and needs to
> be accounted for. The approach as proposed not only does not do it, but
> it actively hinders such deployments.

Despite everything I said, I want to be clear that I do agree that NUMA
performance should be accounted for. Even if the first version isn't as
great as it could be on that metric, there should be a clear plan for
how future work can conclusively address it.

Cheers,

John

^ permalink raw reply

* Re: [RFC V2 1/3] lib/vsprintf: Add support for pgtable entries
From: Andy Shevchenko @ 2026-06-11 18:59 UTC (permalink / raw)
  To: Anshuman Khandual
  Cc: Usama Arif, linux-mm, Rasmus Villemoes, Sergey Senozhatsky,
	Petr Mladek, Steven Rostedt, Jonathan Corbet, Andrew Morton,
	David Hildenbrand, linux-kernel, linux-doc, David Hildenbrand,
	Lorenzo Stoakes, Andy Whitcroft
In-Reply-To: <d41a24c1-592e-495f-8adf-dd538b769904@arm.com>

On Thu, Jun 11, 2026 at 03:20:13PM +0530, Anshuman Khandual wrote:
> On 11/06/26 12:47 PM, Andy Shevchenko wrote:
> > On Thu, Jun 11, 2026 at 10:45:01AM +0530, Anshuman Khandual wrote:
> >> On 10/06/26 4:43 PM, Usama Arif wrote:
> >>> On Wed, 10 Jun 2026 05:35:43 +0100 Anshuman Khandual <anshuman.khandual@arm.com> wrote:

...

> >>>> +		static_assert(sizeof(pte_t) == 4 ||
> >>>> +			      sizeof(pte_t) == 8,
> >>>> +			      "pte_t size must be 4 or 8 bytes");
> > 
> > Besides occupying too many lines, why are these static asserts hidden here and
> > not declared in the global space? More wide Q is why they are needed at all?
> 
> Sure, will move these static_assert just above pxd_pointer()
> These asserts ensure
> 
> - Platforms have either 32 bit or 64 bit pgtable descriptors
> - special_hex_number() can be used to print such descriptors

I understand that. My question is do we actually _need_ them? In other words
when this may be not satisfied? Any real (non-theoretical) example?

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply

* Re: [PATCH v5 2/4] PCI: endpoint: Add DOE mailbox support for endpoint functions
From: Bjorn Helgaas @ 2026-06-11 19:11 UTC (permalink / raw)
  To: Aksh Garg
  Cc: linux-pci, linux-doc, mani, kwilczynski, bhelgaas, corbet, kishon,
	skhan, lukas, cassel, alistair, linux-arm-kernel, linux-kernel,
	s-vadapalli, danishanwar, srk
In-Reply-To: <20260610100256.1889111-3-a-garg7@ti.com>

On Wed, Jun 10, 2026 at 03:32:54PM +0530, Aksh Garg wrote:
> DOE (Data Object Exchange) is a standard PCIe extended capability
> feature introduced in the Data Object Exchange (DOE) ECN for
> PCIe r5.0. It provides a communication mechanism primarily used for
> implementing PCIe security features such as device authentication, and
> secure link establishment. Think of DOE as a sophisticated mailbox
> system built into PCIe. The root complex can send structured requests
> to the endpoint device through DOE mailboxes, and the endpoint device
> responds with appropriate data.

Please cite a spec revision and section instead of the ECN because
it's easier to find the spec than the ECN.  E.g., "PCIe r7.0, sec
6.30" or similar.

> Add the DOE support for PCIe endpoint devices, enabling endpoint
> functions to process the DOE requests from the host. The implementation
> provides framework APIs for EPC core driver and controller drivers to
> register mailboxes, and request processing with workqueues ensuring
> sequential handling per mailbox, and parallel handling across mailboxes.
> The Discovery protocol is handled internally by the DOE core.
> 
> This implementation complements the existing DOE implementation for
> root complex in drivers/pci/doe.c.
> 
> Co-developed-by: Siddharth Vadapalli <s-vadapalli@ti.com>
> Signed-off-by: Siddharth Vadapalli <s-vadapalli@ti.com>
> Signed-off-by: Aksh Garg <a-garg7@ti.com>
> ---
> 
> Changes from v4 to v5:
> - Addressed the review comments by Sashiko
> - Added refcount per DOE Mailbox to fix Use-After-Free bug
> - Change in the Abort Sequence:
>   * Instead of waiting on flush_workqueue() to clear the CANCEL flag,
>     return immediately after setting the CANCEL flag. The CANCEL flag
>     gets cleared in signal_task_complete(), allowing the mailbox to
>     accept new requests
>   * Abort sequence handling in various scenarios is updated and explained
>     in the documentation at PATCH 4/4
> 
> Changes from v3 to v4:
> - Used 'Returns' instead of 'RETURNS' in the function docstrings to
>   comply with kernel-doc format, as suggested by Manivannan Sadhasivam.
> - In pci_ep_doe_process_request(), changed the type of request buffer
>   from "const void *" to "void *", as the ownership is transferred to
>   DOE-EP framework, which is responsible to free the buffer.
> - Added "struct pci_epc *epc" to typedef "pci_ep_doe_complete_t", to be
>   used by the EPC driver.
> 
> Changes from v2 to v3:
> - Rebased on 7.1-rc1.
> 
> Changes since v1:
> - Moved the DOE-EP core file to drivers/pci/endpoint/pci-ep-doe.c, and
>   corresponding Kconfig and Makefile to match the existing naming scheme,
>   as suggested by Niklas Cassel.
> - Renamed the config from PCI_DOE_EP to PCI_ENDPOINT_DOE
> - Moved the function declarations that need not be visible outside the
>   PCI core to drivers/pci/pci.h instead to include/linux/pci-doe.h as
>   suggested by Lukas Wunner
> - Converted from synchronous to asynchronous request processing:
>   * Removed wait_for_completion() from pci_ep_doe_process_request()
>   * Function returns immediately after queuing to workqueue, hence
>     removed private data for completion in the task structure
>   * Added completion callback as an additional argument to
>     pci_ep_doe_process_request(), which takes the response and status
>     parameters as arguments (along with other required arguments), hence
>     removed task_status in the task structure
>   * Created a typedef pci_ep_doe_complete_t for completion callback
>   * Removed the pci_ep_doe_task_complete() function, as it would not be
>     required anymore with these changes
>   * Moved from INIT_WORK_ONSTACK() to INIT_WORK(), to initialize the work
>     on heap instead of stack
>   * signal_task_complete() now invokes the completion callback, once the
>     protocol handler completes its task
> - Changed from dynamic xarray-based protocol registration to static array:
>   * Removed the register/unregister protocol APIs
>   * Replaced the dynamic xarray with static array of struct pci_doe_protocol
>   * Added discovery protocol to static array, instead of treating it specially,
>     hence removed the special handling for Discovery protocol in
>     doe_ep_task_work()
>   * Updated pci_ep_doe_handle_discovery() and pci_ep_doe_find_protocol()
>     accordingly.
> - Memory Management:
>   * DOE core frees request buffer in signal_task_complete()
>     or during error handling
>   * pci_ep_doe_process_request() defines response_pl and response_pl_sz
>     as NULL and 0 respectively, whose pointer is passed to the protocol
>     handler, hence removed the arguments void **response, size_t *response_sz
>     to this function.
> - Task structure refactoring:
>   * Response buffer: void **response_pl to void *response_pl
>   * Response size: size_t *response_pl_sz to size_t response_pl_sz
>   * Changed the completion callback to type pci_ep_doe_complete_t
>   * Removed void *private and int task_status
> - Updated documentation comments of the functions according to the changes
> 
> v4: https://lore.kernel.org/all/20260522052434.802034-3-a-garg7@ti.com/
> v3: https://lore.kernel.org/all/20260427051725.223704-3-a-garg7@ti.com/
> v2: https://lore.kernel.org/all/20260401073022.215805-3-a-garg7@ti.com/
> v1: https://lore.kernel.org/all/20260213123603.420941-4-a-garg7@ti.com/
> 
>  drivers/pci/endpoint/Kconfig      |  14 +
>  drivers/pci/endpoint/Makefile     |   1 +
>  drivers/pci/endpoint/pci-ep-doe.c | 594 ++++++++++++++++++++++++++++++
>  drivers/pci/pci.h                 |  39 ++
>  include/linux/pci-doe.h           |   5 +
>  include/linux/pci-epc.h           |   3 +
>  6 files changed, 656 insertions(+)
>  create mode 100644 drivers/pci/endpoint/pci-ep-doe.c
> 
> diff --git a/drivers/pci/endpoint/Kconfig b/drivers/pci/endpoint/Kconfig
> index 8dad291be8b8..15ae16aaa58f 100644
> --- a/drivers/pci/endpoint/Kconfig
> +++ b/drivers/pci/endpoint/Kconfig
> @@ -36,6 +36,20 @@ config PCI_ENDPOINT_MSI_DOORBELL
>  	  doorbell. The RC can trigger doorbell in EP by writing data to a
>  	  dedicated BAR, which the EP maps to the controller's message address.
>  
> +config PCI_ENDPOINT_DOE
> +	bool "PCI Endpoint Data Object Exchange (DOE) support"
> +	depends on PCI_ENDPOINT
> +	help
> +	  This enables support for Data Object Exchange (DOE) protocol
> +	  on PCI Endpoint controllers. It provides a communication
> +	  mechanism through mailboxes, primarily used for PCIe security
> +	  features.
> +
> +	  Say Y here if you want be able to communicate using PCIe DOE
> +	  mailboxes.
> +
> +	  If unsure, say N.
> +
>  source "drivers/pci/endpoint/functions/Kconfig"
>  
>  endmenu
> diff --git a/drivers/pci/endpoint/Makefile b/drivers/pci/endpoint/Makefile
> index b4869d52053a..1fa176b6792b 100644
> --- a/drivers/pci/endpoint/Makefile
> +++ b/drivers/pci/endpoint/Makefile
> @@ -7,3 +7,4 @@ obj-$(CONFIG_PCI_ENDPOINT_CONFIGFS)	+= pci-ep-cfs.o
>  obj-$(CONFIG_PCI_ENDPOINT)		+= pci-epc-core.o pci-epf-core.o\
>  					   pci-epc-mem.o functions/
>  obj-$(CONFIG_PCI_ENDPOINT_MSI_DOORBELL)	+= pci-ep-msi.o
> +obj-$(CONFIG_PCI_ENDPOINT_DOE)		+= pci-ep-doe.o
> diff --git a/drivers/pci/endpoint/pci-ep-doe.c b/drivers/pci/endpoint/pci-ep-doe.c
> new file mode 100644
> index 000000000000..ea6a152461bb
> --- /dev/null
> +++ b/drivers/pci/endpoint/pci-ep-doe.c
> @@ -0,0 +1,594 @@
> +// SPDX-License-Identifier: GPL-2.0-only OR MIT
> +/*
> + * Data Object Exchange for PCIe Endpoint
> + *	PCIe r7.0, sec 6.30 DOE
> + *
> + * Copyright (C) 2026 Texas Instruments Incorporated - https://www.ti.com
> + *	Aksh Garg <a-garg7@ti.com>
> + *	Siddharth Vadapalli <s-vadapalli@ti.com>
> + */
> +
> +#define dev_fmt(fmt) "DOE EP: " fmt
> +
> +#include <linux/bitfield.h>
> +#include <linux/device.h>
> +#include <linux/pci.h>
> +#include <linux/pci-epc.h>
> +#include <linux/pci-doe.h>
> +#include <linux/refcount.h>
> +#include <linux/slab.h>
> +#include <linux/workqueue.h>
> +#include <linux/xarray.h>
> +
> +#include "../pci.h"
> +
> +/* Forward declaration of discovery protocol handler */
> +static int pci_ep_doe_handle_discovery(const void *request, size_t request_sz,
> +				       void **response, size_t *response_sz);
> +
> +/**
> + * struct pci_doe_protocol - DOE protocol handler entry
> + * @vid: Vendor ID
> + * @type: Protocol type
> + * @handler: Handler function pointer
> + */
> +struct pci_doe_protocol {
> +	u16 vid;
> +	u8 type;
> +	pci_doe_protocol_handler_t handler;
> +};
> +
> +/**
> + * struct pci_ep_doe_mb - State for a single DOE mailbox on EP
> + *
> + * This state is used to manage a single DOE mailbox capability on the
> + * endpoint side.
> + *
> + * @epc: PCI endpoint controller this mailbox belongs to
> + * @func_no: Physical function number of the function this mailbox belongs to
> + * @cap_offset: Capability offset
> + * @work_queue: Queue of work items
> + * @flags: Bit array of PCI_DOE_FLAG_* flags
> + * @refs: Refcount to manage mailbox lifetime and ensure safe cleanup
> + */
> +struct pci_ep_doe_mb {
> +	struct pci_epc *epc;
> +	u8 func_no;
> +	u16 cap_offset;
> +	struct workqueue_struct *work_queue;
> +	unsigned long flags;
> +	refcount_t refs;
> +};
> +
> +/**
> + * struct pci_ep_doe_task - Represents a single DOE request/response task
> + *
> + * @feat: DOE feature (vendor ID and type)
> + * @request_pl: Request payload
> + * @request_pl_sz: Size of request payload in bytes
> + * @response_pl: Response buffer
> + * @response_pl_sz: Size of response buffer in bytes
> + * @complete: Completion callback
> + * @work: Work structure for workqueue
> + * @doe_mb: DOE mailbox handling this task
> + */
> +struct pci_ep_doe_task {
> +	struct pci_doe_feature feat;
> +	const void *request_pl;
> +	size_t request_pl_sz;
> +	void *response_pl;
> +	size_t response_pl_sz;
> +	pci_ep_doe_complete_t complete;
> +
> +	/* Initialized by pci_ep_doe_submit_task() */
> +	struct work_struct work;
> +	struct pci_ep_doe_mb *doe_mb;
> +};
> +
> +/*
> + * Global registry of protocol handlers.
> + * When a new DOE protocol, library is added, add an entry to this array.
> + */
> +static const struct pci_doe_protocol pci_doe_protocols[] = {
> +	{
> +		.vid = PCI_VENDOR_ID_PCI_SIG,
> +		.type = PCI_DOE_FEATURE_DISCOVERY,
> +		.handler = pci_ep_doe_handle_discovery,
> +	},
> +};
> +
> +/*
> + * Combines function number and capability offset into a unique lookup key
> + * for storing/retrieving DOE mailboxes in an xarray.

s/Combines/Combine/

> + */
> +#define PCI_DOE_MB_KEY(func, offset) \
> +	(((unsigned long)(func) << 16) | (offset))
> +#define PCI_DOE_PROTOCOL_COUNT        ARRAY_SIZE(pci_doe_protocols)
> +
> +/**
> + * pci_ep_doe_init() - Initialize the DOE framework for a controller in EP mode
> + * @epc: PCI endpoint controller
> + *
> + * Initialize the DOE framework data structures. This only initializes
> + * the xarray that will hold the mailboxes.
> + *
> + * Returns: 0 on success, -errno on failure

s/Returns:/Return:/ (throughout)

Mani suggested "Returns" (from v3 to v4 above), so that's OK too.
https://origin.kernel.org/doc/html/latest/doc-guide/kernel-doc.html
includes four "Return:" examples and one "Returns:" example, and
"Return" fits better in the preferred imperative mood, so I have a
slight preference for that.

> + */
> +int pci_ep_doe_init(struct pci_epc *epc)
> +{
> +	if (!epc)
> +		return -EINVAL;

I doubt this is useful.  Obviously a bug in the caller and I'd rather
take the NULL pointer dereference, which will definitely be noticed,
than assume the buggy caller checks for failure.  The function might
as well be void then, same as pci_doe_init().

> +	xa_init(&epc->doe_mbs);
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(pci_ep_doe_init);
> +
> +/**
> + * pci_ep_doe_add_mailbox() - Add a DOE mailbox for a physical function
> + * @epc: PCI endpoint controller
> + * @func_no: Physical function number
> + * @cap_offset: Offset of the DOE capability
> + *
> + * Create and register a DOE mailbox for the specified physical function
> + * and capability offset.
> + *
> + * EPC core driver calls this for each DOE capability discovered in the config
> + * space of each endpoint function if DOE support is available for the EPC.
> + *
> + * Returns: 0 on success, -errno on failure
> + */
> +int pci_ep_doe_add_mailbox(struct pci_epc *epc, u8 func_no, u16 cap_offset)
> +{
> +	struct pci_ep_doe_mb *doe_mb;
> +	unsigned long key;
> +	int ret;
> +
> +	if (!epc)
> +		return -EINVAL;

Also doubtful about this.

> +	doe_mb = kzalloc_obj(*doe_mb, GFP_KERNEL);
> +	if (!doe_mb)
> +		return -ENOMEM;
> +
> +	doe_mb->epc = epc;
> +	doe_mb->func_no = func_no;
> +	doe_mb->cap_offset = cap_offset;
> +
> +	doe_mb->work_queue = alloc_ordered_workqueue("pci_ep_doe[%s:pf%d:offset%x]", 0,
> +						     dev_name(&epc->dev),
> +						     func_no, cap_offset);
> +	if (!doe_mb->work_queue) {
> +		dev_err(epc->dev.parent,
> +			"[pf%d:offset%x] failed to allocate work queue\n",
> +			func_no, cap_offset);
> +		ret = -ENOMEM;
> +		goto err_free;
> +	}
> +
> +	/* Add to xarray with composite key */
> +	key = PCI_DOE_MB_KEY(func_no, cap_offset);
> +	ret = xa_insert(&epc->doe_mbs, key, doe_mb, GFP_KERNEL);
> +	if (ret) {
> +		dev_err(epc->dev.parent,
> +			"[pf%d:offset%x] failed to insert mailbox: %d\n",
> +			func_no, cap_offset, ret);
> +		goto err_destroy;
> +	}
> +
> +	refcount_set(&doe_mb->refs, 1);
> +
> +	dev_dbg(epc->dev.parent,
> +		"DOE mailbox added: pf%d offset 0x%x\n",
> +		func_no, cap_offset);
> +
> +	return 0;
> +
> +err_destroy:
> +	destroy_workqueue(doe_mb->work_queue);
> +err_free:
> +	kfree(doe_mb);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(pci_ep_doe_add_mailbox);
> +
> +/**
> + * pci_ep_doe_cancel_tasks() - Cancel all pending tasks
> + * @doe_mb: DOE mailbox
> + *
> + * Cancel all pending tasks in the mailbox. Mark the mailbox as dead
> + * so no new tasks can be submitted.
> + */
> +static void pci_ep_doe_cancel_tasks(struct pci_ep_doe_mb *doe_mb)
> +{
> +	if (!doe_mb)
> +		return;

Seems like this silently hides caller bugs without even the
possibility of checking for failure?

> +	/* Mark the mailbox as dead */
> +	set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags);
> +
> +	/* Stop all pending work items from starting */
> +	set_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags);
> +}
> +
> +/**
> + * pci_ep_doe_get_mailbox() - Get DOE mailbox by function and offset
> + * @epc: PCI endpoint controller
> + * @func_no: Physical function number
> + * @cap_offset: Offset of the DOE capability
> + *
> + * Internal helper to look up a DOE mailbox by its function number and
> + * capability offset.
> + *
> + * Returns: Pointer to the mailbox or NULL if not found
> + */
> +static struct pci_ep_doe_mb *pci_ep_doe_get_mailbox(struct pci_epc *epc,
> +						    u8 func_no, u16 cap_offset)
> +{
> +	struct pci_ep_doe_mb *doe_mb;
> +	unsigned long key;
> +
> +	if (!epc)
> +		return NULL;

Same?

> +	key = PCI_DOE_MB_KEY(func_no, cap_offset);
> +
> +	xa_lock(&epc->doe_mbs);
> +
> +	doe_mb = xa_load(&epc->doe_mbs, key);
> +	if (doe_mb && !refcount_inc_not_zero(&doe_mb->refs))
> +		doe_mb = NULL;
> +
> +	xa_unlock(&epc->doe_mbs);
> +
> +	return doe_mb;
> +}
> +
> +/**
> + * pci_ep_doe_put_mailbox() - Release a reference to a DOE mailbox
> + * @doe_mb: The mailbox structure to release
> + *
> + * Drops the reference count. If this was the last active reference,
> + * the memory allocated for the mailbox structure is freed.

s/Drops/Drop/
s/If ... is freed/Free the ... if this was last active .../

> + */
> +static void pci_ep_doe_put_mailbox(struct pci_ep_doe_mb *doe_mb)
> +{
> +	if (!doe_mb)
> +		return;

Omit unless there's a reason for this.

> +	if (refcount_dec_and_test(&doe_mb->refs))
> +		kfree(doe_mb);
> +}
> +
> +/**
> + * pci_ep_doe_find_protocol() - Find protocol handler in static array
> + * @vendor: Vendor ID
> + * @type: Protocol type
> + *
> + * Look up a protocol handler in the static protocol array by matching vendor ID
> + * and protocol type.
> + *
> + * Returns: Handler function pointer or NULL if not found
> + */
> +static pci_doe_protocol_handler_t pci_ep_doe_find_protocol(u16 vendor, u8 type)
> +{
> +	int i;
> +
> +	/* Search static protocol array */
> +	for (i = 0; i < PCI_DOE_PROTOCOL_COUNT; i++) {
> +		if (pci_doe_protocols[i].vid == vendor &&
> +		    pci_doe_protocols[i].type == type)
> +			return pci_doe_protocols[i].handler;
> +	}
> +
> +	return NULL;
> +}
> +
> +/**
> + * pci_ep_doe_handle_discovery() - Handle Discovery protocol request
> + * @request: Request payload
> + * @request_sz: Request size
> + * @response: Output pointer for response buffer
> + * @response_sz: Output pointer for response size
> + *
> + * Handle the DOE Discovery protocol. The request contains an index specifying
> + * which protocol to query. This function creates a response containing the
> + * vendor ID and protocol type for the requested index, along with the next
> + * index value for further discovery:
> + *
> + * - next_index = 0: Signals this is the last protocol supported
> + * - next_index = n (non-zero): Signals more protocols available,
> + *   query index n next
> + *
> + * Returns: 0 on success, -errno on failure
> + */
> +static int pci_ep_doe_handle_discovery(const void *request, size_t request_sz,
> +				       void **response, size_t *response_sz)
> +{
> +	struct pci_doe_protocol protocol;
> +	u8 requested_index, next_index;
> +	u32 *response_pl;
> +	u32 request_pl;
> +	u16 vendor;
> +	u8 type;
> +
> +	if (request_sz != sizeof(u32))
> +		return -EINVAL;
> +
> +	request_pl = *(u32 *)request;
> +	requested_index = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX, request_pl);
> +
> +	if (requested_index >= PCI_DOE_PROTOCOL_COUNT) {
> +		/* No more protocols to report */
> +		vendor = 0;
> +		type = 0;
> +	} else {
> +		/* Get protocol from array at requested_index */
> +		protocol = pci_doe_protocols[requested_index];
> +		vendor = protocol.vid;
> +		type = protocol.type;
> +	}
> +
> +	/* Calculate next index */
> +	next_index = (requested_index + 1 < PCI_DOE_PROTOCOL_COUNT) ? requested_index + 1 : 0;
> +
> +	response_pl = kzalloc_obj(*response_pl, GFP_KERNEL);
> +	if (!response_pl)
> +		return -ENOMEM;
> +
> +	/* Build response */
> +	*response_pl = FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, vendor) |
> +		       FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE, type) |
> +		       FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX, next_index);
> +
> +	*response = response_pl;
> +	*response_sz = sizeof(*response_pl);
> +
> +	return 0;
> +}
> +
> +static void signal_task_complete(struct pci_ep_doe_task *task, int status)
> +{
> +	struct pci_ep_doe_mb *doe_mb = task->doe_mb;
> +
> +	task->complete(doe_mb->epc, doe_mb->func_no, doe_mb->cap_offset,
> +		       status, task->feat.vid, task->feat.type,
> +		       task->response_pl, task->response_pl_sz);
> +
> +	/* Clear the CANCEL flag for next DOE request */
> +	clear_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags);
> +
> +	kfree(task->request_pl);
> +	kfree(task);
> +
> +	/* Release the mailbox reference acquired during process_request */
> +	pci_ep_doe_put_mailbox(doe_mb);
> +}
> +
> +/**
> + * doe_ep_task_work() - Work function for processing DOE EP tasks
> + * @work: Work structure
> + *
> + * Process a DOE request by calling the appropriate protocol handler.
> + */
> +static void doe_ep_task_work(struct work_struct *work)
> +{
> +	struct pci_ep_doe_task *task = container_of(work, struct pci_ep_doe_task,
> +						    work);
> +	struct pci_ep_doe_mb *doe_mb = task->doe_mb;
> +	pci_doe_protocol_handler_t handler;
> +	int rc;
> +
> +	if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags)) {
> +		signal_task_complete(task, -EIO);
> +		return;
> +	}
> +
> +	/* Check if request was aborted */
> +	if (test_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags)) {
> +		signal_task_complete(task, -ECANCELED);
> +		return;
> +	}
> +
> +	/* Find protocol handler in the array */

Comment seems superfluous, given the function name.

> +	handler = pci_ep_doe_find_protocol(task->feat.vid, task->feat.type);
> +	if (!handler) {
> +		dev_warn_ratelimited(doe_mb->epc->dev.parent,
> +				     "[%d:%x] Unsupported protocol VID=%04x TYPE=%02x\n",
> +				     doe_mb->func_no, doe_mb->cap_offset,
> +				     task->feat.vid, task->feat.type);
> +		signal_task_complete(task, -EOPNOTSUPP);
> +		return;
> +	}
> +
> +	/* Call protocol handler */

Ditto.

> +	rc = handler(task->request_pl, task->request_pl_sz,
> +		     &task->response_pl, &task->response_pl_sz);
> +
> +	signal_task_complete(task, rc);
> +}
> +
> +/**
> + * pci_ep_doe_submit_task() - Submit a task to be processed
> + * @doe_mb: DOE mailbox
> + * @task: Task to submit
> + *
> + * Submit a DOE task to the workqueue for asynchronous processing.
> + *
> + * Returns: 0 on success, -errno on failure
> + */
> +static int pci_ep_doe_submit_task(struct pci_ep_doe_mb *doe_mb,
> +				  struct pci_ep_doe_task *task)
> +{
> +	if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags))
> +		return -EIO;
> +
> +	task->doe_mb = doe_mb;
> +	INIT_WORK(&task->work, doe_ep_task_work);
> +	queue_work(doe_mb->work_queue, &task->work);
> +	return 0;
> +}
> +
> +/**
> + * pci_ep_doe_process_request() - Process DOE request on endpoint
> + * @epc: PCI endpoint controller
> + * @func_no: Physical function number
> + * @cap_offset: DOE capability offset
> + * @vendor: Vendor ID from request header
> + * @type: Protocol type from request header
> + * @request: Request payload in CPU-native format
> + * @request_sz: Size of request payload (bytes)
> + * @complete: Callback to invoke upon completion
> + *
> + * Asynchronously process a DOE request received on the endpoint. The request
> + * payload should not include the DOE header (vendor/type/length). Ownership
> + * of the request buffer is transferred to DOE EP core, which frees the buffer
> + * either on error or after the completion callback fires. The protocol handler
> + * will allocate the response buffer, which the caller (controller driver) must
> + * free after use.

I guess signal_task_complete() is where the request buffer is freed
after completion?  Maybe mention the function name directly instead of
just "completion callback", e.g., "by signal_task_complete(), the
completion callback"?

> + * This function returns immediately after queuing the request. The completion
> + * callback will be invoked asynchronously from workqueue context once the
> + * request is processed. The callback receives the function number and capability
> + * offset to identify the mailbox, along with a status code (0 on success, -errno
> + * on failure), and other required arguments.

Wrap to fit in 80 columns.

> + * As per DOE specification, a mailbox processes one request at a time.
> + * Therefore, this function will never be called concurrently for the same
> + * mailbox by different callers.
> + *
> + * The caller is responsible for the conversion of the received DOE request
> + * with le32_to_cpu() before calling this function.
> + * Similarly, it is responsible for converting the response payload with
> + * cpu_to_le32() before sending it back over the DOE mailbox.

Wrap to fill 78-80 columns (or add blank line if you want a new
paragraph, but this looks like all one paragraph).

> + * The caller is also responsible for ensuring that the request size
> + * is within the limits defined by PCI_DOE_MAX_LENGTH.
> + *
> + * Returns: 0 if the request was successfully queued, -errno on failure
> + */
> +int pci_ep_doe_process_request(struct pci_epc *epc, u8 func_no, u16 cap_offset,
> +			       u16 vendor, u8 type, void *request, size_t request_sz,
> +			       pci_ep_doe_complete_t complete)

Wrap to fit in 80 columns.

> +{
> +	struct pci_ep_doe_mb *doe_mb;
> +	struct pci_ep_doe_task *task;
> +	int rc;
> +
> +	doe_mb = pci_ep_doe_get_mailbox(epc, func_no, cap_offset);
> +	if (!doe_mb) {
> +		kfree(request);
> +		return -ENODEV;
> +	}
> +
> +	task = kzalloc_obj(*task, GFP_ATOMIC);
> +	if (!task) {
> +		kfree(request);
> +		pci_ep_doe_put_mailbox(doe_mb);
> +		return -ENOMEM;
> +	}
> +
> +	task->feat.vid = vendor;
> +	task->feat.type = type;
> +	task->request_pl = request;
> +	task->request_pl_sz = request_sz;
> +	task->response_pl = NULL;
> +	task->response_pl_sz = 0;
> +	task->complete = complete;
> +
> +	rc = pci_ep_doe_submit_task(doe_mb, task);
> +	if (rc) {
> +		kfree(request);
> +		kfree(task);
> +		pci_ep_doe_put_mailbox(doe_mb);
> +		return rc;

Good candidate for error path ladder, as you did in
pci_ep_doe_add_mailbox().

> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(pci_ep_doe_process_request);
> +
> +/**
> + * pci_ep_doe_abort() - Abort DOE operations on a mailbox
> + * @epc: PCI endpoint controller
> + * @func_no: Physical function number
> + * @cap_offset: DOE capability offset
> + *
> + * Abort the queued or in-flight DOE operation for the specified mailbox.
> + * This function is called by the EP controller driver when the RC sets the
> + * ABORT bit in the DOE Control register, and the BUSY bit is set in the
> + * DOE Status Register.
> + *
> + * The function sets the CANCEL flag on the mailbox to prevent queued requests
> + * from starting, and returns immediately. The CANCEL flag gets cleared in
> + * signal_task_complete(), allowing the mailbox to accept new requests.

s/The function sets .../Set .../
s/and returns/and return/

> + *
> + * Returns: 0 on success, -errno on failure
> + */
> +int pci_ep_doe_abort(struct pci_epc *epc, u8 func_no, u16 cap_offset)
> +{
> +	struct pci_ep_doe_mb *doe_mb;
> +
> +	if (!epc)
> +		return -EINVAL;

?

> +	doe_mb = pci_ep_doe_get_mailbox(epc, func_no, cap_offset);
> +	if (!doe_mb)
> +		return -ENODEV;
> +
> +	/* Set CANCEL flag - worker will abort queued requests */
> +	set_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags);
> +
> +	dev_dbg_ratelimited(epc->dev.parent,
> +			    "DOE mailbox abort initialized: PF%d offset 0x%x\n",
> +			    func_no, cap_offset);
> +
> +	pci_ep_doe_put_mailbox(doe_mb);
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(pci_ep_doe_abort);
> +
> +/**
> + * pci_ep_doe_destroy_mb() - Destroy a single DOE mailbox
> + * @doe_mb: DOE mailbox to destroy
> + *
> + * Internal function to destroy a mailbox and free its resources.
> + */
> +static void pci_ep_doe_destroy_mb(struct pci_ep_doe_mb *doe_mb)
> +{
> +	if (!doe_mb)
> +		return;

?

> +	pci_ep_doe_cancel_tasks(doe_mb);
> +
> +	if (doe_mb->work_queue)
> +		destroy_workqueue(doe_mb->work_queue);
> +
> +	pci_ep_doe_put_mailbox(doe_mb);
> +}
> +
> +/**
> + * pci_ep_doe_destroy() - Destroy all DOE mailboxes
> + * @epc: PCI endpoint controller
> + *
> + * Destroy all DOE mailboxes and free associated resources.
> + *
> + * The EPC core driver calls this to free all DOE resources,
> + * if DOE support is available for the EPC.
> + */
> +void pci_ep_doe_destroy(struct pci_epc *epc)
> +{
> +	struct pci_ep_doe_mb *doe_mb;
> +	unsigned long index;
> +
> +	if (!epc)
> +		return;

?

> +	xa_for_each(&epc->doe_mbs, index, doe_mb) {
> +		xa_erase(&epc->doe_mbs, index);
> +		pci_ep_doe_destroy_mb(doe_mb);
> +	}
> +
> +	xa_destroy(&epc->doe_mbs);
> +}
> +EXPORT_SYMBOL_GPL(pci_ep_doe_destroy);
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 5844deee2b5f..c4a0e25625e3 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -692,6 +692,13 @@ struct pci_doe_feature {
>  	u8 type;
>  };
>  
> +struct pci_epc;
> +
> +typedef void (*pci_ep_doe_complete_t)(struct pci_epc *epc, u8 func_no,
> +				      u16 cap_offset, int status,
> +				      u16 vendor, u8 type,
> +				      void *response_pl, size_t response_pl_sz);
> +
>  #ifdef CONFIG_PCI_DOE
>  void pci_doe_init(struct pci_dev *pdev);
>  void pci_doe_destroy(struct pci_dev *pdev);
> @@ -702,6 +709,38 @@ static inline void pci_doe_destroy(struct pci_dev *pdev) { }
>  static inline void pci_doe_disconnected(struct pci_dev *pdev) { }
>  #endif
>  
> +#ifdef CONFIG_PCI_ENDPOINT_DOE
> +int pci_ep_doe_init(struct pci_epc *epc);
> +int pci_ep_doe_add_mailbox(struct pci_epc *epc, u8 func_no, u16 cap_offset);
> +int pci_ep_doe_process_request(struct pci_epc *epc, u8 func_no, u16 cap_offset,
> +			       u16 vendor, u8 type, void *request,
> +			       size_t request_sz, pci_ep_doe_complete_t complete);
> +int pci_ep_doe_abort(struct pci_epc *epc, u8 func_no, u16 cap_offset);
> +void pci_ep_doe_destroy(struct pci_epc *epc);
> +#else
> +static inline int pci_ep_doe_init(struct pci_epc *epc) { return -EOPNOTSUPP; }
> +static inline int pci_ep_doe_add_mailbox(struct pci_epc *epc, u8 func_no,
> +					 u16 cap_offset)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline int pci_ep_doe_process_request(struct pci_epc *epc, u8 func_no,
> +					     u16 cap_offset, u16 vendor, u8 type,
> +					     void *request, size_t request_sz,
> +					     pci_ep_doe_complete_t complete)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline int pci_ep_doe_abort(struct pci_epc *epc, u8 func_no, u16 cap_offset)

Wrap to fit in 80 columns.

> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline void pci_ep_doe_destroy(struct pci_epc *epc) { }
> +#endif
> +
>  #ifdef CONFIG_PCI_NPEM
>  void pci_npem_create(struct pci_dev *dev);
>  void pci_npem_remove(struct pci_dev *dev);
> diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
> index abb9b7ae8029..c46e42f3ce78 100644
> --- a/include/linux/pci-doe.h
> +++ b/include/linux/pci-doe.h
> @@ -22,6 +22,11 @@ struct pci_doe_mb;
>  /* Max data object length is 2^18 dwords */
>  #define PCI_DOE_MAX_LENGTH		(1 << 18)
>  
> +typedef int (*pci_doe_protocol_handler_t)(const void *request,
> +					  size_t request_sz,
> +					  void **response,
> +					  size_t *response_sz);
> +
>  struct pci_doe_mb *pci_find_doe_mailbox(struct pci_dev *pdev, u16 vendor,
>  					u8 type);
>  
> diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
> index 1eca1264815b..dd26294c8175 100644
> --- a/include/linux/pci-epc.h
> +++ b/include/linux/pci-epc.h
> @@ -182,6 +182,9 @@ struct pci_epc {
>  	unsigned long			function_num_map;
>  	int				domain_nr;
>  	bool				init_complete;
> +#ifdef CONFIG_PCI_ENDPOINT_DOE
> +	struct xarray			doe_mbs;
> +#endif
>  };
>  
>  /**
> -- 
> 2.34.1
> 

^ permalink raw reply

* Re: [PATCH v3 2/4] mm/zswap: Implement proactive writeback
From: Shakeel Butt @ 2026-06-11 19:12 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: YoungJun Park, Hao Jia, Johannes Weiner, mhocko, tj, mkoutny,
	roman.gushchin, Nhat Pham, akpm, chengming.zhou, muchun.song,
	cgroups, linux-mm, linux-kernel, linux-doc, Hao Jia, chrisl,
	kasong, baoquan.he
In-Reply-To: <airzE7jD9UtyR17J@google.com>

On Thu, Jun 11, 2026 at 05:45:04PM +0000, Yosry Ahmed wrote:
> On Tue, Jun 09, 2026 at 01:19:13PM +0900, YoungJun Park wrote:
> > On Mon, Jun 08, 2026 at 03:27:07PM -0700, Yosry Ahmed wrote:
> > 
> > +Chris +Kairui +Baoquan
> > 
> > Hello
> > 
> > Thanks for inviting me to the discussion, Shakeel.
> > 
> > > > > > Youngjun is working on swap tiers. At the moment he is more interested in
> > > > > > allowing a specific swap device to a memcg or not. I can imagine in future there
> > > > > > will be use-cases where there will be a need to demote data on higher tier swap
> > > > > > to lower tier swap. What would be the appropriate interface?
> > 
> > Speaking of my work on swap tiers, I recently submitted a patch and am
> > currently considering memcg integration:
> > https://lore.kernel.org/linux-mm/20260527062247.3440692-1-youngjun.park@lge.com/
> > 
> > The future use-cases imagined above seem to align with this
> > direction. (BTW, I am currently waiting for reviews/feedback from the memcg
> > folks on this patch. Any reviews would be highly appreciated!)
> > 
> > We could potentially assign a target tier
> > for writeback within the existing memory.zswap.writeback interface. 
> > 
> > For instance, '0' could mean disabled, while non-zero values could represent
> > specific tiers, which would maintain backward compatibility with the current
> > version. Alternatively, if zswap is treated as the default top tier, 
> > the `memory.swap.tiers` interface could potentially replace `memory.zswap.writeback`.
> > 
> > Furthermore, this could be expanded so that each swap tier can demote data
> > user-triggered demotion between swap tiers.
> > 
> > Based on the current patch's ideas combined with my swap tiers concept:
> > 
> > Assuming a hierarchy like:
> > zswap -> tier1 (SSD swap) -> tier2 (HDD swap) -> tier3 (Network swap)
> > 
> > We could configure the active tiers via a setting like `memory.swap.tiers`
> > (tier2 enabled, tier3 enabled).
> > 
> > For example, the concept of `echo "100M zswap_writeback_only > memory.reclaim"`
> > could be extended. A user could run `echo "100M tier2 > memory.reclaim"`
> > to explicitly trigger demotion from tier2 to tier3.
> > (BTW, if we combine these features, my personal preference for the keyword
> > format would be `<size> <demote_prefix><tier_name>`. I think it would be
> > better to explicitly indicate that it is a swap demotion by using a specific
> > prefix followed by the tier name. 
> > Or make demote prefix another key is also possible)
> 
> I am not sure if proactive demotion between swap tiers would be driven
> by memory.reclaim, I am guessing a new interface might be more suitable.
> But yes, you are right that it's very possible that
> 'zswap_writeback_only' with memory.reclaim will become obsolete once
> swap tiering matures and starts supporting things like proactive
> demotion.
> 
> Part of me wants to wait until the swap tiering interfaces are figured
> out so that we don't end up with redundant interfaces, but I also don't
> want to hold Hao's work since it doesn't directly depend on swap
> tiering.
> 
> Shakeel, how do you want to handle this? I think there's a few options:
> 
> 1. Add zswap_writeback_only now, and when we have swap tiering demotion
> it becomes a redundant interface, like memory.zswap.writeback -- or
> maybe we try to deprecate both of them at that point. It's difficult to
> remove interfaces tho, but maybe easier to stop supporting
> zswap_writeback_only.
> 
> 2. Add zswap_writeback_only behind an experimental config option, to
> unblock development but have a line of sight to dropping support once we
> have a swap tiering interface.
> 
> 3. Wait until we figure out the swap tiering interfaces and then add
> the proactive zswap writeback as part of it.
> 
> WDYT?

Is Hao's work needed for some followup work/development? The earliest Hao's
work can is 7.3, so if we aim to figure out swap tiering interfaces in next
couple of weeks then option 3 is the way to go. If swap tiers take more time
then we can discuss other options as well.

However I would need zswap folks (Yosry & Nhat) help in figuring out swap tiers
interfaces. Zswap is the current top tier swap usage in real world. I want
zswap users to eaily (and hopefully transparently) migrate to swap tiers.

^ permalink raw reply

* Re: [PATCH v5 3/4] PCI: endpoint: Add support for DOE initialization and setup in EPC core
From: Bjorn Helgaas @ 2026-06-11 19:12 UTC (permalink / raw)
  To: Aksh Garg
  Cc: linux-pci, linux-doc, mani, kwilczynski, bhelgaas, corbet, kishon,
	skhan, lukas, cassel, alistair, linux-arm-kernel, linux-kernel,
	s-vadapalli, danishanwar, srk
In-Reply-To: <20260610100256.1889111-4-a-garg7@ti.com>

On Wed, Jun 10, 2026 at 03:32:55PM +0530, Aksh Garg wrote:
> Add pci_epc_init_capabilities() in EPC core driver to initialize and
> setup the capabilities supported by the EPC driver. This calls
> pci_epc_doe_setup() to setup the DOE framework for an endpoint controller,
> which discovers the DOE capabilities (extended capability ID 0x2E), and
> registers each discovered DOE mailbox for all the functions in the
> endpoint controller.
> 
> Add pci_epc_deinit_capabilities() in EPC core driver for cleanup of the
> resources used by the capabilities of the EPC driver. This calls
> pci_ep_doe_destroy() to destroy all DOE mailboxes and free associated
> resources.
> 
> Co-developed-by: Siddharth Vadapalli <s-vadapalli@ti.com>
> Signed-off-by: Siddharth Vadapalli <s-vadapalli@ti.com>
> Signed-off-by: Aksh Garg <a-garg7@ti.com>
> ---
> 
> Changes from v4 to v5:
> - Addressed the review comments by Sashiko
> 
> Changes from v3 to v4:
> - Call DOE setup and destroy APIs directly within the EPC core, instead of
>   relying on the EPC drivers to call them individually. EPC drivers do not
>   need to explicitly handle DOE setup, rather the EPC core manages this
>   transparently. (Suggested by Manivannan Sadhasivam).
> - Removed pci_epc_doe_destroy() API, which was just calling pci_ep_doe_destroy().
>   Instead, called pci_ep_doe_destroy() directly during cleanup.
> - Called pci_ep_doe_init() before the "!epc->ops->find_ext_capability" check,
>   because if doe-capable=1 and find_ext_capability() op is undefined, this
>   would not initialize the epc->doe_mbs xarray. However during cleanup, the
>   check "!epc->ops->find_ext_capability" would be unnecessary, and it will
>   try to destroy the epc->doe_mbs xarray even when it was not initialized.
> 
> Changes from v2 to v3:
> - Rebased on 7.1-rc1.
> 
> Changes since v1:
> - New patch added to v2 (not present in v1)
> 
> v4: https://lore.kernel.org/all/20260522052434.802034-4-a-garg7@ti.com/
> v3: https://lore.kernel.org/all/20260427051725.223704-4-a-garg7@ti.com/
> v2: https://lore.kernel.org/all/20260401073022.215805-4-a-garg7@ti.com/
> 
> This patch is introduced based on the feedback provided by Manivannan
> Sadhasivam at [1].
> 
> [1]: https://lore.kernel.org/all/p57x6jleaim5w7t2k3v7tioujnaxuovfpj5euop5ogefvw23se@y5fw3che5p5d/
> 
> 
>  drivers/pci/endpoint/pci-epc-core.c | 104 ++++++++++++++++++++++++++++
>  include/linux/pci-epc.h             |   6 ++
>  2 files changed, 110 insertions(+)
> 
> diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
> index 6c3c58185fc5..e48f40eeed29 100644
> --- a/drivers/pci/endpoint/pci-epc-core.c
> +++ b/drivers/pci/endpoint/pci-epc-core.c
> @@ -14,6 +14,8 @@
>  #include <linux/pci-epf.h>
>  #include <linux/pci-ep-cfs.h>
>  
> +#include "../pci.h"
> +
>  static const struct class pci_epc_class = {
>  	.name = "pci_epc",
>  };
> @@ -842,6 +844,81 @@ void pci_epc_linkdown(struct pci_epc *epc)
>  }
>  EXPORT_SYMBOL_GPL(pci_epc_linkdown);
>  
> +/**
> + * pci_epc_doe_setup() - Discover and setup DOE mailboxes for all functions
> + * @epc: the EPC device on which DOE mailboxes has to be setup
> + *
> + * Discover DOE (Data Object Exchange) capabilities for all physical functions
> + * in the endpoint controller and register DOE mailboxes.
> + *
> + * Returns: 0 on success, -errno on failure
> + */
> +static int pci_epc_doe_setup(struct pci_epc *epc)
> +{
> +	u8 func_no, vfunc_no = 0;
> +	u16 cap_offset;
> +	int ret;
> +
> +	if (!epc->ops || !epc->ops->find_ext_capability)
> +		return -EINVAL;

I don't see anything that sets pci_epc_ops.find_ext_capability in this
series, so this looks currently unused and untestable, so likely not
mergeable as-is.  What's the plan for users of this?

> +	/* Discover DOE capabilities for all functions */
> +	for (func_no = 0; func_no < epc->max_functions; func_no++) {
> +		mutex_lock(&epc->lock);
> +		cap_offset = epc->ops->find_ext_capability(epc, func_no,
> +							   vfunc_no, 0,
> +							   PCI_EXT_CAP_ID_DOE);
> +		mutex_unlock(&epc->lock);
> +
> +		while (cap_offset) {
> +			/* Register this DOE mailbox */
> +			ret = pci_ep_doe_add_mailbox(epc, func_no, cap_offset);
> +			if (ret) {
> +				dev_warn(&epc->dev,
> +					 "[pf%d:offset %x] failed to add DOE mailbox\n",
> +					 func_no, cap_offset);
> +			}
> +
> +			mutex_lock(&epc->lock);
> +			cap_offset = epc->ops->find_ext_capability(epc, func_no,
> +								   vfunc_no, cap_offset,
> +								   PCI_EXT_CAP_ID_DOE);
> +			mutex_unlock(&epc->lock);
> +		}
> +	}
> +
> +	dev_dbg(&epc->dev, "DOE mailboxes setup complete\n");
> +	return 0;
> +}
> +
> +/**
> + * pci_epc_init_capabilities() - Initialize EPC capabilities
> + * @epc: the EPC device whose capabilities need to be initialized
> + *
> + * Invoke to initialize capabilities supported by the EPC device.

s/Invoke to initialize/Initialize/

> + */
> +static void pci_epc_init_capabilities(struct pci_epc *epc)
> +{
> +	const struct pci_epc_features *epc_features;
> +	int ret;
> +
> +	epc_features = pci_epc_get_features(epc, 0, 0);
> +	if (!epc_features)
> +		return;
> +
> +	if (IS_ENABLED(CONFIG_PCI_ENDPOINT_DOE) && epc_features->doe_capable) {
> +		ret = pci_ep_doe_init(epc);
> +		if (ret) {
> +			dev_warn(&epc->dev, "DOE initialization failed: %d\n", ret);
> +			return;
> +		}
> +
> +		ret = pci_epc_doe_setup(epc);
> +		if (ret)
> +			dev_warn(&epc->dev, "DOE setup failed: %d\n", ret);
> +	}
> +}
> +
>  /**
>   * pci_epc_init_notify() - Notify the EPF device that EPC device initialization
>   *                         is completed.
> @@ -857,6 +934,9 @@ void pci_epc_init_notify(struct pci_epc *epc)
>  	if (IS_ERR_OR_NULL(epc))
>  		return;
>  
> +	if (!epc->init_complete)
> +		pci_epc_init_capabilities(epc);
> +
>  	mutex_lock(&epc->list_lock);
>  	list_for_each_entry(epf, &epc->pci_epf, list) {
>  		mutex_lock(&epf->lock);
> @@ -890,6 +970,27 @@ void pci_epc_notify_pending_init(struct pci_epc *epc, struct pci_epf *epf)
>  }
>  EXPORT_SYMBOL_GPL(pci_epc_notify_pending_init);
>  
> +/**
> + * pci_epc_deinit_capabilities() - Cleanup EPC capabilities
> + * @epc: the EPC device whose capabilities need to be cleaned up
> + *
> + * Invoke to cleanup capabilities supported by the EPC device,
> + * and free the associated resources.

s/Invoke to cleanup/Clean up/

> + */
> +static void pci_epc_deinit_capabilities(struct pci_epc *epc)
> +{
> +	const struct pci_epc_features *epc_features;
> +
> +	epc_features = pci_epc_get_features(epc, 0, 0);
> +	if (!epc_features)
> +		return;
> +
> +	if (IS_ENABLED(CONFIG_PCI_ENDPOINT_DOE) && epc_features->doe_capable) {
> +		pci_ep_doe_destroy(epc);
> +		dev_dbg(&epc->dev, "DOE mailboxes destroyed\n");
> +	}
> +}
> +
>  /**
>   * pci_epc_deinit_notify() - Notify the EPF device about EPC deinitialization
>   * @epc: the EPC device whose deinitialization is completed
> @@ -903,6 +1004,9 @@ void pci_epc_deinit_notify(struct pci_epc *epc)
>  	if (IS_ERR_OR_NULL(epc))
>  		return;
>  
> +	if (epc->init_complete)
> +		pci_epc_deinit_capabilities(epc);
> +
>  	mutex_lock(&epc->list_lock);
>  	list_for_each_entry(epf, &epc->pci_epf, list) {
>  		mutex_lock(&epf->lock);
> diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
> index dd26294c8175..11474e337db3 100644
> --- a/include/linux/pci-epc.h
> +++ b/include/linux/pci-epc.h
> @@ -84,6 +84,8 @@ struct pci_epc_map {
>   * @start: ops to start the PCI link
>   * @stop: ops to stop the PCI link
>   * @get_features: ops to get the features supported by the EPC
> + * @find_ext_capability: ops to find extended capability offset for a function
> + *			 in endpoint controller
>   * @owner: the module owner containing the ops
>   */
>  struct pci_epc_ops {
> @@ -115,6 +117,8 @@ struct pci_epc_ops {
>  	void	(*stop)(struct pci_epc *epc);
>  	const struct pci_epc_features* (*get_features)(struct pci_epc *epc,
>  						       u8 func_no, u8 vfunc_no);
> +	u16	(*find_ext_capability)(struct pci_epc *epc, u8 func_no,
> +				       u8 vfunc_no, u16 start, u8 cap);
>  	struct module *owner;
>  };
>  
> @@ -270,6 +274,7 @@ struct pci_epc_bar_desc {
>   * @msi_capable: indicate if the endpoint function has MSI capability
>   * @msix_capable: indicate if the endpoint function has MSI-X capability
>   * @intx_capable: indicate if the endpoint can raise INTx interrupts
> + * @doe_capable: indicate if the endpoint function has DOE capability
>   * @bar: array specifying the hardware description for each BAR
>   * @align: alignment size required for BAR buffer allocation
>   */
> @@ -280,6 +285,7 @@ struct pci_epc_features {
>  	unsigned int	msi_capable : 1;
>  	unsigned int	msix_capable : 1;
>  	unsigned int	intx_capable : 1;
> +	unsigned int	doe_capable : 1;
>  	struct	pci_epc_bar_desc bar[PCI_STD_NUM_BARS];
>  	size_t	align;
>  };
> -- 
> 2.34.1
> 

^ permalink raw reply

* Re: [PATCH v5 4/4] Documentation: PCI: Add documentation for DOE endpoint support
From: Bjorn Helgaas @ 2026-06-11 19:12 UTC (permalink / raw)
  To: Aksh Garg
  Cc: linux-pci, linux-doc, mani, kwilczynski, bhelgaas, corbet, kishon,
	skhan, lukas, cassel, alistair, linux-arm-kernel, linux-kernel,
	s-vadapalli, danishanwar, srk
In-Reply-To: <20260610100256.1889111-5-a-garg7@ti.com>

On Wed, Jun 10, 2026 at 03:32:56PM +0530, Aksh Garg wrote:
> Document the architecture and implementation details for the Data Object
> Exchange (DOE) framework for PCIe Endpoint devices.
> 
> Co-developed-by: Siddharth Vadapalli <s-vadapalli@ti.com>
> Signed-off-by: Siddharth Vadapalli <s-vadapalli@ti.com>
> Signed-off-by: Aksh Garg <a-garg7@ti.com>
> ---
> 
> Changes from v4 to v5:
> - Updated the DOE Abort handling setion.
> 
> Changes from v3 to v4:
> - Updated the maximum size of the DOE object from 256KB to 1MB,
>   as per PCIe spec.
> - Updated the DOE setup and cleanup sections.
> 
> Changes from v2 to v3:
> - Rebased on 7.1-rc1.
> 
> Changes since v1:
> - Squashed the patches [1] and [2], and moved the documentation file
>   to Documentation/PCI/endpoint/pci-endpoint-doe.rst to match the existing
>   naming scheme, as suggested by Niklas Cassel
> - Updated the documentation as per the design and implementaion changes
>   made to previous patches in this series:
>   * Updated for static protocol array instead of dynamic registration
>   * Documented asynchronous callback model
>   * Updated request/response flow with new callback signature
>   * Updated memory ownership: DOE core frees request, driver frees response
>   * Updated initialization and cleanup sections for new APIs
> 
> v4: https://lore.kernel.org/all/20260522052434.802034-5-a-garg7@ti.com/
> v3: https://lore.kernel.org/all/20260427051725.223704-5-a-garg7@ti.com/
> v2: https://lore.kernel.org/all/20260401073022.215805-5-a-garg7@ti.com/
> v1: [1] https://lore.kernel.org/all/20260213123603.420941-2-a-garg7@ti.com/
>     [2] https://lore.kernel.org/all/20260213123603.420941-5-a-garg7@ti.com/
> 
>  Documentation/PCI/endpoint/index.rst          |   1 +
>  .../PCI/endpoint/pci-endpoint-doe.rst         | 333 ++++++++++++++++++
>  2 files changed, 334 insertions(+)
>  create mode 100644 Documentation/PCI/endpoint/pci-endpoint-doe.rst
> 
> diff --git a/Documentation/PCI/endpoint/index.rst b/Documentation/PCI/endpoint/index.rst
> index dd1f62e731c9..7c03d5abd2ef 100644
> --- a/Documentation/PCI/endpoint/index.rst
> +++ b/Documentation/PCI/endpoint/index.rst
> @@ -9,6 +9,7 @@ PCI Endpoint Framework
>  
>     pci-endpoint
>     pci-endpoint-cfs
> +   pci-endpoint-doe
>     pci-test-function
>     pci-test-howto
>     pci-ntb-function
> diff --git a/Documentation/PCI/endpoint/pci-endpoint-doe.rst b/Documentation/PCI/endpoint/pci-endpoint-doe.rst
> new file mode 100644
> index 000000000000..679844e36493
> --- /dev/null
> +++ b/Documentation/PCI/endpoint/pci-endpoint-doe.rst
> @@ -0,0 +1,333 @@
> +.. SPDX-License-Identifier: GPL-2.0-only OR MIT
> +
> +.. include:: <isonum.txt>
> +
> +=============================================
> +Data Object Exchange (DOE) for PCIe Endpoint
> +=============================================
> +
> +:Copyright: |copy| 2026 Texas Instruments Incorporated
> +:Author: Aksh Garg <a-garg7@ti.com>
> +:Co-Author: Siddharth Vadapalli <s-vadapalli@ti.com>
> +
> +Overview
> +========
> +
> +DOE (Data Object Exchange) is a standard PCIe extended capability feature
> +introduced in the Data Object Exchange (DOE) ECN for PCIe r5.0. It is an optional

Cite spec r7.0 as you do below, not ECN.

Wrap all to fit in 75-78 columns.

> +mechanism for system firmware/software running on root complex (host) to perform
> +:ref:`data object <data-object-term>` exchanges with an endpoint function. Each
> +data object is uniquely identified by the Vendor ID of the vendor publishing the

s/vendor ID/Vendor ID/ throughout, for consistency here and to match spec.

> +data object definition and a Data Object Type value assigned by that vendor.
> +
> +Think of DOE as a sophisticated mailbox system built into PCIe. The root complex
> +can send structured requests to the endpoint device through DOE mailboxes, and
> +the endpoint device responds with appropriate data. DOE mailboxes are implemented
> +as PCIe Extended Capabilities in endpoint devices, allowing multiple mailboxes
> +per function, each potentially supporting different data object protocols.
> +
> +The DOE support for root complex devices has already been implemented in
> +``drivers/pci/doe.c``.
> +
> +How DOE Works
> +=============
> +
> +The DOE mailbox operates through a simple request-response model:
> +
> +1. **Host sends request**: The root complex writes a data object (vendor ID, type,
> +   and payload) to the DOE write mailbox register (one DWORD at a time) of the

s/type/Type/
s/payload/Payload/ to match usage below and to indicate that these are
termed defined by the PCIe spec.

> +   endpoint function's config space and sets the GO bit in the DOE Control register

s/GO/DOE Go/ throughout to match spec.

> +   to indicate that a request is ready for processing.
> +2. **Endpoint processes**: The endpoint function reads the request from DOE write
> +   mailbox register, sets the BUSY bit in the DOE Status register, identifies the
> +   protocol of the data object, and executes the appropriate handler.
> +3. **Endpoint responds**: The endpoint function writes the response data object to the
> +   DOE read mailbox register (one DWORD at a time), and sets the READY bit in the DOE
> +   Status register to indicate that the response is ready. If an error occurs during
> +   request processing (such as unsupported protocol or handler failure), the endpoint
> +   sets the ERROR bit in the DOE Status register instead of the READY bit.
> +4. **Host reads response**: The root complex retrieves the response data from the DOE read
> +   mailbox register once the READY bit is set in the DOE Status register, and then writes
> +   any value to this register to indicate a successful read. If the ERROR bit was set,
> +   the root complex discards the response and performs error handling as needed.
> +
> +Each mailbox operates independently and can handle one transaction at a time. The
> +DOE specification supports data objects of size up to 1MB (2\ :sup:`18` dwords).
> +
> +For complete DOE capability details, refer to `PCI Express Base Specification Revision 7.0,
> +Section 6.30 - Data Object Exchange (DOE)`.
> +
> +Key Terminologies
> +=================
> +
> +.. _data-object-term:
> +
> +**Data Object**
> +  A structured, vendor-defined, or standard-defined message exchanged between
> +  root complex and endpoint function via DOE capability registers in configuration
> +  space of the function.
> +
> +**Mailbox**
> +  A DOE capability on the endpoint device, where each physical function can have
> +  multiple mailboxes.
> +
> +**Protocol**
> +  A specific type of DOE communication data object identified by a Vendor ID and Type.
> +
> +**Handler**
> +  A function that processes DOE requests of a specific protocol and generates responses.
> +
> +Architecture of DOE Implementation for Endpoint
> +===============================================
> +
> +.. code-block:: text
> +
> +       +------------------+
> +       |                  |
> +       |   Root Complex   |
> +       |                  |
> +       +--------^---------+
> +                |
> +                | Config space access
> +                |   over PCIe link
> +                |
> +     +----------v-----------+
> +     |                      |
> +     |    PCIe Controller   |
> +     |      as Endpoint     |
> +     |                      |
> +     |  +-----------------+ |
> +     |  |   DOE Mailbox   | |
> +     |  +-------^---------+ |
> +     +----------|-----------+
> +    +-----------|---------------------------------------------------------------+
> +    |           |                                       +--------------------+  |
> +    | +---------v--------+           Allocate           |  +--------------+  |  |
> +    | |                  |-------------------------------->|   Request    |  |  |
> +    | |   EP Controller  |                            +--->|    Buffer    |  |  |
> +    | |      Driver      |             Free           | |  +--------------+  |  |
> +    | |                  |--------------------------+ | |                    |  |
> +    | +--------^---------+                          | | |                    |  |
> +    |          |                                    | | |                    |  |
> +    |          |                                    | | |                    |  |
> +    |          | pci_ep_doe_process_request()       | | |                    |  |
> +    |          |                                    | | |                    |  |
> +    | +--------v---------+             Free         | | |                    |  |
> +    | |                  |----------------------------+ |         DDR        |  |
> +    | |    DOE EP Core   |<----+                    |   |                    |  |
> +    | |  (pci-ep-doe.c)  |     |     Discovery      |   |                    |  |
> +    | |                  |-----+  Protocol Handler  |   |                    |  |
> +    | +--------^---------+                          |   |                    |  |
> +    |          |                                    |   |                    |  |
> +    |          | protocol_handler()                 |   |                    |  |
> +    |          |                                    |   |                    |  |
> +    | +--------v---------+                          |   |                    |  |
> +    | |                  |                          |   |  +--------------+  |  |
> +    | | Protocol Handler |                          +----->|   Response   |  |  |
> +    | |      Module      |-------------------------------->|    Buffer    |  |  |
> +    | | (CMA/SPDM/Other) |           Allocate           |  +--------------+  |  |
> +    | |                  |                              |                    |  |
> +    | +------------------+                              |                    |  |
> +    |                                                   +--------------------+  |
> +    +---------------------------------------------------------------------------+
> +
> +Initialization and Cleanup
> +--------------------------
> +
> +**Framework Initialization and DOE Setup**
> +
> +The EPC core automatically initializes and sets up DOE mailboxes through the
> +``pci_epc_init_capabilities()`` internal function, which is invoked during
> +``pci_epc_init_notify()`` when the controller driver calls this API.
> +Controller drivers do not need to explicitly handle DOE initialization,
> +rather the EPC core manages this transparently.
> +
> +DOE initialization only occurs when the EPC driver reports DOE capability
> +through the ``doe_capable`` flag in its ``pci_epc_features``.
> +
> +This internal function performs the following steps:
> +
> +1. Calls ``pci_ep_doe_init(epc)`` to initialize the xarray data structure
> +   (a resizable array data structure defined in linux) named ``doe_mbs`` that
> +   stores metadata of DOE mailboxes for the controller in ``struct pci_epc``.
> +2. Calls ``pci_epc_doe_setup(epc)`` to discover all DOE capabilities in the
> +   endpoint function's configuration space for each function. For each
> +   discovered DOE capability, calls ``pci_ep_doe_add_mailbox(epc, func_no,
> +   cap_offset)`` to register the mailbox.
> +
> +Each DOE mailbox structure created by ``pci_ep_doe_add_mailbox()`` gets an
> +ordered workqueue allocated for processing DOE requests sequentially for that
> +mailbox, enabling concurrent request handling across different mailboxes. Each
> +mailbox is uniquely identified by the combination of physical function number
> +and capability offset for that controller.
> +
> +**Cleanup**
> +
> +The EPC core automatically cleans up DOE mailboxes through the
> +``pci_epc_deinit_capabilities()`` internal function, which is invoked during
> +``pci_epc_deinit_notify()`` when the controller driver calls this API.
> +Controller drivers do not need to explicitly handle DOE cleanup, rather
> +the EPC core manages this transparently.
> +
> +DOE cleanup only occurs when the EPC device reported DOE capability
> +through the ``doe_capable`` flag in its ``pci_epc_features``.
> +
> +This internal function calls ``pci_ep_doe_destroy(epc)``, which destroys all
> +registered mailboxes, cancels any pending tasks, flushes and destroys the
> +workqueues, and frees all memory allocated to the mailboxes.
> +
> +Protocol Handler Support
> +------------------------
> +
> +Protocol implementations (such as CMA, SPDM, or vendor-specific protocols) are
> +supported through a static array of protocol handlers.
> +
> +When a new DOE protocol library is introduced, its handler function is added to
> +the static ``pci_doe_protocols`` array in ``drivers/pci/endpoint/pci-ep-doe.c``.
> +The discovery protocol (VID = 0x0001 (PCI-SIG vendor ID), Type = 0x00 (discovery
> +protocol)) is included in this static array and handled internally by the
> +DOE EP core.
> +
> +Request Handling
> +----------------
> +
> +The complete flow of a DOE request from the root complex to the response:
> +
> +**Step 1: Root Complex → EP Controller Driver**
> +
> +The root complex writes a DOE request (Vendor ID, Type, and Payload) to the
> +DOE write mailbox register in the endpoint function's configuration space and sets

s/write mailbox register/Write Data Mailbox Register/ to match spec.

Specifically, "in the endpoint function's DOE Capability".

> +the GO bit in the DOE Control register, indicating that the request is ready for
> +processing.
> +
> +**Step 2: EP Controller Driver → DOE EP Core**
> +
> +The controller driver reads the request header to determine the data object
> +length. Based on this length field, it allocates a request buffer in memory
> +(DDR) of the appropriate size. The driver then reads the complete request
> +payload from the DOE write mailbox register and converts the data from
> +little-endian format (the format followed in the PCIe transactions over the
> +link) to CPU-native format using ``le32_to_cpu()``. The driver defines a
> +completion callback function with signature ``void (*complete)(struct pci_epc *epc,
> +u8 func_no, u16 cap_offset, int status, u16 vendor, u8 type, void *response_pl,
> +size_t response_pl_sz)`` to be invoked when the request processing completes.
> +The driver then calls ``pci_ep_doe_process_request(epc, func_no, cap_offset,
> +vendor, type, request, request_sz, complete)`` to hand off the request to the
> +DOE EP core. This function returns immediately after queuing the work
> +(without blocking), and the driver sets the BUSY bit in the DOE Status register.
> +
> +**Step 3: DOE EP Core Processing**
> +
> +The DOE EP core creates a task structure and submits it to the mailbox's ordered
> +workqueue. This ensures that requests for each mailbox are processed
> +sequentially, one at a time, as required by the DOE specification. It looks up
> +the protocol handler based on the Vendor ID and Type from the request header,
> +and executes the handler function.
> +
> +**Step 4: Protocol Handler Execution**
> +
> +The workqueue executes the task by calling the registered protocol handler:
> +``handler(request, request_sz, &response, &response_sz)``. The handler processes
> +the request, allocates a response buffer in memory (DDR), builds the response
> +data, and returns the response pointer and size. For the discovery protocol,
> +the DOE EP core handles this directly without invoking an external handler.
> +
> +**Step 5: DOE EP Core → EP Controller Driver**
> +
> +After the protocol handler completes, the DOE EP core frees the request buffer,
> +and invokes the completion callback provided by the controller driver asynchronously.
> +The callback receives the struct pci_epc, function number, capability offset (to
> +identify the mailbox), status code indicating the result of request processing,
> +vendor ID and type of the data object, the response buffer, and its size.
> +
> +**Step 6: EP Controller Driver → Root Complex**
> +
> +The controller driver converts the response from CPU-native format to
> +little-endian format using ``cpu_to_le32()``, writes the response to DOE read
> +mailbox register, and sets the READY bit in the DOE Status register. The root
> +complex then reads the response from the read mailbox register. Finally, the controller
> +driver frees the response buffer (which the handler allocated).

s/READY/Data Object Read/ to match spec

s/read mailbox/DOE Read Data Mailbox/ throughout to match spec

> +Asynchronous Request Processing
> +-------------------------------
> +
> +The DOE-EP framework implements asynchronous request processing because an
> +endpoint function can have multiple instances of DOE mailboxes, and requests may
> +be interleaved across these mailboxes. Request processing of one mailbox should
> +not result in blocking request processing of other mailboxes. Hence, requests
> +on each mailbox need to be handled in parallel for optimization.
> +
> +For the EP controller driver to handle requests on multiple mailboxes in
> +parallel, ``pci_ep_doe_process_request()`` must be asynchronous. The function
> +returns immediately after submitting the request to the mailbox's workqueue,
> +without waiting for the request to complete. A completion callback provided by
> +the controller driver is invoked asynchronously when request processing
> +finishes. This asynchronous design enables concurrent processing of requests
> +across different mailboxes.
> +
> +Abort Handling
> +--------------
> +
> +The DOE specification allows the root complex to abort ongoing DOE operations
> +by setting the ABORT bit in the DOE Control register.

Use spec name for ABORT

> +**Trigger**
> +
> +When the root complex sets the ABORT bit, the EP controller driver detects this
> +condition (typically in an interrupt handler or register polling routine). The
> +action taken depends on the timing of the abort:
> +
> +- **ABORT before request transfer**: If the ABORT bit is set before the root complex
> +  transfers the request to the mailbox registers, the controller driver should not
> +  call ``pci_ep_doe_abort()`` API.
> +
> +- **ABORT during request transfer**: If the ABORT bit is set while the root complex
> +  is still transferring the request to the mailbox registers, the controller driver
> +  should discard the request, and should not call ``pci_ep_doe_abort()`` and
> +  ``pci_ep_doe_process_request()`` APIs in the respective IRQ handlers.
> +
> +- **ABORT after request submission**: If the ABORT bit is set after the request
> +  has been fully received and submitted to the DOE EP core via
> +  ``pci_ep_doe_process_request()``, the controller driver must call
> +  ``pci_ep_doe_abort(epc, func_no, cap_offset)`` for the affected mailbox to
> +  perform abort sequence in the DOE EP core.
> +
> +**Abort Sequence**
> +
> +The abort function sets the CANCEL flag on the mailbox to prevent queued requests
> +from starting. Instead of waiting for the workqueue to flush, it returns immediately.
> +
> +The CANCEL flag gets cleared after invoking the completion callback, allowing the
> +mailbox to accept new requests.

I guess "CANCEL" is a software thing.  Using the spec names for the
hardware bits will help distinguish CANCEL, ABORT, ERROR, BUSY, etc.

> +Queued requests that have not started execution will be aborted with an error
> +status. The currently executing request will complete normally, and the controller
> +will reject the response if it arrives after the abort sequence has been triggered.
> +
> +.. note::
> +   Independent of when the ABORT bit is triggered, the controller driver must
> +   clear the ERROR, BUSY, and READY bits in the DOE Status register after
> +   completing the abort operation to reset the mailbox to an idle state.
> +
> +Error Handling
> +--------------
> +
> +Errors can occur during DOE request processing for various reasons, such as
> +unsupported protocols, handler failures, or memory allocation failures.
> +
> +**Error Detection**
> +
> +When an error occurs during DOE request processing, the DOE EP core propagates this error
> +back to the controller driver either through the ``pci_ep_doe_process_request()`` return value,
> +or the status code passed to the completion callback.
> +
> +**Error Response**
> +
> +When the controller driver receives an error code, it sets the ERROR bit in the DOE Status
> +register instead of writing a response to the read mailbox register, and frees the buffers.
> +
> +API Reference
> +=============
> +
> +.. kernel-doc:: drivers/pci/endpoint/pci-ep-doe.c
> +   :export:
> -- 
> 2.34.1
> 

^ permalink raw reply

* Re: [RFC V2 0/3] lib/vsprintf: Add support for pgtable entries
From: Matthew Wilcox @ 2026-06-11 19:15 UTC (permalink / raw)
  To: Anshuman Khandual
  Cc: linux-mm, Andy Shevchenko, Rasmus Villemoes, Sergey Senozhatsky,
	Petr Mladek, Steven Rostedt, Jonathan Corbet, Andrew Morton,
	David Hildenbrand, linux-kernel, linux-doc
In-Reply-To: <20260610043545.3725735-1-anshuman.khandual@arm.com>

On Wed, Jun 10, 2026 at 05:35:42AM +0100, Anshuman Khandual wrote:
> Printing page table entries has been a common requirement both in generic
> and platform memory management for various purposes. Hence let's create a
> dedicated printk format for such entries which will also help standardize
> pgtable printing across different platforms.

You didn't address my objection here:

https://lore.kernel.org/linux-mm/aFQP8LzVMctf6XH5@casper.infradead.org/

ie there is now no typechecking possible.  So you've made it more
dangerous.  I reiterate my NACK to the concept, not to the implementation.

^ permalink raw reply

* Re: [PATCH] Documentation: arch: fix brackets
From: Randy Dunlap @ 2026-06-11 19:31 UTC (permalink / raw)
  To: Manuel Ebner, Vineet Gupta, Jonathan Corbet, Shuah Khan,
	Krzysztof Kozlowski, Peter Griffin, Alim Akhtar, Catalin Marinas,
	Will Deacon, Madhavan Srinivasan, Michael Ellerman,
	Nicholas Piggin, Christophe Leroy,
	open list:SYNOPSYS ARC ARCHITECTURE, open list:DOCUMENTATION,
	open list,
	moderated list:ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES,
	open list:ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT)
In-Reply-To: <20260611183525.153058-2-manuelebner@mailbox.org>

Hi,
A couple of comments but overall this is a nice cleanup.


On 6/11/26 11:35 AM, Manuel Ebner wrote:
> Add missing and remove needless parentheses, brackets and curly braces.
> 
> Signed-off-by: Manuel Ebner <manuelebner@mailbox.org>
> ---
>  Documentation/arch/arc/arc.rst                 |  2 +-
>  .../arm/samsung/clksrc-change-registers.awk    |  2 +-
>  Documentation/arch/arm/vlocks.rst              |  4 ++--
>  .../arch/arm64/memory-tagging-extension.rst    |  2 +-
>  Documentation/arch/powerpc/vas-api.rst         |  2 +-
>  Documentation/arch/sparc/oradax/dax-hv-api.txt | 18 +++++++++---------
>  Documentation/arch/sparc/oradax/oracle-dax.rst |  3 ++-
>  7 files changed, 17 insertions(+), 16 deletions(-)
> 
> diff --git a/Documentation/arch/arc/arc.rst b/Documentation/arch/arc/arc.rst
> index 6c4d978f3f4e..553851f43be7 100644
> --- a/Documentation/arch/arc/arc.rst
> +++ b/Documentation/arch/arc/arc.rst
> @@ -36,7 +36,7 @@ Important note on ARC processors configurability
>  
>  ARC processors are highly configurable and several configurable options
>  are supported in Linux. Some options are transparent to software
> -(i.e cache geometries, some can be detected at runtime and configured
> +(i.e cache geometries), some can be detected at runtime and configured

   (i.e., cache geometries),
or even
   (e.g., cache geometries),

>  and used accordingly, while some need to be explicitly selected or configured
>  in the kernel's configuration utility (AKA "make menuconfig").
>  


> diff --git a/Documentation/arch/sparc/oradax/oracle-dax.rst b/Documentation/arch/sparc/oradax/oracle-dax.rst
> index d1e14d572918..67867ea7be40 100644
> --- a/Documentation/arch/sparc/oradax/oracle-dax.rst
> +++ b/Documentation/arch/sparc/oradax/oracle-dax.rst
> @@ -438,7 +438,8 @@ that in user land::
>  The output bitmap is ready for consumption immediately after the
>  completion status indicates success.
>  
> -Excer[t from UltraSPARC Virtual Machine Specification
> +Excer?t from UltraSPARC Virtual Machine Specification

   Excerpt

> +i guess this is wrong, but i don't know what's correct
>  =====================================================
>  
>   .. include:: dax-hv-api.txt

and then you can add
Reviewed-by: Randy Dunlap <rdunlap@infradead.org>

thanks.
-- 
~Randy

^ permalink raw reply

* Re: [RFC V2 0/3] lib/vsprintf: Add support for pgtable entries
From: Andy Shevchenko @ 2026-06-11 19:33 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Anshuman Khandual, linux-mm, Rasmus Villemoes, Sergey Senozhatsky,
	Petr Mladek, Steven Rostedt, Jonathan Corbet, Andrew Morton,
	David Hildenbrand, linux-kernel, linux-doc
In-Reply-To: <aisJbeVVxxNuYxQ6@casper.infradead.org>

On Thu, Jun 11, 2026 at 08:15:57PM +0100, Matthew Wilcox wrote:
> On Wed, Jun 10, 2026 at 05:35:42AM +0100, Anshuman Khandual wrote:
> > Printing page table entries has been a common requirement both in generic
> > and platform memory management for various purposes. Hence let's create a
> > dedicated printk format for such entries which will also help standardize
> > pgtable printing across different platforms.
> 
> You didn't address my objection here:
> 
> https://lore.kernel.org/linux-mm/aFQP8LzVMctf6XH5@casper.infradead.org/
> 
> ie there is now no typechecking possible.  So you've made it more
> dangerous.  I reiterate my NACK to the concept, not to the implementation.

But this is more of a global question, how do we check the validity of
the parameters of pointer extensions in the kernel? Does anybody go to
commit into GCC plugin or so for this job?

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply

* Re: [RFC V2 0/3] lib/vsprintf: Add support for pgtable entries
From: Matthew Wilcox @ 2026-06-11 19:37 UTC (permalink / raw)
  To: Andy Shevchenko
  Cc: Anshuman Khandual, linux-mm, Rasmus Villemoes, Sergey Senozhatsky,
	Petr Mladek, Steven Rostedt, Jonathan Corbet, Andrew Morton,
	David Hildenbrand, linux-kernel, linux-doc
In-Reply-To: <aisNjxHW369cbKiq@ashevche-desk.local>

On Thu, Jun 11, 2026 at 10:33:35PM +0300, Andy Shevchenko wrote:
> On Thu, Jun 11, 2026 at 08:15:57PM +0100, Matthew Wilcox wrote:
> > On Wed, Jun 10, 2026 at 05:35:42AM +0100, Anshuman Khandual wrote:
> > > Printing page table entries has been a common requirement both in generic
> > > and platform memory management for various purposes. Hence let's create a
> > > dedicated printk format for such entries which will also help standardize
> > > pgtable printing across different platforms.
> > 
> > You didn't address my objection here:
> > 
> > https://lore.kernel.org/linux-mm/aFQP8LzVMctf6XH5@casper.infradead.org/
> > 
> > ie there is now no typechecking possible.  So you've made it more
> > dangerous.  I reiterate my NACK to the concept, not to the implementation.
> 
> But this is more of a global question, how do we check the validity of
> the parameters of pointer extensions in the kernel? Does anybody go to
> commit into GCC plugin or so for this job?

I agree that it's a global question that it would be great for somebody
to answer.  But it's specifically a problem for this patchset because:

 - It's really easy to get confused about which page table level you're
   working on.  And hugetlbfs deliberately increases that confusion.
 - Different levels of the page tables actually do have different sizes
   on some architectures, so if you think you're looking at a pointer to a
   64-bit quantity when it's really a pointer to a 32-bit quantity, things
   Will Go Wrong (or vice-versa.  And some architctures are big-endian)
 - But on x86-64, Everything Is Fine because all levels of the page table
   are basically identical, so you'll never notice there's a problem.

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox