Linux Documentation
 help / color / mirror / Atom feed
* Re: [PATCH v6 10/20] nfsd: add notification handlers for dir events
From: Jeff Layton @ 2026-06-12 18:36 UTC (permalink / raw)
  To: Chuck Lever, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs
In-Reply-To: <b94c3e40-0520-4e83-9b4f-53a9325cecfe@app.fastmail.com>

On Fri, 2026-06-12 at 13:51 -0400, Chuck Lever wrote:
> On Thu, Jun 11, 2026, at 1:50 PM, Jeff Layton wrote:
> > Add the necessary parts to accept a fsnotify callback for a directory
> > change event and create a CB_NOTIFY request for it. When a dir nfsd_file
> > is created, set a handle_event callback to handle the notification.
> > 
> > Use that to allocate a nfsd_notify_event object and then hand off a
> > reference to each delegation's CB_NOTIFY. If anything fails along the
> > way, recall any affected delegations.
> > 
> > Signed-off-by: Jeff Layton <jlayton@kernel.org>
> > ---
> 
> > diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> > index ca4dd2f969eb..59378751d596 100644
> > --- a/fs/nfsd/nfs4callback.c
> > +++ b/fs/nfsd/nfs4callback.c
> 
> > @@ -904,13 +908,45 @@ static void nfs4_xdr_enc_cb_notify(struct rpc_rqst *req,
> >  	encode_cb_sequence4args(xdr, cb, &hdr);
> > 
> >  	/*
> > -	 * FIXME: get stateid and fh from delegation. Inline the cna_changes
> > -	 * buffer, and zero it.
> > +	 * nfsd4_cb_notify_prepare() sized the payload against a single page,
> > +	 * but did not account for the compound, sequence, stateid, and
> > +	 * filehandle encoded here. If the variable-length encode overflows the
> > +	 * backchannel send buffer, roll back to before the operation so that a
> > +	 * truncated CB_NOTIFY is never placed on the wire.
> >  	 */
> > -	xdrgen_encode_CB_NOTIFY4args(xdr, &args);
> > +	start = xdr_stream_pos(xdr);
> > +
> > +	p = xdr_reserve_space(xdr, 4);
> > +	if (!p)
> > +		goto out_err;
> > +	*p = cpu_to_be32(OP_CB_NOTIFY);
> 
> Please use xdr_stream_encode_u32 for this purpose.
> 

Ok

> 
> > +
> > +	args.cna_stateid.seqid = dp->dl_stid.sc_stateid.si_generation;
> > +	memcpy(&args.cna_stateid.other, &dp->dl_stid.sc_stateid.si_opaque,
> > +	       ARRAY_SIZE(args.cna_stateid.other));
> > +	args.cna_fh.len = dp->dl_stid.sc_file->fi_fhandle.fh_size;
> > +	args.cna_fh.data = dp->dl_stid.sc_file->fi_fhandle.fh_raw;
> > +	args.cna_changes.count = ncn->ncn_nf_cnt;
> > +	args.cna_changes.element = ncn->ncn_nf;
> > +	if (!xdrgen_encode_CB_NOTIFY4args(xdr, &args))
> > +		goto out_err;
> > 
> >  	hdr.nops++;
> >  	encode_cb_nops(&hdr);
> > +	return;
> > +
> > +out_err:
> > +	/*
> > +	 * Drop the CB_NOTIFY op and emit a valid CB_SEQUENCE-only compound so
> > +	 * the client still advances its slot. Flag the failure so the done
> > +	 * handler recalls the delegation and the missed notification is not
> > +	 * silently lost. The flag is written here in the transmit path and read
> > +	 * in the done handler; the two are serialized phases of the same
> > +	 * rpc_task, so no additional barrier is needed.
> > +	 */
> > +	ncn->ncn_encode_err = true;
> 
> This flag is zeroed only once, at allocation time in alloc_init_dir_deleg().
> It is never cleared in nfsd4_cb_notify_prepare().
> 
> Since nfsd4_cb_notify_release() can requeue the callback (via
> nfsd4_run_cb_notify) when events arrive while a callback is in flight,
> ->prepare may encode cleanly and return true, but nfsd4_cb_notify_done()
> still observes the stale ncn_encode_err == true and calls
> nfsd_break_one_deleg() -- discarding a good notification and recalling
> the delegation unnecessarily.
> 

Ok, so we need to reset this in ->prepare.

> 
> > +	xdr_truncate_encode(xdr, start);
> > +	encode_cb_nops(&hdr);
> >  }
> > 
> >  static int nfs4_xdr_dec_cb_notify(struct rpc_rqst *rqstp,
> 
> > diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> > index 0a15d7f3b543..513cbc1a583f 100644
> > --- a/fs/nfsd/nfs4state.c
> > +++ b/fs/nfsd/nfs4state.c
> 
> > @@ -3471,19 +3472,146 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
> >  	nfs4_put_stid(&dp->dl_stid);
> >  }
> > 
> > +static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
> > +{
> > +	bool queued;
> > +
> > +	if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags))
> > +		return;
> > +
> > +	/*
> > +	 * We're assuming the state code never drops its reference
> > +	 * without first removing the lease.  Since we're in this lease
> > +	 * callback (and since the lease code is serialized by the
> > +	 * flc_lock) we know the server hasn't removed the lease yet, and
> > +	 * we know it's safe to take a reference.
> > +	 */
> > +	refcount_inc(&dp->dl_stid.sc_count);
> > +	queued = nfsd4_run_cb(&dp->dl_recall);
> > +	WARN_ON_ONCE(!queued);
> > +	if (!queued) {
> > +		refcount_dec(&dp->dl_stid.sc_count);
> > +		clear_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags);
> > +	}
> > +}
> 
> nfsd_break_one_deleg() does an unconditional
> refcount_inc(&dp->dl_stid.sc_count), and its comment justifies this
> with "the lease code is serialized by the flc_lock." That invariant
> holds when called from nfsd_break_deleg_cb() under flc_lock, but
> nfsd4_cb_notify_prepare() runs on a workqueue WITHOUT flc_lock. Its
> out_recall: path calls nfsd_break_one_deleg(dp)
> directly. The delegation can be concurrently destroyed with sc_count
> already at zero, making this an inc-from-zero.
> 
> The dispatch path nfsd4_run_cb_notify already does this correctly with
> refcount_inc_not_zero. The out_recall path needs the same guard (skip
> the recall / bail if the refcount is already zero).
> 
> I notice that the last unapplied patch ("nfsd: add
> support to CB_NOTIFY for dir attribute changes") rewrites the guard
> "if (count > NOTIFY4_EVENT_QUEUE_SIZE)" into "if (count > limit)" with
> limit = NOTIFY4_EVENT_QUEUE_SIZE - 1 when NOTIFY4_CHANGE_DIR_ATTRS is
> requested. That turns the previously-dead overflow branch into a live,
> routine path to out_recall, which adds another normal-operation route
> into this unlocked recall.
> 

This wart has been there a long time, and we just papered over it with
the lock.

I think we need to do a refcount_inc_not_zero() in
nfsd_break_one_deleg() and just return without queuing the callback if
it's already at 0. That means that the recall is racing with the lease
teardown, so I think the right thing to do is to not send the recall in
that case.
-- 
Jeff Layton <jlayton@kernel.org>

^ permalink raw reply

* Re: [PATCH v6 20/20] nfsd: add support to CB_NOTIFY for dir attribute changes
From: Chuck Lever @ 2026-06-12 18:21 UTC (permalink / raw)
  To: Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs
In-Reply-To: <20260611-dir-deleg-v6-20-4c45080e5f3f@kernel.org>



On Thu, Jun 11, 2026, at 1:50 PM, Jeff Layton wrote:
> If the client requested dir attribute change notifications, send those
> alongside any set of add/remove/rename events. Note that the server will
> still recall the delegation on a SETATTR, so these are only sent for
> changes to child dirents.
>
> The child filehandle returned in these notifications is composed by
> setup_notify_fhandle() without going through fh_compose(), so it does
> not get a MAC appended. On exports configured with NFSEXP_SIGN_FH the
> client would then get back an unsigned filehandle that fh_verify()
> rejects as stale. Pass the delegation's export down to
> setup_notify_fhandle() and append the MAC with fh_append_mac() when the
> export requires signed filehandles; if signing fails, drop the
> filehandle attribute rather than handing out an unusable one.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/nfs4state.c | 25 ++++++++++++++++--
>  fs/nfsd/nfs4xdr.c   | 73 +++++++++++++++++++++++++++++++++++++++++++++--------
>  fs/nfsd/xdr4.h      |  2 ++
>  3 files changed, 88 insertions(+), 12 deletions(-)
>
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 12627afb604f..e394278fb92e 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -3503,10 +3503,15 @@ nfsd4_cb_notify_prepare(struct nfsd4_callback *cb)
>  	struct nfsd_notify_event *events[NOTIFY4_EVENT_QUEUE_SIZE];
>  	struct xdr_buf xdr = { .buflen = PAGE_SIZE * NOTIFY4_PAGE_ARRAY_SIZE,
>  			       .pages  = ncn->ncn_pages };
> +	int limit = NOTIFY4_EVENT_QUEUE_SIZE;

When a client requests NOTIFY4_CHANGE_DIR_ATTRS, the CB_NOTIFY event
queue can fill to NOTIFY4_EVENT_QUEUE_SIZE (3) events while the consumer
only accepts 2 (it reserves a slot for the dir-attr-change entry). The
resulting overflow path in nfsd4_cb_notify_prepare() recalls the
delegation without draining the queue, and nfsd4_cb_notify_release()
then requeues the same callback indefinitely.


>  	struct xdr_stream stream;
>  	struct nfsd_file *nf;
> -	int count, i;
>  	bool error = false;
> +	int count, i;
> +
> +	/* Save a slot for dir attr update if requested */
> +	if (dp->dl_notify_mask & BIT(NOTIFY4_CHANGE_DIR_ATTRS))
> +		--limit;
> 
>  	xdr_init_encode_pages(&stream, &xdr);
> 
> @@ -3520,7 +3525,7 @@ nfsd4_cb_notify_prepare(struct nfsd4_callback *cb)
>  	}
> 
>  	/* we can't keep up! */
> -	if (count > NOTIFY4_EVENT_QUEUE_SIZE) {
> +	if (count > limit) {
>  		spin_unlock(&ncn->ncn_lock);
>  		goto out_recall;
>  	}
> @@ -3567,6 +3572,22 @@ nfsd4_cb_notify_prepare(struct nfsd4_callback 
> *cb)
>  		nfsd_notify_event_put(nne);
>  	}
>  	if (!error) {
> +		if (dp->dl_notify_mask & BIT(NOTIFY4_CHANGE_DIR_ATTRS)) {
> +			u32 *maskp = (u32 *)xdr_reserve_space(&stream, sizeof(*maskp));
> +
> +			if (maskp) {
> +				u8 *p = nfsd4_encode_dir_attr_change(&stream, dp, nf);
> +
> +				if (p) {
> +					*maskp = BIT(NOTIFY4_CHANGE_DIR_ATTRS);
> +					ncn->ncn_nf[count].notify_mask.count = 1;
> +					ncn->ncn_nf[count].notify_mask.element = maskp;
> +					ncn->ncn_nf[count].notify_vals.data = p;
> +					ncn->ncn_nf[count].notify_vals.len = (u8 *)stream.p - p;
> +					++count;
> +				}
> +			}
> +		}

Nit:

When xdr_reserve_space() for maskp succeeds but nfsd4_encode_dir_attr_change()
returns NULL, the 4-byte reservation is never rolled back and *maskp is never
written, yet the function still takes the success path (return true). Unlike
the child-event loop, this branch does not escalate to error = true.

This is probably benign only because nfs4_xdr_enc_cb_notify re-encodes from
the ncn_nf[] array (and count was not incremented), so the garbage hole is
never transmitted.


>  		ncn->ncn_nf_cnt = count;
>  		nfsd_file_put(nf);
>  		return true;
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 1e3c360c06cd..7dd8476028d6 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4199,7 +4199,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, 
> struct xdr_stream *xdr,
> 
>  static bool
>  setup_notify_fhandle(struct dentry *dentry, struct nfs4_file *fi,
> -		     struct nfsd_file *nf, struct nfsd4_fattr_args *args)
> +		     struct nfsd_file *nf, struct svc_export *exp,
> +		     struct nfsd4_fattr_args *args)
>  {
>  	int fileid_type, fsid_len, maxsize, flags = 0;
>  	struct knfsd_fh *fhp = &args->fhandle;

The function dereferences the new exp parameter unconditionally.

The argument is dp->dl_stid.sc_export, read unlocked at
nfs4xdr.c:4297 and handed down. An in-flight CB_NOTIFY callback holds a
sc_count reference but NOT an export reference. drop_stid_export() can
run concurrently (admin revoke / unexport), NULL sc_export, and drop what
may be the last export reference, freeing the svc_export while the
callback dereferences it.


> @@ -4227,6 +4228,17 @@ setup_notify_fhandle(struct dentry *dentry, 
> struct nfs4_file *fi,
> 
>  	fhp->fh_fileid_type = fileid_type;
>  	fhp->fh_size += maxsize * 4;
> +
> +	/*
> +	 * fh_compose() appends a MAC to filehandles on signed exports; this
> +	 * hand-rolled filehandle must do the same or the client will get back
> +	 * an unsigned filehandle that fh_verify() later rejects as stale.
> +	 * If we can't sign it, don't hand it out at all.
> +	 */
> +	if (exp && (exp->ex_flags & NFSEXP_SIGN_FH))
> +		if (!fh_append_mac(fhp, NFS4_FHSIZE, exp->cd->net))
> +			return false;
> +
>  	return true;
>  }
> 
> @@ -4240,11 +4252,11 @@ nfsd4_setup_notify_entry4(struct notify_entry4 
> *ne, struct xdr_stream *xdr,
>  			  struct nfsd_file *nf, char *name, u32 namelen)
>  {
>  	struct nfs4_file *fi = dp->dl_stid.sc_file;
> -	struct path path =  { .mnt = nf->nf_file->f_path.mnt,
> -			      .dentry = dentry };
> +	struct path path = nf->nf_file->f_path;
>  	struct nfsd4_fattr_args args = { };
>  	uint32_t *attrmask;
>  	__be32 status;
> +	bool parent;
>  	int ret;
> 
>  	/* Reserve space for attrmask */
> @@ -4256,6 +4268,9 @@ nfsd4_setup_notify_entry4(struct notify_entry4 
> *ne, struct xdr_stream *xdr,
>  	ne->ne_file.len = namelen;
>  	ne->ne_attrs.attrmask.element = attrmask;
> 
> +	parent = (dentry == path.dentry);
> +	path.dentry = dentry;
> +
>  	/* FIXME: d_find_alias for inode ? */
>  	if (!path.dentry || !d_inode(path.dentry))
>  		goto noattrs;
> @@ -4271,15 +4286,21 @@ nfsd4_setup_notify_entry4(struct notify_entry4 
> *ne, struct xdr_stream *xdr,
> 
>  	args.change_attr = nfsd4_change_attribute(&args.stat);
> 
> -	attrmask[0] = dp->dl_child_attrs[0];
> -	attrmask[1] = dp->dl_child_attrs[1];
> -	attrmask[2] = 0;
> +	if (parent) {
> +		attrmask[0] = dp->dl_dir_attrs[0];
> +		attrmask[1] = dp->dl_dir_attrs[1];
> +	} else {
> +		attrmask[0] = dp->dl_child_attrs[0];
> +		attrmask[1] = dp->dl_child_attrs[1];
> 
> -	if (!setup_notify_fhandle(dentry, fi, nf, &args))
> -		attrmask[0] &= ~FATTR4_WORD0_FILEHANDLE;
> +		if (!setup_notify_fhandle(dentry, fi, nf,
> +					  dp->dl_stid.sc_export, &args))
> +			attrmask[0] &= ~FATTR4_WORD0_FILEHANDLE;
> 
> -	if (!(args.stat.result_mask & STATX_BTIME))
> -		attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
> +		if (!(args.stat.result_mask & STATX_BTIME))
> +			attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
> +	}
> +	attrmask[2] = 0;
> 
>  	ne->ne_attrs.attrmask.count = 2;
>  	ne->ne_attrs.attr_vals.data = (u8 *)xdr->p;
> @@ -4392,6 +4413,38 @@ u8 *nfsd4_encode_notify_event(struct xdr_stream 
> *xdr, struct nfsd_notify_event *
>  	return NULL;
>  }
> 
> +/**
> + * nfsd4_encode_dir_attr_change
> + * @xdr: stream to which to encode the fattr4
> + * @dp: delegation where the event occurred
> + * @nf: nfsd_file opened on the directory
> + *
> + * Encode a dir attr change event.
> + */
> +u8 *nfsd4_encode_dir_attr_change(struct xdr_stream *xdr, struct 
> nfs4_delegation *dp,
> +				 struct nfsd_file *nf)
> +{
> +	struct dentry *dentry = nf->nf_file->f_path.dentry;
> +	struct notify_attr4 na = { };
> +	bool ret;
> +	u8 *p = NULL;
> +
> +	if (!(dp->dl_notify_mask & BIT(NOTIFY4_CHANGE_DIR_ATTRS)))
> +		return NULL;

It looks like this if() re-checks dl_notify_mask even though its
sole caller already gated on the identical check.

nfsd4_encode_notify_event() does not repeat its caller's check.
The guard is unreachable from current callers.


> +
> +	/* RFC 8881 s10.4.3: ne_file must be a zero-length string for dir 
> attrs */
> +	ret = nfsd4_setup_notify_entry4(&na.na_changed_entry, xdr,
> +					dentry, dp, nf, "", 0);
> +
> +	/* Don't bother with the event if we're not encoding attrs */
> +	if (ret && na.na_changed_entry.ne_attrs.attr_vals.len) {
> +		p = (u8 *)xdr->p;
> +		if (!xdrgen_encode_notify_attr4(xdr, &na))
> +			p = NULL;
> +	}
> +	return p;
> +}
> +
>  static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr,
>  				struct xdr_buf *buf, __be32 *p, int bytes)
>  {
> diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
> index 62ac790428be..805c7122eb93 100644
> --- a/fs/nfsd/xdr4.h
> +++ b/fs/nfsd/xdr4.h
> @@ -973,6 +973,8 @@ __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int 
> words,
>  u8 *nfsd4_encode_notify_event(struct xdr_stream *xdr, struct 
> nfsd_notify_event *nne,
>  			      struct nfs4_delegation *dd, struct nfsd_file *nf,
>  			      u32 *notify_mask);
> +u8 *nfsd4_encode_dir_attr_change(struct xdr_stream *xdr, struct 
> nfs4_delegation *dp,
> +				 struct nfsd_file *nf);
>  extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
>  		struct nfsd4_compound_state *, union nfsd4_op_u *u);
>  extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
>
> -- 
> 2.54.0

-- 
Chuck Lever

^ permalink raw reply

* Re: [PATCH v3 1/4] mm/zswap: Make shrink_worker writeback cursor per-memcg
From: Yosry Ahmed @ 2026-06-12 18:15 UTC (permalink / raw)
  To: Shakeel Butt
  Cc: Hao Jia, Nhat Pham, akpm, tj, hannes, mhocko, mkoutny,
	chengming.zhou, muchun.song, roman.gushchin, cgroups, linux-mm,
	linux-kernel, linux-doc, Hao Jia
In-Reply-To: <aiw2JB1lZV9xuNSp@linux.dev>

On Fri, Jun 12, 2026 at 9:40 AM Shakeel Butt <shakeel.butt@linux.dev> wrote:
>
> On Thu, Jun 11, 2026 at 05:39:16PM +0000, Yosry Ahmed wrote:
> > On Tue, Jun 09, 2026 at 11:18:26AM +0800, Hao Jia wrote:
> > >
> > >
> > > On 2026/6/9 02:01, Nhat Pham wrote:
> > > > On Mon, Jun 8, 2026 at 9:48 AM Yosry Ahmed <yosry@kernel.org> wrote:
> > > > >
> > > > > > But OTOH, this does seem like a recipe for inefficient reclaim. We
> > > > > > might exhaust hotter memory of a cgroup while sparing colder memory of
> > > > > > another cgroup... But maybe if they're all cold anyway, then who
> > > > > > cares, and eventually you'll get to the cold stuff of other child?
> > > > >
> > > > > Forgot to respond to this part, the unfairness is limited to the batch
> > > > > size per-invocation, so it should be fine as long as you don't divide
> > > > > the amount over 100 iterations for some reason. Also yes, all memory
> > > > > in zswap is cold, the relative coldness is not that important (e.g.
> > > > > compared to relative coldness during reclaim).
> > > >
> > > > Ok then yeah, I think we should shelve per-memcg cursor for the next
> > > > version. Down the line, if we have more data that unfairness is an
> > > > issue, we can always fix it. One step at a time :)
> > >
> > > Thanks a lot to Yosry, Nhat, and Shakeel for the great suggestions!
> > >
> > > Let me summarize what I plan to do in the next version to make sure we are
> > > on the same page:
> > >
> > >  - Drop the per-memcg cursor and keep the root cgroup cursor
> > > (zswap_next_shrink) logic intact.
> > >  - Stick to using the zswap_writeback_only key, and change the proactive
> > > writeback size to use the compressed size.
> > >  - Consolidate and reuse the logic between shrink_worker() and
> > > shrink_memcg(). Enable batch writeback in the shrink_worker() path, while
> > > keeping the writeback behavior in the zswap_store() path unchanged.
> > >
> > > Please let me know if I missed or misunderstood anything. Thanks again for
> > > clearing things up!
> >
> > Sorry for the late response, yes I think this makes sense. However, I
> > have some comment about how this interacts with swap tiering, let me
> > reply to the other thread.
> >
>
> I think the swap tiers interaction will be figured out over the next cycle.
> However, Hao can/should continue to push and we may decide to let it in
> orthogonally to swap tiers.

Yeah I think there are a lot of changes we discussed outside of the
memcg interface, so maybe keep the interface as-is for now, work on a
new version with the other changes, and we can finalize the interface
at the end?

^ permalink raw reply

* Re: [PATCH v6 19/20] nfsd: track requested dir attributes
From: Chuck Lever @ 2026-06-12 18:13 UTC (permalink / raw)
  To: Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs
In-Reply-To: <20260611-dir-deleg-v6-19-4c45080e5f3f@kernel.org>



On Thu, Jun 11, 2026, at 1:50 PM, Jeff Layton wrote:
> Track the intersection of requested and supported dir attributes in the
> delegation. In a later patch this will be used to ensure that we
> only encode the attributes in that intersection when sending
> add/remove/rename updates.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/nfs4proc.c  |  9 ++++++---
>  fs/nfsd/nfs4state.c | 20 ++++++++++++++++----
>  fs/nfsd/state.h     |  2 ++
>  3 files changed, 24 insertions(+), 7 deletions(-)
>
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index caec82e77081..9e86f5907f06 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -2530,9 +2530,10 @@ nfsd4_verify(struct svc_rqst *rqstp, struct 
> nfsd4_compound_state *cstate,
>  	return status == nfserr_same ? nfs_ok : status;
>  }
> 
> -#define SUPPORTED_NOTIFY_MASK	(BIT(NOTIFY4_REMOVE_ENTRY) |	\
> -				 BIT(NOTIFY4_ADD_ENTRY) |	\
> -				 BIT(NOTIFY4_RENAME_ENTRY) |	\
> +#define SUPPORTED_NOTIFY_MASK	(BIT(NOTIFY4_CHANGE_DIR_ATTRS) |	\
> +				 BIT(NOTIFY4_REMOVE_ENTRY) |		\
> +				 BIT(NOTIFY4_ADD_ENTRY) |		\
> +				 BIT(NOTIFY4_RENAME_ENTRY) |		\
>  				 BIT(NOTIFY4_GFLAG_EXTEND))
> 
>  static __be32
> @@ -2579,6 +2580,8 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
>  	memcpy(&gdd->gddr_stateid, &dd->dl_stid.sc_stateid, 
> sizeof(gdd->gddr_stateid));
>  	gdd->gddr_child_attributes[0] = dd->dl_child_attrs[0];
>  	gdd->gddr_child_attributes[1] = dd->dl_child_attrs[1];
> +	gdd->gddr_dir_attributes[0] = dd->dl_dir_attrs[0];
> +	gdd->gddr_dir_attributes[1] = dd->dl_dir_attrs[1];
>  	nfs4_put_stid(&dd->dl_stid);
>  	return nfs_ok;
>  }
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 0e6e008c121e..12627afb604f 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -9945,6 +9945,15 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst 
> *rqstp, struct dentry *dentry,
>  				 FATTR4_WORD1_TIME_MODIFY |	\
>  				 FATTR4_WORD1_TIME_CREATE)
> 
> +#define GDD_WORD0_DIR_ATTRS	(FATTR4_WORD0_CHANGE |		\
> +				 FATTR4_WORD0_SIZE)
> +
> +#define GDD_WORD1_DIR_ATTRS	(FATTR4_WORD1_NUMLINKS |	\
> +				 FATTR4_WORD1_SPACE_USED |	\
> +				 FATTR4_WORD1_TIME_ACCESS |	\
> +				 FATTR4_WORD1_TIME_METADATA |	\
> +				 FATTR4_WORD1_TIME_MODIFY)
> +
>  /**
>   * nfsd_get_dir_deleg - attempt to get a directory delegation
>   * @cstate: compound state
> @@ -10013,14 +10022,17 @@ nfsd_get_dir_deleg(struct 
> nfsd4_compound_state *cstate,
>  		dp->dl_stid.sc_export =
>  			exp_get(cstate->current_fh.fh_export);
> 
> -	dp->dl_child_attrs[0] = gdd->gdda_child_attributes[0] & GDD_WORD0_CHILD_ATTRS;
> -	dp->dl_child_attrs[1] = gdd->gdda_child_attributes[1] & GDD_WORD1_CHILD_ATTRS;
> -
>  	/*
>  	 * NB: gddr_notification[0] represents the notifications that
>  	 * will be granted to the client
>  	 */
> -	fl = nfs4_alloc_init_lease(dp, gdd->gddr_notification[0]);
> +	dp->dl_notify_mask = gdd->gddr_notification[0];
> +	dp->dl_child_attrs[0] = gdd->gdda_child_attributes[0] & GDD_WORD0_CHILD_ATTRS;
> +	dp->dl_child_attrs[1] = gdd->gdda_child_attributes[1] & GDD_WORD1_CHILD_ATTRS;
> +	dp->dl_dir_attrs[0] = gdd->gdda_dir_attributes[0] & GDD_WORD0_DIR_ATTRS;
> +	dp->dl_dir_attrs[1] = gdd->gdda_dir_attributes[1] & GDD_WORD1_DIR_ATTRS;
> +
> +	fl = nfs4_alloc_init_lease(dp, dp->dl_notify_mask);
>  	if (!fl)
>  		goto out_put_stid;
> 
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index 0763893bfd48..17be4011740d 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -299,7 +299,9 @@ struct nfs4_delegation {
>  	struct timespec64	dl_ctime;
> 
>  	/* For dir delegations */
> +	uint32_t		dl_notify_mask;
>  	uint32_t		dl_child_attrs[2];
> +	uint32_t		dl_dir_attrs[2];

Nit: Maybe these should be u32. uint32_t is a user space type.


>  };
> 
>  static inline bool deleg_is_read(u32 dl_type)
>

Bisectability: After this patch is applied, a client that requests
NOTIFY4_CHANGE_DIR_ATTRS now gets that bit echoed in gddr_notification,
but the callback path still only maps/encodes add, remove, and rename
notifications (nfsd_notify_to_ignore(), nfsd_fsnotify_recalc_mask(),
and nfsd4_encode_notify_event() have no dir-attr case). That lets the
server grant a directory delegation while promising dir-attribute
CB_NOTIFYs it cannot send until the follow-up support lands, so this
bit should not be advertised in this patch.


-- 
Chuck Lever

^ permalink raw reply

* Re: [PATCH v6 18/20] nfsd: properly track requested child attributes
From: Chuck Lever @ 2026-06-12 18:10 UTC (permalink / raw)
  To: Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs
In-Reply-To: <20260611-dir-deleg-v6-18-4c45080e5f3f@kernel.org>



On Thu, Jun 11, 2026, at 1:50 PM, Jeff Layton wrote:
> Track the intersection of requested and supported child attributes in the
> delegation, and only encode the attributes in that intersection when
> sending add/remove/rename updates.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/nfs4proc.c  |  2 ++
>  fs/nfsd/nfs4state.c | 18 ++++++++++++++++++
>  fs/nfsd/nfs4xdr.c   | 15 ++++++---------
>  fs/nfsd/state.h     |  3 +++
>  4 files changed, 29 insertions(+), 9 deletions(-)
>
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 29f7339dc220..caec82e77081 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -2577,6 +2577,8 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
> 
>  	gdd->gddrnf_status = GDD4_OK;
>  	memcpy(&gdd->gddr_stateid, &dd->dl_stid.sc_stateid, 
> sizeof(gdd->gddr_stateid));
> +	gdd->gddr_child_attributes[0] = dd->dl_child_attrs[0];
> +	gdd->gddr_child_attributes[1] = dd->dl_child_attrs[1];
>  	nfs4_put_stid(&dd->dl_stid);
>  	return nfs_ok;
>  }
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index aa99783ce901..0e6e008c121e 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -9930,6 +9930,21 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst 
> *rqstp, struct dentry *dentry,
>  	return status;
>  }
> 
> +#define GDD_WORD0_CHILD_ATTRS	(FATTR4_WORD0_TYPE |		\
> +				 FATTR4_WORD0_CHANGE |		\
> +				 FATTR4_WORD0_SIZE |		\
> +				 FATTR4_WORD0_FILEID |		\
> +				 FATTR4_WORD0_FILEHANDLE)
> +
> +#define GDD_WORD1_CHILD_ATTRS	(FATTR4_WORD1_MODE |		\
> +				 FATTR4_WORD1_NUMLINKS |	\
> +				 FATTR4_WORD1_RAWDEV |		\
> +				 FATTR4_WORD1_SPACE_USED |	\
> +				 FATTR4_WORD1_TIME_ACCESS |	\
> +				 FATTR4_WORD1_TIME_METADATA |	\
> +				 FATTR4_WORD1_TIME_MODIFY |	\
> +				 FATTR4_WORD1_TIME_CREATE)
> +
>  /**
>   * nfsd_get_dir_deleg - attempt to get a directory delegation
>   * @cstate: compound state
> @@ -9998,6 +10013,9 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
>  		dp->dl_stid.sc_export =
>  			exp_get(cstate->current_fh.fh_export);
> 
> +	dp->dl_child_attrs[0] = gdd->gdda_child_attributes[0] & 
> GDD_WORD0_CHILD_ATTRS;
> +	dp->dl_child_attrs[1] = gdd->gdda_child_attributes[1] & 
> GDD_WORD1_CHILD_ATTRS;
> +
>  	/*
>  	 * NB: gddr_notification[0] represents the notifications that
>  	 * will be granted to the client
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 15ccd54ffdb6..1e3c360c06cd 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4271,18 +4271,15 @@ nfsd4_setup_notify_entry4(struct notify_entry4 
> *ne, struct xdr_stream *xdr,
> 
>  	args.change_attr = nfsd4_change_attribute(&args.stat);
> 
> -	attrmask[0] = FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE |
> -		      FATTR4_WORD0_SIZE | FATTR4_WORD0_FILEID;
> -	attrmask[1] = FATTR4_WORD1_MODE | FATTR4_WORD1_NUMLINKS | 
> FATTR4_WORD1_RAWDEV |
> -		      FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS |
> -		      FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY;
> +	attrmask[0] = dp->dl_child_attrs[0];
> +	attrmask[1] = dp->dl_child_attrs[1];
>  	attrmask[2] = 0;
> 
> -	if (setup_notify_fhandle(dentry, fi, nf, &args))
> -		attrmask[0] |= FATTR4_WORD0_FILEHANDLE;
> +	if (!setup_notify_fhandle(dentry, fi, nf, &args))
> +		attrmask[0] &= ~FATTR4_WORD0_FILEHANDLE;
> 
> -	if (args.stat.result_mask & STATX_BTIME)
> -		attrmask[1] |= FATTR4_WORD1_TIME_CREATE;
> +	if (!(args.stat.result_mask & STATX_BTIME))
> +		attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
> 
>  	ne->ne_attrs.attrmask.count = 2;
>  	ne->ne_attrs.attr_vals.data = (u8 *)xdr->p;
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index d912e3d04dd7..0763893bfd48 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -297,6 +297,9 @@ struct nfs4_delegation {
>  	struct timespec64	dl_atime;
>  	struct timespec64	dl_mtime;
>  	struct timespec64	dl_ctime;
> +
> +	/* For dir delegations */
> +	uint32_t		dl_child_attrs[2];
>  };
> 
>  static inline bool deleg_is_read(u32 dl_type)
>

When a client requests any supported child attribute in word 1, this can
make gddr_child_attributes[1] non-zero, so nfsd4_encode_bitmap4() emits a
two-word bitmap. nfsd4_get_dir_delegation_rsize() still budgets only the
old one-word child-attribute bitmap before executing this non-idempotent
op, so a compound near the reply/slot limit can grant a directory
delegation and then fail encoding with NFS4ERR_RESOURCE/REP_TOO_BIG,
leaving the client without the returned stateid.


-- 
Chuck Lever

^ permalink raw reply

* Re: [PATCH v6 17/20] nfsd: add the filehandle to returned attributes in CB_NOTIFY
From: Chuck Lever @ 2026-06-12 18:08 UTC (permalink / raw)
  To: Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs
In-Reply-To: <20260611-dir-deleg-v6-17-4c45080e5f3f@kernel.org>



On Thu, Jun 11, 2026, at 1:50 PM, Jeff Layton wrote:
> nfsd's usual fh_compose routine requires a svc_export and fills out a
> svc_fh. In the context of a CB_NOTIFY there is no such export to
> consult.
>
> Add a new routine that composes a filehandle with only a parent
> filehandle and nfs4_file. Use that to fill out the fhandle field in the
> nfsd4_fattr_args.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/nfs4xdr.c | 37 +++++++++++++++++++++++++++++++++++++
>  1 file changed, 37 insertions(+)
>
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 7b19248b1503..15ccd54ffdb6 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c
> @@ -4197,6 +4197,39 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, 
> struct xdr_stream *xdr,
>  	goto out;
>  }
> 
> +static bool
> +setup_notify_fhandle(struct dentry *dentry, struct nfs4_file *fi,
> +		     struct nfsd_file *nf, struct nfsd4_fattr_args *args)
> +{
> +	int fileid_type, fsid_len, maxsize, flags = 0;
> +	struct knfsd_fh *fhp = &args->fhandle;
> +	struct inode *inode = d_inode(dentry);
> +	struct inode *parent = NULL;
> +	struct fid *fid;
> +
> +	fsid_len = key_len(fi->fi_fhandle.fh_fsid_type);
> +	fhp->fh_size = 4 + fsid_len;
> +
> +	/* Copy first 4 bytes + fsid */
> +	memcpy(&fhp->fh_raw, &fi->fi_fhandle.fh_raw, fhp->fh_size);
> +
> +	fid = (struct fid *)(fh_fsid(fhp) + fsid_len/4);
> +	maxsize = (NFS4_FHSIZE - fhp->fh_size)/4;
> +
> +	if (fi->fi_connectable && !S_ISDIR(inode->i_mode)) {
> +		parent = d_inode(nf->nf_file->f_path.dentry);
> +		flags = EXPORT_FH_CONNECTABLE;
> +	}
> +
> +	fileid_type = exportfs_encode_inode_fh(inode, fid, &maxsize, parent, 
> flags);
> +	if (fileid_type < 0 || fileid_type == FILEID_INVALID)
> +		return false;
> +
> +	fhp->fh_fileid_type = fileid_type;
> +	fhp->fh_size += maxsize * 4;
> +	return true;
> +}
> +
>  #define CB_NOTIFY_STATX_REQUEST_MASK (STATX_BASIC_STATS   | \
>  				      STATX_BTIME	  | \
>  				      STATX_CHANGE_COOKIE)
> @@ -4206,6 +4239,7 @@ nfsd4_setup_notify_entry4(struct notify_entry4 
> *ne, struct xdr_stream *xdr,
>  			  struct dentry *dentry, struct nfs4_delegation *dp,
>  			  struct nfsd_file *nf, char *name, u32 namelen)
>  {
> +	struct nfs4_file *fi = dp->dl_stid.sc_file;
>  	struct path path =  { .mnt = nf->nf_file->f_path.mnt,
>  			      .dentry = dentry };
>  	struct nfsd4_fattr_args args = { };
> @@ -4244,6 +4278,9 @@ nfsd4_setup_notify_entry4(struct notify_entry4 
> *ne, struct xdr_stream *xdr,
>  		      FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY;
>  	attrmask[2] = 0;
> 
> +	if (setup_notify_fhandle(dentry, fi, nf, &args))
> +		attrmask[0] |= FATTR4_WORD0_FILEHANDLE;
> +
>  	if (args.stat.result_mask & STATX_BTIME)
>  		attrmask[1] |= FATTR4_WORD1_TIME_CREATE;
> 

Codex flagged setup_notify_fhandle() for constructing a child FILEHANDLE
attribute without calling fh_append_mac(): for exports with sign_fh,
fh_compose() appends a MAC, and nfsd_set_fh_dentry() rejects every
non-root signed-export handle whose MAC is absent or mismatched, so a
client using the CB_NOTIFY filehandle gets a stale/bad handle. It
recommends signing the constructed handle or suppressing the attribute
when the export requires signed filehandles.

A client that does not receive the FH falls back to a LOOKUP, so
suppression degrades gracefully.

-- 
Chuck Lever

^ permalink raw reply

* Re: [PATCH] Documentation: leds: fix broken reference to the multicolor LED ABI
From: Randy Dunlap @ 2026-06-12 18:08 UTC (permalink / raw)
  To: Shardul Deshpande, Nam Tran, Lee Jones, Pavel Machek,
	Jonathan Corbet, Shuah Khan, linux-leds, linux-doc, linux-kernel
In-Reply-To: <20260612171528.728111-1-iamsharduld@gmail.com>



On 6/12/26 10:15 AM, Shardul Deshpande wrote:
> The reference pointed to a non-existent .rst file.  The ABI file is named
> sysfs-class-led-multicolor (without extension), so fix the reference to
> match the actual file and resolve the warning from
> tools/docs/documentation-file-ref-check.
> 
> Signed-off-by: Shardul Deshpande <iamsharduld@gmail.com>

Acked-by: Randy Dunlap <rdunlap@infradead.org>

Thanks.

> ---
>  Documentation/leds/leds-lp5812.rst | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/Documentation/leds/leds-lp5812.rst b/Documentation/leds/leds-lp5812.rst
> index c2a6368d5..12e757d45 100644
> --- a/Documentation/leds/leds-lp5812.rst
> +++ b/Documentation/leds/leds-lp5812.rst
> @@ -20,7 +20,7 @@ Sysfs Interface
>  ===============
>  
>  This driver uses the standard multicolor LED class interfaces defined
> -in Documentation/ABI/testing/sysfs-class-led-multicolor.rst.
> +in Documentation/ABI/testing/sysfs-class-led-multicolor.
>  
>  Each LP5812 LED output appears under ``/sys/class/leds/`` with its
>  assigned label (for example ``LED_A``).

-- 
~Randy

^ permalink raw reply

* Re: [PATCH v6 16/20] nfsd: add a fi_connectable flag to struct nfs4_file
From: Chuck Lever @ 2026-06-12 18:06 UTC (permalink / raw)
  To: Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs
In-Reply-To: <20260611-dir-deleg-v6-16-4c45080e5f3f@kernel.org>



On Thu, Jun 11, 2026, at 1:50 PM, Jeff Layton wrote:
> When encoding a filehandle for a CB_NOTIFY, there is no svc_export
> available, but the server needs to know whether to encode a connectable
> filehandle. Add a flag to the nfs4_file that tells whether the
> svc_export under which a directory delegation was acquired has subtree
> checking enabled, in which case it needs connectable filehandles.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/nfs4state.c | 1 +
>  fs/nfsd/state.h     | 1 +
>  2 files changed, 2 insertions(+)
>
> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 513cbc1a583f..aa99783ce901 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c
> @@ -5231,6 +5231,7 @@ static void nfsd4_file_init(const struct svc_fh 
> *fh, struct nfs4_file *fp)
>  	memset(fp->fi_access, 0, sizeof(fp->fi_access));
>  	fp->fi_aliased = false;
>  	fp->fi_inode = d_inode(fh->fh_dentry);
> +	fp->fi_connectable = !(fh->fh_export->ex_flags & 
> NFSEXP_NOSUBTREECHECK);
>  #ifdef CONFIG_NFSD_PNFS
>  	INIT_LIST_HEAD(&fp->fi_lo_states);
>  	atomic_set(&fp->fi_lo_recalls, 0);
> diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
> index f8457e0f2b57..d912e3d04dd7 100644
> --- a/fs/nfsd/state.h
> +++ b/fs/nfsd/state.h
> @@ -761,6 +761,7 @@ struct nfs4_file {
>  	int			fi_delegees;
>  	struct knfsd_fh		fi_fhandle;
>  	bool			fi_had_conflict;
> +	bool			fi_connectable;
>  #ifdef CONFIG_NFSD_PNFS
>  	struct list_head	fi_lo_states;
>  	atomic_t		fi_lo_recalls;
>

When two clients use exports of the same directory root that
differ only in subtree_check/no_subtree_check, the root filehandle
is the same and nfsd4_file_hash_insert() can reuse the same
nfs4_file. This makes fi_connectable depend on whichever export
first initialized the shared object, so a later directory
delegation acquired under the other export can encode CB_NOTIFY
child filehandles with the wrong connectability.

Therefore, the delegation's sc_export is the per-export state to
derive connectability from, and the export is already available
via dp->dl_stid.sc_export.


-- 
Chuck Lever

^ permalink raw reply

* Re: [PATCH v6 15/20] nfsd: allow encoding a filehandle into fattr4 without a svc_fh
From: Chuck Lever @ 2026-06-12 18:03 UTC (permalink / raw)
  To: Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs
In-Reply-To: <20260611-dir-deleg-v6-15-4c45080e5f3f@kernel.org>



On Thu, Jun 11, 2026, at 1:50 PM, Jeff Layton wrote:
> The current fattr4 encoder requires a svc_fh in order to encode the
> filehandle. This is not available in a CB_NOTIFY callback. Add a new
> "fhandle" field to struct nfsd4_fattr_args and copy the filehandle into
> there from the svc_fh. CB_NOTIFY will populate it via other means.
>
> A filehandle composed this way may still need a MAC appended on signed
> exports, so generalize fh_append_mac() to operate on a bare knfsd_fh
> (plus its maximum size and net) rather than a svc_fh.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/nfs4xdr.c | 36 +++++++++++++++++++++---------------
>  fs/nfsd/nfsfh.c   | 10 +++++-----
>  fs/nfsd/nfsfh.h   |  1 +
>  3 files changed, 27 insertions(+), 20 deletions(-)
>
> diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
> index 4fb61d05a4a7..7b19248b1503 100644
> --- a/fs/nfsd/nfs4xdr.c
> +++ b/fs/nfsd/nfs4xdr.c

> @@ -4015,19 +4016,24 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, 
> struct xdr_stream *xdr,
>  		if (err)
>  			goto out_nfserr;
>  	}
> -	if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
> -	    !fhp) {
> -		tempfh = kmalloc_obj(struct svc_fh);
> -		status = nfserr_jukebox;
> -		if (!tempfh)
> -			goto out;
> -		fh_init(tempfh, NFS4_FHSIZE);
> -		status = fh_compose(tempfh, exp, dentry, NULL);
> -		if (status)
> -			goto out;
> -		args.fhp = tempfh;
> -	} else
> -		args.fhp = fhp;
> +
> +	args.fhp = fhp;
> +	if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID))) {
> +		if (!args.fhp) {
> +			tempfh = kmalloc_obj(struct svc_fh);
> +			status = nfserr_jukebox;
> +			if (!tempfh)
> +				goto out;
> +			fh_init(tempfh, NFS4_FHSIZE);
> +			status = fh_compose(tempfh, exp, dentry, NULL);
> +			if (status)
> +				goto out;
> +			args.fhp = tempfh;
> +		}
> +		if (args.fhp)

Nit: here, "args.fhp" is never false.

Note that nfsd4_encode_fattr4_fsid() calls fsid_source(args->fhp)
without a NULL check. After this patch is applied, filehandle
encoding is svc_fh-free but FSID encoding is not, and the two
share the same attrmask gate:

   (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID).

No current caller trips it: the CB_NOTIFY path does not request
FSID. But a future CB_NOTIFY attrset that adds FSID would
dereference a NULL fhp.


> +			fh_copy_shallow(&args.fhandle, &args.fhp->fh_handle);
> +	}
> +
>  	if (attrmask[0] & (FATTR4_WORD0_CASE_INSENSITIVE |
>  			   FATTR4_WORD0_CASE_PRESERVING)) {
>  		/*


-- 
Chuck Lever

^ permalink raw reply

* Re: [PATCH v6 11/20] nfsd: apply the notify mask to the delegation when requested
From: Chuck Lever @ 2026-06-12 17:57 UTC (permalink / raw)
  To: Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs
In-Reply-To: <20260611-dir-deleg-v6-11-4c45080e5f3f@kernel.org>



On Thu, Jun 11, 2026, at 1:50 PM, Jeff Layton wrote:
> If the client requests a directory delegation with notifications
> enabled, set the appropriate return mask in gddr_notification[0]. This
> will ensure the lease acquisition sets the appropriate ignore mask.
>
> If the client doesn't set NOTIFY4_GFLAG_EXTEND, then don't offer any
> notifications, as nfsd won't provide directory offset information, and
> "classic" notifications require them.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---
>  fs/nfsd/nfs4proc.c | 12 ++++++++++++
>  1 file changed, 12 insertions(+)
>
> diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
> index 0c37d7c6d28c..29f7339dc220 100644
> --- a/fs/nfsd/nfs4proc.c
> +++ b/fs/nfsd/nfs4proc.c
> @@ -2530,12 +2530,18 @@ nfsd4_verify(struct svc_rqst *rqstp, struct 
> nfsd4_compound_state *cstate,
>  	return status == nfserr_same ? nfs_ok : status;
>  }
> 
> +#define SUPPORTED_NOTIFY_MASK	(BIT(NOTIFY4_REMOVE_ENTRY) |	\
> +				 BIT(NOTIFY4_ADD_ENTRY) |	\
> +				 BIT(NOTIFY4_RENAME_ENTRY) |	\
> +				 BIT(NOTIFY4_GFLAG_EXTEND))
> +
>  static __be32
>  nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
>  			 struct nfsd4_compound_state *cstate,
>  			 union nfsd4_op_u *u)
>  {
>  	struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation;
> +	u32 requested = gdd->gdda_notification_types[0];
>  	struct nfs4_delegation *dd;
>  	struct nfsd_file *nf;
>  	__be32 status;
> @@ -2544,6 +2550,12 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
>  	if (status != nfs_ok)
>  		return status;
> 
> +	/* No notifications if you don't set NOTIFY4_GFLAG_EXTEND! */
> +	if (!(requested & BIT(NOTIFY4_GFLAG_EXTEND)))
> +		requested = 0;
> +
> +	gdd->gddr_notification[0] = requested & SUPPORTED_NOTIFY_MASK;
> +
>  	/*
>  	 * RFC 8881, section 18.39.3 says:
>  	 *
>

When a client requests NOTIFY4_GFLAG_EXTEND | NOTIFY4_CFLAG_ORDER
plus ADD/REMOVE/RENAME, the assignment still grants the content
notification bits because it only requires GFLAG_EXTEND. The rest
of NFSD's CB_NOTIFY encoder does not store that order-aware request
and emits zero/absent cookie and previous-entry information, which
is only safe for order-unaware clients.

An order-aware client can then keep an ordered directory cache from
unusable notifications instead of having the delegation recalled.

The bis draft requires order info for order-aware clients, or recall:

   - 27.4.5 (REMOVE): "If the client is order-aware, the server will send
     the cookie value as part of this."
   - 16.2.13: order-aware == NOTIFY4_CFLAG_ORDER set OR NOTIFY4_GFLAG_EXTEND
     reset.
   - 16.2.11.3: "If the client is concerned with entry order and these
     notifications ... cannot be sent for any other reason, then the
     delegation is recalled."

This patch's own first rule (drop everything when GFLAG_EXTEND is
reset) is exactly the order-aware -> no-notifications principle
for legacy RFC8881 clients. It misses the parallel case:
GFLAG_EXTEND set PLUS CFLAG_ORDER set is also order-aware.

This finding is latent when this patch is first applied, but
NFSD's negotiation is incorrect for any future order-aware client.

I don't see this issue addressed by a subsequent patch in this
series.


-- 
Chuck Lever

^ permalink raw reply

* Re: [PATCH v6 10/20] nfsd: add notification handlers for dir events
From: Chuck Lever @ 2026-06-12 17:51 UTC (permalink / raw)
  To: Jeff Layton, NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs
In-Reply-To: <20260611-dir-deleg-v6-10-4c45080e5f3f@kernel.org>


On Thu, Jun 11, 2026, at 1:50 PM, Jeff Layton wrote:
> Add the necessary parts to accept a fsnotify callback for directory
> change event and create a CB_NOTIFY request for it. When a dir nfsd_file
> is created set a handle_event callback to handle the notification.
>
> Use that to allocate a nfsd_notify_event object and then hand off a
> reference to each delegation's CB_NOTIFY. If anything fails along the
> way, recall any affected delegations.
>
> Signed-off-by: Jeff Layton <jlayton@kernel.org>
> ---

> diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
> index ca4dd2f969eb..59378751d596 100644
> --- a/fs/nfsd/nfs4callback.c
> +++ b/fs/nfsd/nfs4callback.c

> @@ -904,13 +908,45 @@ static void nfs4_xdr_enc_cb_notify(struct rpc_rqst *req,
>  	encode_cb_sequence4args(xdr, cb, &hdr);
> 
>  	/*
> -	 * FIXME: get stateid and fh from delegation. Inline the cna_changes
> -	 * buffer, and zero it.
> +	 * nfsd4_cb_notify_prepare() sized the payload against a single page,
> +	 * but did not account for the compound, sequence, stateid, and
> +	 * filehandle encoded here. If the variable-length encode overflows the
> +	 * backchannel send buffer, roll back to before the operation so that a
> +	 * truncated CB_NOTIFY is never placed on the wire.
>  	 */
> -	xdrgen_encode_CB_NOTIFY4args(xdr, &args);
> +	start = xdr_stream_pos(xdr);
> +
> +	p = xdr_reserve_space(xdr, 4);
> +	if (!p)
> +		goto out_err;
> +	*p = cpu_to_be32(OP_CB_NOTIFY);

Please use xdr_stream_encode_u32 for this purpose.


> +
> +	args.cna_stateid.seqid = dp->dl_stid.sc_stateid.si_generation;
> +	memcpy(&args.cna_stateid.other, &dp->dl_stid.sc_stateid.si_opaque,
> +	       ARRAY_SIZE(args.cna_stateid.other));
> +	args.cna_fh.len = dp->dl_stid.sc_file->fi_fhandle.fh_size;
> +	args.cna_fh.data = dp->dl_stid.sc_file->fi_fhandle.fh_raw;
> +	args.cna_changes.count = ncn->ncn_nf_cnt;
> +	args.cna_changes.element = ncn->ncn_nf;
> +	if (!xdrgen_encode_CB_NOTIFY4args(xdr, &args))
> +		goto out_err;
> 
>  	hdr.nops++;
>  	encode_cb_nops(&hdr);
> +	return;
> +
> +out_err:
> +	/*
> +	 * Drop the CB_NOTIFY op and emit a valid CB_SEQUENCE-only compound so
> +	 * the client still advances its slot. Flag the failure so the done
> +	 * handler recalls the delegation and the missed notification is not
> +	 * silently lost. The flag is written here in the transmit path and read
> +	 * in the done handler; the two are serialized phases of the same
> +	 * rpc_task, so no additional barrier is needed.
> +	 */
> +	ncn->ncn_encode_err = true;

This flag is zeroed only once, at allocation time in alloc_init_dir_deleg().
It is never cleared in nfsd4_cb_notify_prepare().

Since nfsd4_cb_notify_release() can requeue the callback (via
nfsd4_run_cb_notify) when events arrive while a callback is in flight,
->prepare may encode cleanly and return true, but nfsd4_cb_notify_done()
still observes the stale ncn_encode_err == true and calls
nfsd_break_one_deleg() -- discarding a good notification and recalling
the delegation unnecessarily.


> +	xdr_truncate_encode(xdr, start);
> +	encode_cb_nops(&hdr);
>  }
> 
>  static int nfs4_xdr_dec_cb_notify(struct rpc_rqst *rqstp,

> diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
> index 0a15d7f3b543..513cbc1a583f 100644
> --- a/fs/nfsd/nfs4state.c
> +++ b/fs/nfsd/nfs4state.c

> @@ -3471,19 +3472,146 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
>  	nfs4_put_stid(&dp->dl_stid);
>  }
> 
> +static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
> +{
> +	bool queued;
> +
> +	if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags))
> +		return;
> +
> +	/*
> +	 * We're assuming the state code never drops its reference
> +	 * without first removing the lease.  Since we're in this lease
> +	 * callback (and since the lease code is serialized by the
> +	 * flc_lock) we know the server hasn't removed the lease yet, and
> +	 * we know it's safe to take a reference.
> +	 */
> +	refcount_inc(&dp->dl_stid.sc_count);
> +	queued = nfsd4_run_cb(&dp->dl_recall);
> +	WARN_ON_ONCE(!queued);
> +	if (!queued) {
> +		refcount_dec(&dp->dl_stid.sc_count);
> +		clear_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags);
> +	}
> +}

nfsd_break_one_deleg() does an unconditional
refcount_inc(&dp->dl_stid.sc_count), and its comment justifies this
with "the lease code is serialized by the flc_lock." That invariant
holds when called from nfsd_break_deleg_cb() under flc_lock, but
nfsd4_cb_notify_prepare() runs on a workqueue WITHOUT flc_lock. Its
out_recall: path calls nfsd_break_one_deleg(dp)
directly. The delegation can be concurrently destroyed with sc_count
already at zero, making this an inc-from-zero.

The dispatch path nfsd4_run_cb_notify already does this correctly with
refcount_inc_not_zero. The out_recall path needs the same guard (skip
the recall / bail if the refcount is already zero).

I notice that the last unapplied patch ("nfsd: add
support to CB_NOTIFY for dir attribute changes") rewrites the guard
"if (count > NOTIFY4_EVENT_QUEUE_SIZE)" into "if (count > limit)" with
limit = NOTIFY4_EVENT_QUEUE_SIZE - 1 when NOTIFY4_CHANGE_DIR_ATTRS is
requested. That turns the previously-dead overflow branch into a live,
routine path to out_recall, which adds another normal-operation route
into this unlocked recall.


-- 
Chuck Lever

^ permalink raw reply

* Re: [PATCH v3 02/12] x86/resctrl: Add data structures and definitions for PLZA configuration
From: Moger, Babu @ 2026-06-12 17:49 UTC (permalink / raw)
  To: Reinette Chatre, Babu Moger, corbet, tony.luck, Dave.Martin,
	james.morse, tglx, bp, dave.hansen
  Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
	feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
	seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
	kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
	linux-kernel, eranian, peternewman
In-Reply-To: <190bf049-4928-411b-ab5c-30d39817f118@amd.com>

Hi Reinette,

On 6/12/2026 12:32 PM, Moger, Babu wrote:
> Hi Reinette,
> 
> On 6/11/2026 6:40 PM, Reinette Chatre wrote:
>> Hi Babu,
>>
>> On 4/30/26 4:24 PM, Babu Moger wrote:
>>> Privilege Level Zero Association (PLZA) is configured per logical 
>>> processor
>>> via MSR_IA32_PQR_PLZA_ASSOC (0xc00003fc). Software must program RMID and
>>> CLOSID association fields and their enable bits using the layout defined
>>> for the MSR.
>>>
>>> Define MSR_IA32_PQR_PLZA_ASSOC and the RMID_EN, CLOSID_EN, and 
>>> PLZA_EN bit
>>> masks in asm/msr-index.h. Add union msr_pqr_plza_assoc in arch resctrl
>>> internal.h
>>
>> Above paragraph captures what can be seen from the patch. Please check 
>> entire
>> series for this since many changelogs in this series verbatim 
>> describes the code
>> changes in patch without helping reader understand why those changes 
>> are made.
>>
> 
> Sure. Will rewrite the changelog. And will check other patches also.
> 
>>
>>>
>>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>>> ---
>>
>>> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/ 
>>> msr-index.h
>>> index 9dc6b610e4e2..623628d3c643 100644
>>> --- a/arch/x86/include/asm/msr-index.h
>>> +++ b/arch/x86/include/asm/msr-index.h
>>> @@ -1287,10 +1287,17 @@
>>>   /* - AMD: */
>>>   #define MSR_IA32_MBA_BW_BASE        0xc0000200
>>>   #define MSR_IA32_SMBA_BW_BASE        0xc0000280
>>> +#define MSR_IA32_PQR_PLZA_ASSOC        0xc00003fc
>>>   #define MSR_IA32_L3_QOS_ABMC_CFG    0xc00003fd
>>>   #define MSR_IA32_L3_QOS_EXT_CFG        0xc00003ff
>>>   #define MSR_IA32_EVT_CFG_BASE        0xc0000400
>>> +/* Lower 32 bits of MSR_IA32_PQR_PLZA_ASSOC */
>>> +#define RMID_EN                BIT(31)
>>> +/* Upper 32 bits of MSR_IA32_PQR_PLZA_ASSOC */
>>> +#define CLOSID_EN            BIT(15)
>>> +#define PLZA_EN                BIT(31)
>>> +
>>
>> This is unexpected. So far resctrl has only defined the MSR numbers in 
>> this file, not
>> the individual fields. This seems a legitimate use of msr-index.h but 
>> creates inconsistency
>> with how the fields of the other resctrl registers are defined. This 
>> may be ok so I am
>> looking past this for now. Since I am not familiar with this use I am 
>> looking at other
>> patterns of this and it seems that the register fields are usually 
>> defined right after
>> the register to make this relationship clear and also use more verbose 
>> naming to establish
>> this relationship ... I do not think such cryptic names should be used 
>> without context
>> in such a global scope. Please compare with how other fields are 
>> defined at this scope.
> 
> Sure. Will use the names tony suggested.
> https://lore.kernel.org/lkml/ 
> SJ1PR11MB6083C069F99FAB8A0BEB8518FC182@SJ1PR11MB6083.namprd11.prod.outlook.com/
> 
> I will also move the register "MSR_IA32_PQR_PLZA_ASSOC" together with 
> the BIT definitions. That will break the sorting order. I hope that is 
> not a problem.

Never mind. I don't need the bit definitions anymore. I don't need to 
move the register.

Thanks
Babu


^ permalink raw reply

* Re: RE: [PATCH v3 02/12] x86/resctrl: Add data structures and definitions for PLZA configuration
From: Moger, Babu @ 2026-06-12 17:46 UTC (permalink / raw)
  To: Luck, Tony, Chatre, Reinette, Babu Moger, corbet@lwn.net,
	Dave.Martin@arm.com, james.morse@arm.com, tglx@kernel.org,
	bp@alien8.de, dave.hansen@linux.intel.com
  Cc: skhan@linuxfoundation.org, x86@kernel.org, mingo@redhat.com,
	hpa@zytor.com, akpm@linux-foundation.org, rdunlap@infradead.org,
	pawan.kumar.gupta@linux.intel.com, feng.tang@linux.alibaba.com,
	dapeng1.mi@linux.intel.com, kees@kernel.org, elver@google.com,
	lirongqing@baidu.com, paulmck@kernel.org, bhelgaas@google.com,
	seanjc@google.com, alexandre.chartre@oracle.com,
	yazen.ghannam@amd.com, peterz@infradead.org, Bae, Chang Seok,
	kim.phillips@amd.com, xin@zytor.com, naveen@kernel.org,
	thomas.lendacky@amd.com, linux-doc@vger.kernel.org,
	linux-kernel@vger.kernel.org, Eranian, Stephane,
	peternewman@google.com
In-Reply-To: <SJ1PR11MB6083C069F99FAB8A0BEB8518FC182@SJ1PR11MB6083.namprd11.prod.outlook.com>

Hi Tony,


On 6/12/2026 10:40 AM, Luck, Tony wrote:
>>> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
>>> index 9dc6b610e4e2..623628d3c643 100644
>>> --- a/arch/x86/include/asm/msr-index.h
>>> +++ b/arch/x86/include/asm/msr-index.h
>>> @@ -1287,10 +1287,17 @@
>>>   /* - AMD: */
>>>   #define MSR_IA32_MBA_BW_BASE               0xc0000200
>>>   #define MSR_IA32_SMBA_BW_BASE              0xc0000280
>>> +#define MSR_IA32_PQR_PLZA_ASSOC            0xc00003fc
>>>   #define MSR_IA32_L3_QOS_ABMC_CFG   0xc00003fd
>>>   #define MSR_IA32_L3_QOS_EXT_CFG            0xc00003ff
>>>   #define MSR_IA32_EVT_CFG_BASE              0xc0000400
>>>
>>> +/* Lower 32 bits of MSR_IA32_PQR_PLZA_ASSOC */
>>> +#define RMID_EN                            BIT(31)
>>> +/* Upper 32 bits of MSR_IA32_PQR_PLZA_ASSOC */
>>> +#define CLOSID_EN                  BIT(15)
>>> +#define PLZA_EN                            BIT(31)
>>> +
>>
>> This is unexpected. So far resctrl has only defined the MSR numbers in this file, not
>> the individual fields. This seems a legitimate use of msr-index.h but creates inconsistency
>> with how the fields of the other resctrl registers are defined. This may be ok so I am
>> looking past this for now. Since I am not familiar with this use I am looking at other
>> patterns of this and it seems that the register fields are usually defined right after
>> the register to make this relationship clear and also use more verbose naming to establish
>> this relationship ... I do not think such cryptic names should be used without context
>> in such a global scope. Please compare with how other fields are defined at this scope.
> 
> There's also patches in flight to treat MSRs as a single "u64" and move away from
> the low level implementation detail that the RDMSR/WRMSR instructions split into
> upper/lower halves.
> 
> All the kernel interfaces are moving to rdmsrq() and wrmsrq() (together with related
> functions).

Ack.

> 
> So maybe:
> 
> #define PQR_PLZA_RMID_EN        BIT_ULL(31)
> #define PQR_PLZA_CLOSID_EN      BIT_ULL(47)
> #define PQR_PLZA_PLZA_EN        BIT_ULL(63)
> 
> [modify with whatever additional prefix characters seem necessary]
> 

Actually, I don’t need these changes anymore—they were carried over from 
a previous version. Thanks for making the updates, though.

Thanks
Babu


^ permalink raw reply

* Re: [mic:next 15/15] htmldocs: Documentation/userspace-api/landlock.rst:768: WARNING: Inline interpreted text or phrase reference start-string without end-string. [docutils]
From: Mickaël Salaün @ 2026-06-12 17:35 UTC (permalink / raw)
  To: Randy Dunlap; +Cc: kernel test robot, Matthieu Buffet, oe-kbuild-all, linux-doc
In-Reply-To: <1cc99145-0316-44f5-b134-2b4f90b326c4@infradead.org>

On Fri, Jun 12, 2026 at 08:35:57AM -0700, Randy Dunlap wrote:
> 
> 
> On 6/12/26 12:52 AM, kernel test robot wrote:
> > tree:   https://git.kernel.org/pub/scm/linux/kernel/git/mic/linux.git next
> > head:   a6f0a6f5377fae42a8028f63c89d544c68f24b60
> > commit: a6f0a6f5377fae42a8028f63c89d544c68f24b60 [15/15] landlock: Add documentation for UDP support
> > compiler: clang version 22.0.0git (https://github.com/llvm/llvm-project f43d6834093b19baf79beda8c0337ab020ac5f17)
> > docutils: docutils (Docutils 0.21.2, Python 3.13.5, on linux)
> > reproduce: (https://download.01.org/0day-ci/archive/20260612/202606120923.1nYYlfdb-lkp@intel.com/reproduce)
> > 
> > If you fix the issue in a separate patch/commit (i.e. not just a new version of
> > the same patch/commit), kindly add following tags
> > | Reported-by: kernel test robot <lkp@intel.com>
> > | Closes: https://lore.kernel.org/oe-kbuild-all/202606120923.1nYYlfdb-lkp@intel.com/
> > 
> > All warnings (new ones prefixed by >>):
> > 
> >    Scope flags
> >    ~~~~~~~~~~~ [docutils]
> >>> Documentation/userspace-api/landlock.rst:768: WARNING: Inline interpreted text or phrase reference start-string without end-string. [docutils]
> >>> Documentation/userspace-api/landlock.rst:768: WARNING: Inline interpreted text or phrase reference start-string without end-string. [docutils]
> >    Documentation/userspace-api/landlock:596: ./include/uapi/linux/landlock.h:40: ERROR: Unknown target name: "filesystem flags". [docutils]
> >    Documentation/userspace-api/landlock:596: ./include/uapi/linux/landlock.h:45: ERROR: Unknown target name: "network flags". [docutils]
> >    Documentation/userspace-api/landlock:596: ./include/uapi/linux/landlock.h:50: ERROR: Unknown target name: "scope flags". [docutils]
> >    Documentation/userspace-api/landlock:596: ./include/uapi/linux/landlock.h:24: ERROR: Unknown target name: "filesystem flags". [docutils]
> >    Documentation/userspace-api/landlock:605: ./include/uapi/linux/landlock.h:168: ERROR: Unknown target name: "filesystem flags". [docutils]
> > 
> > 
> 
> In case it's not obvious:
> 
> > vim +768 Documentation/userspace-api/landlock.rst
> > 
> >    767	
> >  > 768	Starting with the Landlock ABI version 10, it is possible to restrict
> >    769	setting the local port of UDP sockets with the
> >    770	``LANDLOCK_ACCESS_NET_BIND_UDP`` right. This includes restricting the
> >    771	ability to trigger autobind of an ephemeral port by the kernel by e.g.
> >    772	sending a first datagram or setting the remote peer of a socket.
> >    773	The ``LANDLOCK_ACCESS_NET_CONNECT_SEND_UDP`` right controls setting the
> >    774	remote port of UDP sockets (via :manpage:`connect(2)), and sending
> 
>                                             missing ending           `

Thanks, fixed!

> 
> >    775	datagrams to an explicit remote port (ignoring any destination set on
> >    776	UDP sockets, via e.g. :manpage:`sendto(2)).
> 
>                                        same here
> 
> >    777	
> > 
> > --
> > 0-DAY CI Kernel Test Service
> > https://github.com/intel/lkp-tests/wiki
> > 
> 
> -- 
> ~Randy
> 

^ permalink raw reply

* Re: [PATCH v3 02/12] x86/resctrl: Add data structures and definitions for PLZA configuration
From: Moger, Babu @ 2026-06-12 17:32 UTC (permalink / raw)
  To: Reinette Chatre, Babu Moger, corbet, tony.luck, Dave.Martin,
	james.morse, tglx, bp, dave.hansen
  Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
	feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
	seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
	kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
	linux-kernel, eranian, peternewman
In-Reply-To: <db9c0b3e-184c-4100-b59a-91f6e818fd31@intel.com>

Hi Reinette,

On 6/11/2026 6:40 PM, Reinette Chatre wrote:
> Hi Babu,
> 
> On 4/30/26 4:24 PM, Babu Moger wrote:
>> Privilege Level Zero Association (PLZA) is configured per logical processor
>> via MSR_IA32_PQR_PLZA_ASSOC (0xc00003fc). Software must program RMID and
>> CLOSID association fields and their enable bits using the layout defined
>> for the MSR.
>>
>> Define MSR_IA32_PQR_PLZA_ASSOC and the RMID_EN, CLOSID_EN, and PLZA_EN bit
>> masks in asm/msr-index.h. Add union msr_pqr_plza_assoc in arch resctrl
>> internal.h
> 
> Above paragraph captures what can be seen from the patch. Please check entire
> series for this since many changelogs in this series verbatim describe the code
> changes in patch without helping reader understand why those changes are made.
> 

Sure. Will rewrite the changelog. And will check other patches also.

> 
>>
>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>> ---
> 
>> diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
>> index 9dc6b610e4e2..623628d3c643 100644
>> --- a/arch/x86/include/asm/msr-index.h
>> +++ b/arch/x86/include/asm/msr-index.h
>> @@ -1287,10 +1287,17 @@
>>   /* - AMD: */
>>   #define MSR_IA32_MBA_BW_BASE		0xc0000200
>>   #define MSR_IA32_SMBA_BW_BASE		0xc0000280
>> +#define MSR_IA32_PQR_PLZA_ASSOC		0xc00003fc
>>   #define MSR_IA32_L3_QOS_ABMC_CFG	0xc00003fd
>>   #define MSR_IA32_L3_QOS_EXT_CFG		0xc00003ff
>>   #define MSR_IA32_EVT_CFG_BASE		0xc0000400
>>   
>> +/* Lower 32 bits of MSR_IA32_PQR_PLZA_ASSOC */
>> +#define RMID_EN				BIT(31)
>> +/* Upper 32 bits of MSR_IA32_PQR_PLZA_ASSOC */
>> +#define CLOSID_EN			BIT(15)
>> +#define PLZA_EN				BIT(31)
>> +
> 
> This is unexpected. So far resctrl has only defined the MSR numbers in this file, not
> the individual fields. This seems a legitimate use of msr-index.h but creates inconsistency
> with how the fields of the other resctrl registers are defined. This may be ok so I am
> looking past this for now. Since I am not familiar with this use I am looking at other
> patterns of this and it seems that the register fields are usually defined right after
> the register to make this relationship clear and also use more verbose naming to establish
> this relationship ... I do not think such cryptic names should be used without context
> in such a global scope. Please compare with how other fields are defined at this scope.

Sure. Will use the names tony suggested.
https://lore.kernel.org/lkml/SJ1PR11MB6083C069F99FAB8A0BEB8518FC182@SJ1PR11MB6083.namprd11.prod.outlook.com/

I will also move the register "MSR_IA32_PQR_PLZA_ASSOC" together with 
the BIT definitions. That will break the sorting order. I hope that is not a problem.
> 
>> diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
>> index e3cfa0c10e92..1c2f87ffb0ea 100644
>> --- a/arch/x86/kernel/cpu/resctrl/internal.h
>> +++ b/arch/x86/kernel/cpu/resctrl/internal.h
>> @@ -222,6 +222,33 @@ union l3_qos_abmc_cfg {
>>   	unsigned long full;
>>   };
>>   
>> +/*
>> + * PLZA is programmed by writing to MSR_IA32_PQR_PLZA_ASSOC. Bitfield
>> + * layout for MSR_IA32_PQR_PLZA_ASSOC (Privilege Level Zero Association).
> 
> These comments are valuable to describe how resctrl should interact with
> this register so it would help to be specific and document any and all
> constraints.
> 
> For example, I seem to remember that all fields except PLZA_EN are required
> to be identical on all CPUs. Please document that and any other constraints here.
> 
>> + *
>> + * @rmid		: The RMID to be configured for PLZA.
> 
> What does "to be configured" mean? It seems to imply that when resctrl
> writes to @rmid then the setting does not take immediate effect but would
> take effect at some future "configure" time?
> 
>> + * @reserved1		: Reserved.
>> + * @rmid_en		: Associate RMID or not.
> 
> Please elaborate ... what is RMID associated with? What does "or not" imply?
> Here it will help to document relationship with MSR_IA32_PQR_ASSOC.
> 
>> + * @closid		: The CLOSID to be configured for PLZA.
>> + * @reserved2		: Reserved.
>> + * @closid_en		: Associate CLOSID or not.
> 
> Same comments as for RMID
> 
>> + * @reserved3		: Reserved.
>> + * @plza_en		: Configure PLZA or not.
> 
> plza_en implies "enable" but the comment mentions "configure". Considering
> the other fields are "to be configured" there seems to be relationship but
> that is not documented at all. For example, if @plza_en is 1 and resctrl modifies
> @rmid should resctrl write "1" to @plza_en again to "configure" the new RMID?
> 
> Please add specific detail to help understand how best to interact with this
> register.

Sure. Will rewrite this whole comment block.

Thanks
Babu


^ permalink raw reply

* Re: [PATCH] v2 Documentation: arch: fix brackets
From: Krzysztof Kozlowski @ 2026-06-12 17:26 UTC (permalink / raw)
  To: Manuel Ebner, Vineet Gupta, Jonathan Corbet, Shuah Khan,
	Peter Griffin, Alim Akhtar, Catalin Marinas, Will Deacon,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy, open list:SYNOPSYS ARC ARCHITECTURE,
	open list:DOCUMENTATION, open list,
	moderated list:ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES,
	open list:ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT)
  Cc: Randy Dunlap
In-Reply-To: <20260612095432.177759-2-manuelebner@mailbox.org>

On 12/06/2026 11:54, Manuel Ebner wrote:
> Add missing and remove needless parentheses, brackets and curly braces.
> Fix typos.
> 
> Signed-off-by: Manuel Ebner <manuelebner@mailbox.org>
> Reviewed-by: Randy Dunlap <rdunlap@infradead.org>
> ---
> [v1] -> [v2]
> "(i.e cache geometries)" -> "(e.g., cache geometries)"
> "Excer[t" -> "Excerpt"
> add Reviewed-by: Randy Dunlap

You mixed up the subject.

Please version your patches correctly, e.g. use b4 or git format-patch
-vX.

Best regards,
Krzysztof

^ permalink raw reply

* [PATCH] Documentation: leds: fix broken reference to the multicolor LED ABI
From: Shardul Deshpande @ 2026-06-12 17:15 UTC (permalink / raw)
  To: Nam Tran, Lee Jones, Pavel Machek, Jonathan Corbet, Shuah Khan,
	linux-leds, linux-doc, linux-kernel

The reference pointed to a non-existent .rst file.  The ABI file is named
sysfs-class-led-multicolor (without extension), so fix the reference to
match the actual file and resolve the warning from
tools/docs/documentation-file-ref-check.

Signed-off-by: Shardul Deshpande <iamsharduld@gmail.com>
---
 Documentation/leds/leds-lp5812.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/leds/leds-lp5812.rst b/Documentation/leds/leds-lp5812.rst
index c2a6368d5..12e757d45 100644
--- a/Documentation/leds/leds-lp5812.rst
+++ b/Documentation/leds/leds-lp5812.rst
@@ -20,7 +20,7 @@ Sysfs Interface
 ===============
 
 This driver uses the standard multicolor LED class interfaces defined
-in Documentation/ABI/testing/sysfs-class-led-multicolor.rst.
+in Documentation/ABI/testing/sysfs-class-led-multicolor.
 
 Each LP5812 LED output appears under ``/sys/class/leds/`` with its
 assigned label (for example ``LED_A``).
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH] Docs/mm/damon/design: fix a typo in the Address Unit section
From: Andrew Morton @ 2026-06-12 17:03 UTC (permalink / raw)
  To: Shardul Deshpande
  Cc: SeongJae Park, David Hildenbrand, Lorenzo Stoakes,
	Liam R. Howlett, Vlastimil Babka, Mike Rapoport,
	Suren Baghdasaryan, Michal Hocko, Jonathan Corbet, Shuah Khan,
	damon, linux-mm, linux-doc, linux-kernel
In-Reply-To: <20260612154054.720363-1-iamsharduld@gmail.com>

On Fri, 12 Jun 2026 21:10:54 +0530 Shardul Deshpande <iamsharduld@gmail.com> wrote:

> The "Address Unit" section misspelled the C type that the DAMON core
> layer uses for monitoring target address ranges.  Correct it to read
> "unsigned long".
> 
> ...
>
> --- a/Documentation/mm/damon/design.rst
> +++ b/Documentation/mm/damon/design.rst
> @@ -140,7 +140,7 @@ as Idle page tracking does.
>  Address Unit
>  ------------
>  
> -DAMON core layer uses ``unsinged long`` type for monitoring target address
> +DAMON core layer uses ``unsigned long`` type for monitoring target address
>  ranges.  In some cases, the address space for a given operations set could be
>  too large to be handled with the type.  ARM (32-bit) with large physical
>  address extension is an example.  For such cases, a per-operations set

Well we don't want our longs to be singed.

`grep -ri singed .' shows quite a few typos.  "assinged" is popular. 
Perhaps you could prepare a patch which fixes them all and adds
"assinged" to scripts/spelling.txt?

^ permalink raw reply

* [swap tier discussion] Re: [PATCH v3 2/4] mm/zswap: Implement proactive writeback
From: Shakeel Butt @ 2026-06-12 17:02 UTC (permalink / raw)
  To: YoungJun Park
  Cc: Yosry Ahmed, Hao Jia, Johannes Weiner, mhocko, tj, mkoutny,
	roman.gushchin, Nhat Pham, akpm, chengming.zhou, muchun.song,
	cgroups, linux-mm, linux-kernel, linux-doc, Hao Jia, chrisl,
	kasong, baoquan.he, joshua.hahnjy
In-Reply-To: <aiu06fbV7rWqY0Bm@yjaykim-PowerEdge-T330>

Changed the subject to separate the discussion on swap tiers.

On Fri, Jun 12, 2026 at 04:27:37PM +0900, YoungJun Park wrote:
> On Thu, Jun 11, 2026 at 12:12:40PM -0700, Shakeel Butt wrote:
> > On Thu, Jun 11, 2026 at 05:45:04PM +0000, Yosry Ahmed wrote:
> > > On Tue, Jun 09, 2026 at 01:19:13PM +0900, YoungJun Park wrote:
> > > > On Mon, Jun 08, 2026 at 03:27:07PM -0700, Yosry Ahmed wrote:
> > > > 
> > > > +Chris +Kairui +Baoquan
> > > > 
> > > > Hello
> > > > 
> > > > Thanks for inviting me to the discussion, Shakeel.
> > > > 
> > > > > > > > Youngjun is working on swap tiers. At the moment he is more interested in
> > > > > > > > allowing a specific swap device to a memcg or not. I can imagine in future there
> > > > > > > > will be use-cases where there will be a need to demote data on higher tier swap
> > > > > > > > to lower tier swap. What would be the appropriate interface?
> > > > 
> > > > Speaking of my work on swap tiers, I recently submitted a patch and am
> > > > currently considering memcg integration:
> > > > https://lore.kernel.org/linux-mm/20260527062247.3440692-1-youngjun.park@lge.com/
> > > > 
> > > > The future use-cases imagined above seem to align with this
> > > > direction. (BTW, I am currently waiting for reviews/feedback from the memcg
> > > > folks on this patch. Any reviews would be highly appreciated!)
> > > > 
> > > > We could potentially assign a target tier
> > > > for writeback within the existing memory.zswap.writeback interface. 
> > > > 
> > > > For instance, '0' could mean disabled, while non-zero values could represent
> > > > specific tiers, which would maintain backward compatibility with the current
> > > > version. Alternatively, if zswap is treated as the default top tier, 
> > > > the `memory.swap.tiers` interface could potentially replace `memory.zswap.writeback`.
> > > > 
> > > > Furthermore, this could be expanded so that each swap tier can demote data
> > > > to the next tier, allowing user-triggered demotion between swap tiers.
> > > > 
> > > > Based on the current patch's ideas combined with my swap tiers concept:
> > > > 
> > > > Assuming a hierarchy like:
> > > > zswap -> tier1 (SSD swap) -> tier2 (HDD swap) -> tier3 (Network swap)
> > > > 
> > > > We could configure the active tiers via a setting like `memory.swap.tiers`
> > > > (tier2 enabled, tier3 enabled).
> > > > 
> > > > For example, the concept of `echo "100M zswap_writeback_only > memory.reclaim"`
> > > > could be extended. A user could run `echo "100M tier2 > memory.reclaim"`
> > > > to explicitly trigger demotion from tier2 to tier3.
> > > > (BTW, if we combine these features, my personal preference for the keyword
> > > > format would be `<size> <demote_prefix><tier_name>`. I think it would be
> > > > better to explicitly indicate that it is a swap demotion by using a specific
> > > > prefix followed by the tier name. 
> > > > Making the demote prefix a separate key is also possible.)
> > > 
> > > I am not sure if proactive demotion between swap tiers would be driven
> > > by memory.reclaim, I am guessing a new interface might be more suitable.
> > > But yes, you are right that it's very possible that
> > > 'zswap_writeback_only' with memory.reclaim will become obsolete once
> > > swap tiering matures and starts supporting things like proactive
> > > demotion.
> > > 
> > > Part of me wants to wait until the swap tiering interfaces are figured
> > > out so that we don't end up with redundant interfaces, but I also don't
> > > want to hold Hao's work since it doesn't directly depend on swap
> > > tiering.
> > However I would need zswap folks (Yosry & Nhat) help in figuring out swap tiers
> > interfaces. Zswap is the current top tier swap usage in real world. I want
> > zswap users to easily (and hopefully transparently) migrate to swap tiers.
> 
> > > Shakeel, how do you want to handle this? I think there's a few options:
> > > 
> > > 1. Add zswap_writeback_only now, and when we have swap tiering demotion
> > > it becomes a redundant interface, like memory.zswap.writeback -- or
> > > maybe we try to deprecate both of them at that point. It's difficult to
> > > remove interfaces tho, but maybe easier to stop supporting
> > > zswap_writeback_only.
> > > 
> > > 2. Add zswap_writeback_only behind an experimental config option, to
> > > unblock development but have a line of sight to dropping support once we
> > > have a swap tiering interface.
> > > 
> > > 3. Wait until we figure out the swap tiering interfaces and then add
> > > the proactive zswap writeback as part of it.
> > > 
> > > WDYT?
> > 
> > Is Hao's work needed for some followup work/development? The earliest Hao's
> > work can land is 7.3, so if we aim to figure out swap tiering interfaces in the next
> > couple of weeks then option 3 is the way to go. If swap tiers take more time
> > then we can discuss other options as well.
> > However I would need zswap folks (Yosry & Nhat) help in figuring out swap tiers
> > interfaces. Zswap is the current top tier swap usage in real world. I want
> > zswap users to easily (and hopefully transparently) migrate to swap tiers.
> 
> I am looking forward to the discussion on this interface!
> 
> To help boost the discussion and progress, I would like to share a few of my thoughts.
> We could either introduce a new interface to trigger demotion/promotion,
> or we could reuse the existing one (using tier just internally)
> 
> Based on the memcg interface currently proposed in swap_tier
> (memory.swap.tiers, memory.swap.tiers.effective), I think it aligns well
> with the current direction. It provides a foundation for selectively
> targeting devices in tier order.

Here, instead of a cpuset-like interface, we may want a more zswap-like
interface where you can put a limit on the usage, i.e. memory.swap.tier*.max.
We can start by allowing only two values, i.e. 0 and max, which effectively
will be the same as what you need.

I will respond to your other points later when I have time.

> 
> To summarize the discussions so far, the following points align well.
> 
> - Per-cgroup swap control, as I suggested.
> - Proactive zswap writeback (Hao's usecase)
> - Swap device target demotion (even better if it can be selective), as you mentioned:
>   https://lore.kernel.org/linux-mm/aicZ-5GX9De3MAU7@linux.dev/
> - Virtual Swap on/off in the future, as Nhat mentioned:
>   https://lore.kernel.org/linux-mm/20260528212955.1912856-1-nphamcs@gmail.com/
> - The memory.zswap.writeback alternative (no hierarchy model conflict)
> - zswap is first swap tier.
> - Promotion. (Also better for selective usage)
> - tier based swap policy (e.g. round-robin...)
> 
> To accelerate this work, I believe we should reach a consensus and
> merge the currently proposed swap_tier interface :)
> 
> If the above approach is difficult, I would like to suggest an
> alternative for progress with the memcg interfaces removed:
> 
> 1) We could make zswap the first tier and create
> a use case where memory.zswap.writeback internally is handled by tier logic.
> 
> 2) Or simply merge the swap_tier infrastructure itself first.
> 
> This would allow the swap_tier infrastructure to be merged and discussed
> more easily.
> 
> If it takes longer to adopt swap_tier anyway, by doing so we can progress to the
> next step as an experimental feature.
> 
> - Apply per-cgroup swap as an experimental (debugfs) feature.
> - Apply Hao's use case experimentally or as it is as Yosry suggested.
> (future migration to swap tier)
> 
> What do you think?
> 
> (FYI: My emails to kernel.org are failing due to internal server issues.)
> 
> Thank you 
> Youngjun Park

^ permalink raw reply

* Re: [PATCH v3 01/12] x86/resctrl: Support Privilege-Level Zero Association (PLZA)
From: Moger, Babu @ 2026-06-12 17:00 UTC (permalink / raw)
  To: Reinette Chatre, Babu Moger, corbet, tony.luck, Dave.Martin,
	james.morse, tglx, bp, dave.hansen
  Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
	feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
	seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
	kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
	linux-kernel, eranian, peternewman
In-Reply-To: <a737ae9e-9cbc-46bb-b565-0b888e69f0ea@amd.com>

Hi Reinette,

Missed typo again.

On 6/12/2026 11:56 AM, Moger, Babu wrote:
> Hi Reinette,
> 
> On 6/11/2026 6:23 PM, Reinette Chatre wrote:
>> Hi Babu,
>>
>> On 4/30/26 4:24 PM, Babu Moger wrote:
>>> Customers have identified an issue while using the QoS resource Control
>>
>> "Control" -> "control"?
>>
> 
> ack
> 
>>> feature. If a memory bandwidth associated with a CLOSID is aggressively
>>
>> "a memory bandwidth" -> "memory bandwidth"?
> 
> ack.
> 
>>
>>> throttled, and it moves into Kernel mode, the Kernel operations are also
>>
>> What does "it" refer to here? From text it seems to be the "CLOSID" 
>> but that
>> does not sound right? Should "it" instead be something like "a task 
>> with that
>> CLOSID"?
> 
> sure.
> 
>>
>> "Kernel" -> "kernel"?
> 
> ack.
>>
>>> aggressively throttled. This can stall forward progress and eventually
>>> degrade overall system performance. AMD hardware supports a feature
>>> Privilege-Level Zero Association (PLZA) to change the association of the
>>> thread as soon as it begins executing.
>>
>> "change the association of the thread as soon as it begins executing." 
>> I am
>> not able to parse this.
> 
> How about ?
> 
> Customers have identified an issue while using the QoS resource Control

Control > control

Thanks

Babu


^ permalink raw reply

* Re: [PATCH v3 01/12] x86/resctrl: Support Privilege-Level Zero Association (PLZA)
From: Moger, Babu @ 2026-06-12 16:56 UTC (permalink / raw)
  To: Reinette Chatre, Babu Moger, corbet, tony.luck, Dave.Martin,
	james.morse, tglx, bp, dave.hansen
  Cc: skhan, x86, mingo, hpa, akpm, rdunlap, pawan.kumar.gupta,
	feng.tang, dapeng1.mi, kees, elver, lirongqing, paulmck, bhelgaas,
	seanjc, alexandre.chartre, yazen.ghannam, peterz, chang.seok.bae,
	kim.phillips, xin, naveen, thomas.lendacky, linux-doc,
	linux-kernel, eranian, peternewman
In-Reply-To: <081b5cd6-37a3-4aaf-862b-b41e9536bb66@intel.com>

Hi Reinette,

On 6/11/2026 6:23 PM, Reinette Chatre wrote:
> Hi Babu,
> 
> On 4/30/26 4:24 PM, Babu Moger wrote:
>> Customers have identified an issue while using the QoS resource Control
> 
> "Control" -> "control"?
> 

ack

>> feature. If a memory bandwidth associated with a CLOSID is aggressively
> 
> "a memory bandwidth" -> "memory bandwidth"?

ack.

> 
>> throttled, and it moves into Kernel mode, the Kernel operations are also
> 
> What does "it" refer to here? From text it seems to be the "CLOSID" but that
> does not sound right? Should "it" instead be something like "a task with that
> CLOSID"?

sure.

> 
> "Kernel" -> "kernel"?

ack.
> 
>> aggressively throttled. This can stall forward progress and eventually
>> degrade overall system performance. AMD hardware supports a feature
>> Privilege-Level Zero Association (PLZA) to change the association of the
>> thread as soon as it begins executing.
> 
> "change the association of the thread as soon as it begins executing." I am
> not able to parse this.

How about ?

Customers have identified an issue while using the QoS resource Control
feature. If memory bandwidth associated with a CLOSID is aggressively
throttled, and a task with that CLOSID moves into kernel mode, the 
kernel operations are also aggressively throttled. This can stall 
forward progress and eventually degrade overall system performance.
AMD hardware supports a feature Privilege-Level Zero Association (PLZA)
to change the CPU association at the user-to-kernel transition, so the 
kernel execution can use a different association than user mode.

Privilege-Level Zero Association (PLZA) allows the user to specify a 
CLOSID and/or RMID associated with execution in Privilege-Level Zero. 
When enabled on a CPU, as the CPU enters Privilege-Level Zero, 
allocation and monitoring for that CPU will be associated with the PLZA 
CLOSID and/or RMID. Otherwise, the CPU will be associated with the 
CLOSID and RMID given by PQR_ASSOC.


>>
>> Privilege-Level Zero Association (PLZA) allows the user to specify a CLOSID
>> and/or RMID associated with execution in Privilege-Level Zero. When enabled
>> on a HW thread, when the thread enters Privilege-Level Zero, transactions
> 
> Could you please use consistent terminology throughout this series? This patch
> uses "HW thread"/"thread", the next patch then switches to "logical processor",
> and then by patch #4 the term seems to settle on "CPU". Could this just be
> "CPU" from here and throughout series to be consistent and easier to read?
> 
> What is meant with "transactions"?  Is this just about memory transactions?
> Using this term combined with earlier "memory bandwidth" related problem description
> hints that this feature just impacts memory bandwidth allocation but from what
> I understand this impacts all allocation (CLOSID of all resources) and monitoring.
> 
> Could "transactions" be replaced with "allocation and monitoring" and be
> more accurate?
> 
>> associated with that thread will be associated with the PLZA CLOSID and/or
>> RMID. Otherwise, the HW thread will be associated with the CLOSID and RMID
>> identified by PQR_ASSOC.
>>
>> Add PLZA support to resctrl and introduce a kernel parameter that allows
>> enabling or disabling the feature at boot time.
>>
>> The GLBE feature details are documented in:
> 
> "GLBE" -> "PLZA"?
> 

ack.

>>
>>    AMD64 Zen6 Platform Quality of Service (PQOS) Extensions:
>>    Publication # 69193 Revision: 1.00, Issue Date: March 2026
>>
>> available at https://bugzilla.kernel.org/show_bug.cgi?id=206537
> 
> Please follow same style as what you used in the assignable counter enabling where
> this URL is provided via a "Link:" tag and then the text can refer to it. Specifically,
> 	Link: https://bugzilla.kernel.org/show_bug.cgi?id=206537 # [1]
> 

Sure.

>>
>> Signed-off-by: Babu Moger <babu.moger@amd.com>
>> ---
>> v3: Code did not change. Patch order changed.
>>      Added documentation link.
>>
>> v2: Rebased on top of the latest tip.
>> ---
>>   Documentation/admin-guide/kernel-parameters.txt | 2 +-
>>   arch/x86/include/asm/cpufeatures.h              | 1 +
>>   arch/x86/kernel/cpu/resctrl/core.c              | 2 ++
>>   arch/x86/kernel/cpu/scattered.c                 | 1 +
> 
> Please split changes to other subsystems and make these changes
> obvious with their own subject prefix to avoid sneaking changes into
> other subsystems via resctrl.
> 

Ok. Will be two patches.
1. For Documentation/admin-guide/kernel-parameters.txt
2.  arch/x86/include/asm/cpufeatures.h
     arch/x86/kernel/cpu/resctrl/core.c
     arch/x86/kernel/cpu/scattered.c

thanks
Babu

^ permalink raw reply

* Re: configurable block error injection v5
From: Jens Axboe @ 2026-06-12 16:44 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jonathan Corbet, Damien Le Moal, Hannes Reinecke, Keith Busch,
	linux-block, linux-doc
In-Reply-To: <20260611140703.2401204-1-hch@lst.de>


On Thu, 11 Jun 2026 16:06:43 +0200, Christoph Hellwig wrote:
> this series adds a new configurable block error injection facility.
> We already have a few ways to inject block errors, but unfortunately most
> of them are either not very useful or hard to use, or both:
> 
>  - The fail_make_request failure injection point can't distinguish
>    different commands, different ranges in the file and can only inject
>    plain I/O errors.
>  - the should_fail_bio 'dynamic' failure injection has all the same issues
>    as fail_make_request
>  - dm-error can only fail all commands in the table using BLK_STS_IOERR
>    and requires setting up a new block device
>  - dm-flakey and dm-dust allow all kinds of configurability, but still
>    don't have good error selection, no good support for non-read/write
>    commands and are limited to the dm table alignment requirements,
>    which for zoned devices enforces setting them up for an entire zone.
>    They also once again require setting up a stacked block device,
>    which is really annoying in harnesses like xfstests
> 
> [...]

Applied, thanks!

[1/4] block: add a macro to initialize the status table
      commit: 8c8ebed16581faf3b3e97336aeca3d8226c4435f
[2/4] block: add a "tag" for block status codes
      commit: ce351560b714403acfdeed86ef96675d229da837
[3/4] block: add a str_to_blk_op helper
      commit: d39a63ead381c7ee93cd938ea2d759c17343b522
[4/4] block: add configurable error injection
      commit: e8dcf2d142bd720c8334233ad6cfdf00f0e76b7f

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH v3 1/4] mm/zswap: Make shrink_worker writeback cursor per-memcg
From: Shakeel Butt @ 2026-06-12 16:40 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: Hao Jia, Nhat Pham, akpm, tj, hannes, mhocko, mkoutny,
	chengming.zhou, muchun.song, roman.gushchin, cgroups, linux-mm,
	linux-kernel, linux-doc, Hao Jia
In-Reply-To: <airypNnKrJJ54k_0@google.com>

On Thu, Jun 11, 2026 at 05:39:16PM +0000, Yosry Ahmed wrote:
> On Tue, Jun 09, 2026 at 11:18:26AM +0800, Hao Jia wrote:
> > 
> > 
> > On 2026/6/9 02:01, Nhat Pham wrote:
> > > On Mon, Jun 8, 2026 at 9:48 AM Yosry Ahmed <yosry@kernel.org> wrote:
> > > > 
> > > > > But OTOH, this does seem like a recipe for inefficient reclaim. We
> > > > > might exhaust hotter memory of a cgroup while sparing colder memory of
> > > > > another cgroup... But maybe if they're all cold anyway, then who
> > > > > cares, and eventually you'll get to the cold stuff of other child?
> > > > 
> > > > Forgot to respond to this part, the unfairness is limited to the batch
> > > > size per-invocation, so it should be fine as long as you don't divide
> > > > the amount over 100 iterations for some reason. Also yes, all memory
> > > > in zswap is cold, the relative coldness is not that important (e.g.
> > > > compared to relative coldness during reclaim).
> > > 
> > > Ok then yeah, I think we should shelve per-memcg cursor for the next
> > > version. Down the line, if we have more data that unfairness is an
> > > issue, we can always fix it. One step at a time :)
> > 
> > Thanks a lot to Yosry, Nhat, and Shakeel for the great suggestions!
> > 
> > Let me summarize what I plan to do in the next version to make sure we are
> > on the same page:
> > 
> >  - Drop the per-memcg cursor and keep the root cgroup cursor
> > (zswap_next_shrink) logic intact.
> >  - Stick to using the zswap_writeback_only key, and change the proactive
> > writeback size to use the compressed size.
> >  - Consolidate and reuse the logic between shrink_worker() and
> > shrink_memcg(). Enable batch writeback in the shrink_worker() path, while
> > keeping the writeback behavior in the zswap_store() path unchanged.
> > 
> > Please let me know if I missed or misunderstood anything. Thanks again for
> > clearing things up!
> 
> Sorry for the late response, yes I think this makes sense. However, I
> have some comment about how this interacts with swap tiering, let me
> reply to the other thread.
> 

I think the swap tiers interaction will be figured out over the next cycle.
However, Hao can/should continue to push, and we may decide to let it in
orthogonal to swap tiers.

^ permalink raw reply

* [PATCH v5 5/5] KVM: selftests: Add nested pre-fault test for arm64
From: Jack Thomson @ 2026-06-12 16:23 UTC (permalink / raw)
  To: maz, oupton, pbonzini
  Cc: joey.gouly, seiden, suzuki.poulose, yuzenghui, catalin.marinas,
	will, shuah, corbet, vladimir.murzin, linux-arm-kernel, kvmarm,
	kvm, linux-kernel, linux-kselftest, linux-doc, isaku.yamahata,
	Jack Thomson
In-Reply-To: <20260612162354.73378-1-jackabt.amazon@gmail.com>

From: Jack Thomson <jackabt@amazon.com>

Add an arm64 nested-virt selftest for KVM_PRE_FAULT_MEMORY. The guest
enters vEL1 and exits to userspace with a nested/shadow stage-2 MMU as
the vCPU's last-run context.

Before prefaulting, userspace enables HCR_EL2.VM and points VTTBR_EL2 at
an empty nested stage-2 root. A prefault implementation that incorrectly
treats the userspace GPA as an L2 IPA will fail the ioctl; the correct
path swaps to the canonical stage-2 and succeeds.

Restore the original nested state before resuming the guest, then touch
the prefaulted range to check that vEL1 still runs correctly.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 tools/testing/selftests/kvm/Makefile.kvm      |   1 +
 .../kvm/arm64/nv_pre_fault_memory_test.c      | 200 ++++++++++++++++++
 2 files changed, 201 insertions(+)
 create mode 100644 tools/testing/selftests/kvm/arm64/nv_pre_fault_memory_test.c

diff --git a/tools/testing/selftests/kvm/Makefile.kvm b/tools/testing/selftests/kvm/Makefile.kvm
index 4609d8f23e38..63d79245b47d 100644
--- a/tools/testing/selftests/kvm/Makefile.kvm
+++ b/tools/testing/selftests/kvm/Makefile.kvm
@@ -170,6 +170,7 @@ TEST_GEN_PROGS_arm64 += arm64/debug-exceptions
 TEST_GEN_PROGS_arm64 += arm64/hello_el2
 TEST_GEN_PROGS_arm64 += arm64/host_sve
 TEST_GEN_PROGS_arm64 += arm64/hypercalls
+TEST_GEN_PROGS_arm64 += arm64/nv_pre_fault_memory_test
 TEST_GEN_PROGS_arm64 += arm64/external_aborts
 TEST_GEN_PROGS_arm64 += arm64/page_fault_test
 TEST_GEN_PROGS_arm64 += arm64/psci_test
diff --git a/tools/testing/selftests/kvm/arm64/nv_pre_fault_memory_test.c b/tools/testing/selftests/kvm/arm64/nv_pre_fault_memory_test.c
new file mode 100644
index 000000000000..2bbd5540599c
--- /dev/null
+++ b/tools/testing/selftests/kvm/arm64/nv_pre_fault_memory_test.c
@@ -0,0 +1,200 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * nv_pre_fault_memory_test - Test KVM_PRE_FAULT_MEMORY on a vCPU whose
+ * last-run context is nested.
+ *
+ * The guest starts at vEL2, mirrors its EL2 translation regime into the
+ * real EL1 registers, drops HCR_EL2.TGE and ERETs to vEL1, then exits to
+ * userspace from vEL1 so that the vCPU's last-run context selects a
+ * shadow stage-2 MMU. Userspace then enables an empty nested stage-2
+ * before prefaulting. Prefaulting must target the canonical stage-2,
+ * regardless of the vCPU's nested state.
+ */
+#include "kvm_util.h"
+#include "processor.h"
+#include "test_util.h"
+#include "ucall.h"
+
+#include <asm/sysreg.h>
+#include <linux/sizes.h>
+
+#define TEST_MEM_SLOT		10
+#define NESTED_S2_ROOT_SLOT	11
+#define TEST_MEM_SIZE		SZ_2M
+#define TEST_MEM_GPA		SZ_1G
+#define NESTED_S2_ROOT_GPA	(TEST_MEM_GPA + TEST_MEM_SIZE)
+
+struct nested_s2_state {
+	u64 hcr_el2;
+	u64 vttbr_el2;
+};
+
+static void guest_el1_code(void)
+{
+	u64 offset;
+
+	GUEST_ASSERT_EQ(get_current_el(), 1);
+
+	/* Exit to userspace with the vEL1 (nested) context live. */
+	GUEST_SYNC(1);
+
+	/*
+	 * Touch the prefaulted range. vstage-2 is disabled, so the shadow
+	 * stage-2 is a 1:1 view of the canonical IPA space.
+	 */
+	for (offset = 0; offset < TEST_MEM_SIZE; offset += SZ_4K)
+		READ_ONCE(*(u64 *)(TEST_MEM_GPA + offset));
+
+	GUEST_DONE();
+}
+
+static void guest_code(void)
+{
+	u64 sp;
+
+	GUEST_ASSERT_EQ(get_current_el(), 2);
+
+	/*
+	 * Mirror the EL2 translation regime into the real EL1 registers so
+	 * that vEL1 runs on the test's stage-1 page tables. With E2H=1, the
+	 * _EL1 accessors read the EL2 registers, and the _EL12 accessors
+	 * write the real EL1 registers.
+	 */
+	write_sysreg_s(read_sysreg(sctlr_el1), SYS_SCTLR_EL12);
+	write_sysreg_s(read_sysreg(tcr_el1), SYS_TCR_EL12);
+	write_sysreg_s(read_sysreg(ttbr0_el1), SYS_TTBR0_EL12);
+	write_sysreg_s(read_sysreg(mair_el1), SYS_MAIR_EL12);
+	write_sysreg_s(read_sysreg(cpacr_el1), SYS_CPACR_EL12);
+
+	/* Run vEL1 on the same stack. */
+	asm volatile("mov %0, sp" : "=r"(sp));
+	write_sysreg(sp, sp_el1);
+
+	/*
+	 * Drop TGE so that vEL1 is a nested context rather than host EL0.
+	 * KVM backs it with a shadow stage-2 MMU even though vstage-2 is
+	 * disabled (HCR_EL2.VM=0).
+	 */
+	write_sysreg(read_sysreg(hcr_el2) & ~HCR_EL2_TGE, hcr_el2);
+	isb();
+
+	write_sysreg(PSR_MODE_EL1h | PSR_F_BIT | PSR_I_BIT | PSR_A_BIT |
+		     PSR_D_BIT, spsr_el2);
+	write_sysreg((u64)guest_el1_code, elr_el2);
+	asm volatile("eret");
+
+	GUEST_ASSERT(false);
+}
+
+static void pre_fault(struct kvm_vcpu *vcpu, u64 gpa, u64 size)
+{
+	struct kvm_pre_fault_memory range = {
+		.gpa = gpa,
+		.size = size,
+	};
+	int ret;
+
+	do {
+		ret = __vcpu_ioctl(vcpu, KVM_PRE_FAULT_MEMORY, &range);
+	} while (ret < 0 && errno == EINTR);
+
+	TEST_ASSERT(!ret, "KVM_PRE_FAULT_MEMORY failed, ret: %d errno: %d",
+		    ret, errno);
+	TEST_ASSERT_EQ(range.size, 0);
+}
+
+static struct nested_s2_state enable_empty_nested_s2(struct kvm_vcpu *vcpu)
+{
+	struct nested_s2_state state = {
+		.hcr_el2 = vcpu_get_reg(vcpu, KVM_ARM64_SYS_REG(SYS_HCR_EL2)),
+		.vttbr_el2 = vcpu_get_reg(vcpu,
+					   KVM_ARM64_SYS_REG(SYS_VTTBR_EL2)),
+	};
+
+	TEST_ASSERT(!(state.hcr_el2 & HCR_EL2_TGE),
+		    "vCPU should be in nested/vEL1 context");
+
+	vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_VTTBR_EL2),
+		     NESTED_S2_ROOT_GPA);
+	vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_HCR_EL2),
+		     state.hcr_el2 | HCR_EL2_VM);
+
+	return state;
+}
+
+static void restore_nested_s2(struct kvm_vcpu *vcpu,
+			      struct nested_s2_state *state)
+{
+	vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_HCR_EL2), state->hcr_el2);
+	vcpu_set_reg(vcpu, KVM_ARM64_SYS_REG(SYS_VTTBR_EL2),
+		     state->vttbr_el2);
+}
+
+int main(void)
+{
+	struct nested_s2_state s2;
+	struct kvm_vcpu_init init;
+	struct kvm_vcpu *vcpu;
+	struct kvm_vm *vm;
+	struct ucall uc;
+	u64 npages;
+
+	TEST_REQUIRE(kvm_check_cap(KVM_CAP_ARM_EL2));
+	TEST_REQUIRE(kvm_check_cap(KVM_CAP_PRE_FAULT_MEMORY));
+
+	vm = vm_create(1);
+
+	kvm_get_default_vcpu_target(vm, &init);
+	init.features[0] |= BIT(KVM_ARM_VCPU_HAS_EL2);
+	vcpu = aarch64_vcpu_add(vm, 0, &init, guest_code);
+	kvm_arch_vm_finalize_vcpus(vm);
+
+	npages = TEST_MEM_SIZE / vm->page_size;
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, TEST_MEM_GPA,
+				    TEST_MEM_SLOT, npages, 0);
+	virt_map(vm, TEST_MEM_GPA, TEST_MEM_GPA, npages);
+
+	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+				    NESTED_S2_ROOT_GPA, NESTED_S2_ROOT_SLOT,
+				    1, 0);
+
+	/* Run the guest until it has ERET'd from vEL2 to vEL1. */
+	vcpu_run(vcpu);
+	switch (get_ucall(vcpu, &uc)) {
+	case UCALL_SYNC:
+		TEST_ASSERT_EQ(uc.args[1], 1);
+		break;
+	case UCALL_ABORT:
+		REPORT_GUEST_ASSERT(uc);
+		break;
+	default:
+		TEST_FAIL("Unhandled ucall: %ld", uc.cmd);
+	}
+
+	/*
+	 * The vCPU's last-run context is vEL1, backed by a shadow stage-2
+	 * MMU. Enable nested stage-2 with an empty root so that the ioctl
+	 * fails if it tries to interpret the userspace GPA as an L2 IPA.
+	 * Prefault in two halves so that the second ioctl exercises a
+	 * repeated shadow-MMU attach and canonical stage-2 swap.
+	 */
+	s2 = enable_empty_nested_s2(vcpu);
+	pre_fault(vcpu, TEST_MEM_GPA, TEST_MEM_SIZE / 2);
+	pre_fault(vcpu, TEST_MEM_GPA + TEST_MEM_SIZE / 2, TEST_MEM_SIZE / 2);
+	restore_nested_s2(vcpu, &s2);
+
+	/* Resume at vEL1 and touch the prefaulted range. */
+	vcpu_run(vcpu);
+	switch (get_ucall(vcpu, &uc)) {
+	case UCALL_DONE:
+		break;
+	case UCALL_ABORT:
+		REPORT_GUEST_ASSERT(uc);
+		break;
+	default:
+		TEST_FAIL("Unhandled ucall: %ld", uc.cmd);
+	}
+
+	kvm_vm_free(vm);
+	return 0;
+}
-- 
2.43.0


^ permalink raw reply related

* [PATCH v5 4/5] KVM: selftests: Add option for different backing in pre-fault tests
From: Jack Thomson @ 2026-06-12 16:23 UTC (permalink / raw)
  To: maz, oupton, pbonzini
  Cc: joey.gouly, seiden, suzuki.poulose, yuzenghui, catalin.marinas,
	will, shuah, corbet, vladimir.murzin, linux-arm-kernel, kvmarm,
	kvm, linux-kernel, linux-kselftest, linux-doc, isaku.yamahata,
	Jack Thomson
In-Reply-To: <20260612162354.73378-1-jackabt.amazon@gmail.com>

From: Jack Thomson <jackabt@amazon.com>

Add a -s option to specify different memory backing types for the
pre-fault tests (e.g. anonymous, hugetlb), allowing testing of the
pre-fault functionality across different memory configurations.

Signed-off-by: Jack Thomson <jackabt@amazon.com>
---
 .../selftests/kvm/pre_fault_memory_test.c     | 51 +++++++++++++------
 1 file changed, 36 insertions(+), 15 deletions(-)

diff --git a/tools/testing/selftests/kvm/pre_fault_memory_test.c b/tools/testing/selftests/kvm/pre_fault_memory_test.c
index 9f5f0d1a5db1..c850cf28e86a 100644
--- a/tools/testing/selftests/kvm/pre_fault_memory_test.c
+++ b/tools/testing/selftests/kvm/pre_fault_memory_test.c
@@ -45,6 +45,7 @@ struct slot_worker_data {
 	struct kvm_vm *vm;
 	gpa_t gpa;
 	u32 flags;
+	enum vm_mem_backing_src_type mem_backing_src;
 	bool worker_ready;
 	bool prefault_ready;
 	bool recreate_slot;
@@ -65,14 +66,16 @@ static void *delete_slot_worker(void *__data)
 	while (!READ_ONCE(data->recreate_slot))
 		cpu_relax();
 
-	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS, data->gpa,
+	vm_userspace_mem_region_add(vm, data->mem_backing_src, data->gpa,
 				    TEST_SLOT, test_config.test_num_pages, data->flags);
 
 	return NULL;
 }
 
 static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset,
-			     u64 size, u64 expected_left, bool private)
+			     u64 size, u64 expected_left,
+			     enum vm_mem_backing_src_type mem_backing_src,
+			     bool private)
 {
 	struct kvm_pre_fault_memory range = {
 		.gpa = base_gpa + offset,
@@ -83,6 +86,7 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset,
 		.vm = vcpu->vm,
 		.gpa = base_gpa,
 		.flags = private ? KVM_MEM_GUEST_MEMFD : 0,
+		.mem_backing_src = mem_backing_src,
 	};
 	bool slot_recreated = false;
 	pthread_t slot_worker;
@@ -172,11 +176,13 @@ static void pre_fault_memory(struct kvm_vcpu *vcpu, u64 base_gpa, u64 offset,
 struct test_params {
 	unsigned long vm_type;
 	bool private;
+	enum vm_mem_backing_src_type mem_backing_src;
 };
 
 static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 {
 	gpa_t gpa, gva, alignment, guest_page_size, host_page_size;
+	gpa_t backing_src_pagesz, mem_page_size;
 	struct test_params *p = arg;
 	const struct vm_shape shape = {
 		.mode = guest_mode,
@@ -188,24 +194,28 @@ static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 	struct ucall uc;
 
 	pr_info("Testing guest mode: %s\n", vm_guest_mode_string(guest_mode));
+	pr_info("Testing memory backing src type: %s\n",
+		vm_mem_backing_src_alias(p->mem_backing_src)->name);
 
 	vm = vm_create_shape_with_one_vcpu(shape, &vcpu, guest_code);
 
 	guest_page_size = vm_guest_mode_params[guest_mode].page_size;
 	host_page_size = getpagesize();
+	backing_src_pagesz = get_backing_src_pagesz(p->mem_backing_src);
+	mem_page_size = max(host_page_size, backing_src_pagesz);
 
 	test_config.page_size = guest_page_size;
 	test_config.test_size = align_up(TEST_BASE_SIZE + test_config.page_size,
-					 host_page_size);
+					 mem_page_size);
 	test_config.test_num_pages = vm_calc_num_guest_pages(vm->mode, test_config.test_size);
 
 	gpa = (vm->max_gfn - test_config.test_num_pages) * test_config.page_size;
 	alignment = SZ_2M;
-	alignment = max(alignment, host_page_size);
+	alignment = max(alignment, mem_page_size);
 	gpa = align_down(gpa, alignment);
 	gva = gpa & ((1ULL << (vm->va_bits - 1)) - 1);
 
-	vm_userspace_mem_region_add(vm, VM_MEM_SRC_ANONYMOUS,
+	vm_userspace_mem_region_add(vm, p->mem_backing_src,
 				    gpa, TEST_SLOT, test_config.test_num_pages,
 				    p->private ? KVM_MEM_GUEST_MEMFD : 0);
 	virt_map(vm, gva, gpa, test_config.test_num_pages);
@@ -213,14 +223,18 @@ static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 	if (p->private)
 		vm_mem_set_private(vm, gpa, test_config.test_size);
 
-	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0, p->private);
+	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0,
+			 p->mem_backing_src, p->private);
 	/* Retry the same range after the first prefault attempt. */
-	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0, p->private);
+	pre_fault_memory(vcpu, gpa, 0, test_config.test_size, 0,
+			 p->mem_backing_src, p->private);
 	pre_fault_memory(vcpu, gpa,
 			 test_config.test_size - host_page_size,
-			 host_page_size * 2, host_page_size, p->private);
+			 host_page_size * 2, host_page_size,
+			 p->mem_backing_src, p->private);
 	pre_fault_memory(vcpu, gpa, test_config.test_size,
-			 host_page_size, host_page_size, p->private);
+			 host_page_size, host_page_size,
+			 p->mem_backing_src, p->private);
 
 	vcpu_args_set(vcpu, 1, gva);
 
@@ -249,11 +263,13 @@ static void __test_pre_fault_memory(enum vm_guest_mode guest_mode, void *arg)
 	kvm_vm_free(vm);
 }
 
-static void test_pre_fault_memory(unsigned long vm_type, bool private)
+static void test_pre_fault_memory(unsigned long vm_type, enum vm_mem_backing_src_type backing_src,
+				  bool private)
 {
 	struct test_params p = {
 		.vm_type = vm_type,
 		.private = private,
+		.mem_backing_src = backing_src,
 	};
 
 	if (vm_type && !(kvm_check_cap(KVM_CAP_VM_TYPES) & BIT(vm_type))) {
@@ -267,23 +283,28 @@ static void test_pre_fault_memory(unsigned long vm_type, bool private)
 static void help(char *name)
 {
 	puts("");
-	printf("usage: %s [-h] [-m mode]\n", name);
+	printf("usage: %s [-h] [-m mode] [-s mem-type]\n", name);
 	puts("");
 	guest_modes_help();
+	backing_src_help("-s");
 	puts("");
 }
 
 int main(int argc, char *argv[])
 {
+	enum vm_mem_backing_src_type backing = DEFAULT_VM_MEM_SRC;
 	int opt;
 
 	guest_modes_append_default();
 
-	while ((opt = getopt(argc, argv, "hm:")) != -1) {
+	while ((opt = getopt(argc, argv, "hm:s:")) != -1) {
 		switch (opt) {
 		case 'm':
 			guest_modes_cmdline(optarg);
 			break;
+		case 's':
+			backing = parse_backing_src_type(optarg);
+			break;
 		case 'h':
 		default:
 			help(argv[0]);
@@ -293,10 +314,10 @@ int main(int argc, char *argv[])
 
 	TEST_REQUIRE(kvm_check_cap(KVM_CAP_PRE_FAULT_MEMORY));
 
-	test_pre_fault_memory(0, false);
+	test_pre_fault_memory(0, backing, false);
 #ifdef __x86_64__
-	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, false);
-	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, true);
+	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, backing, false);
+	test_pre_fault_memory(KVM_X86_SW_PROTECTED_VM, backing, true);
 #endif
 	return 0;
 }
-- 
2.43.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox