Linux Documentation

Linux Documentation
 help / color / mirror / Atom feed

* Re: [PATCH v5 3/4] PCI: endpoint: Add support for DOE initialization and setup in EPC core
From: Bjorn Helgaas @ 2026-06-11 19:12 UTC (permalink / raw)
  To: Aksh Garg
  Cc: linux-pci, linux-doc, mani, kwilczynski, bhelgaas, corbet, kishon,
	skhan, lukas, cassel, alistair, linux-arm-kernel, linux-kernel,
	s-vadapalli, danishanwar, srk
In-Reply-To: <20260610100256.1889111-4-a-garg7@ti.com>

On Wed, Jun 10, 2026 at 03:32:55PM +0530, Aksh Garg wrote:
> Add pci_epc_init_capabilities() in EPC core driver to initialize and
> setup the capabilities supported by the EPC driver. This calls
> pci_epc_doe_setup() to setup the DOE framework for an endpoint controller,
> which discovers the DOE capabilities (extended capability ID 0x2E), and
> registers each discovered DOE mailbox for all the functions in the
> endpoint controller.
> 
> Add pci_epc_deinit_capabilities() in EPC core driver for cleanup of the
> resources used by the capabilities of the EPC driver. This calls
> pci_ep_doe_destroy() to destroy all DOE mailboxes and free associated
> resources.
> 
> Co-developed-by: Siddharth Vadapalli <s-vadapalli@ti.com>
> Signed-off-by: Siddharth Vadapalli <s-vadapalli@ti.com>
> Signed-off-by: Aksh Garg <a-garg7@ti.com>
> ---
> 
> Changes from v4 to v5:
> - Addressed the review comments by Sashiko
> 
> Changes from v3 to v4:
> - Call DOE setup and destroy APIs directly within the EPC core, instead of
>   relying on the EPC drivers to call them individually. EPC drivers do not
>   need to explicitly handle DOE setup, rather the EPC core manages this
>   transparently. (Suggested by Manivannan Sadhasivam).
> - Removed pci_epc_doe_destroy() API, which was just calling pci_ep_doe_destroy().
>   Instead, called pci_ep_doe_destroy() directly during cleanup.
> - Called pci_ep_doe_init() before the "!epc->ops->find_ext_capability" check,
>   because if doe-capable=1 and find_ext_capability() op is undefined, this
>   would not initialize the epc->doe_mbs xarray. However during cleanup, the
>   check "!epc->ops->find_ext_capability" would be unnecessary, and it will
>   try to destroy the epc->doe_mbs xarray even when it was not initialized.
> 
> Changes from v2 to v3:
> - Rebased on 7.1-rc1.
> 
> Changes since v1:
> - New patch added to v2 (not present in v1)
> 
> v4: https://lore.kernel.org/all/20260522052434.802034-4-a-garg7@ti.com/
> v3: https://lore.kernel.org/all/20260427051725.223704-4-a-garg7@ti.com/
> v2: https://lore.kernel.org/all/20260401073022.215805-4-a-garg7@ti.com/
> 
> This patch is introduced based on the feedback provided by Manivannan
> Sadhasivam at [1].
> 
> [1]: https://lore.kernel.org/all/p57x6jleaim5w7t2k3v7tioujnaxuovfpj5euop5ogefvw23se@y5fw3che5p5d/
> 
> 
>  drivers/pci/endpoint/pci-epc-core.c | 104 ++++++++++++++++++++++++++++
>  include/linux/pci-epc.h             |   6 ++
>  2 files changed, 110 insertions(+)
> 
> diff --git a/drivers/pci/endpoint/pci-epc-core.c b/drivers/pci/endpoint/pci-epc-core.c
> index 6c3c58185fc5..e48f40eeed29 100644
> --- a/drivers/pci/endpoint/pci-epc-core.c
> +++ b/drivers/pci/endpoint/pci-epc-core.c
> @@ -14,6 +14,8 @@
>  #include <linux/pci-epf.h>
>  #include <linux/pci-ep-cfs.h>
>  
> +#include "../pci.h"
> +
>  static const struct class pci_epc_class = {
>  	.name = "pci_epc",
>  };
> @@ -842,6 +844,81 @@ void pci_epc_linkdown(struct pci_epc *epc)
>  }
>  EXPORT_SYMBOL_GPL(pci_epc_linkdown);
>  
> +/**
> + * pci_epc_doe_setup() - Discover and setup DOE mailboxes for all functions
> + * @epc: the EPC device on which DOE mailboxes has to be setup
> + *
> + * Discover DOE (Data Object Exchange) capabilities for all physical functions
> + * in the endpoint controller and register DOE mailboxes.
> + *
> + * Returns: 0 on success, -errno on failure
> + */
> +static int pci_epc_doe_setup(struct pci_epc *epc)
> +{
> +	u8 func_no, vfunc_no = 0;
> +	u16 cap_offset;
> +	int ret;
> +
> +	if (!epc->ops || !epc->ops->find_ext_capability)
> +		return -EINVAL;

I don't see anything that sets pci_epc_ops.find_ext_capability in this
series, so this currently looks unused and untestable, and therefore
likely not mergeable as-is.  What's the plan for users of this?

> +	/* Discover DOE capabilities for all functions */
> +	for (func_no = 0; func_no < epc->max_functions; func_no++) {
> +		mutex_lock(&epc->lock);
> +		cap_offset = epc->ops->find_ext_capability(epc, func_no,
> +							   vfunc_no, 0,
> +							   PCI_EXT_CAP_ID_DOE);
> +		mutex_unlock(&epc->lock);
> +
> +		while (cap_offset) {
> +			/* Register this DOE mailbox */
> +			ret = pci_ep_doe_add_mailbox(epc, func_no, cap_offset);
> +			if (ret) {
> +				dev_warn(&epc->dev,
> +					 "[pf%d:offset %x] failed to add DOE mailbox\n",
> +					 func_no, cap_offset);
> +			}
> +
> +			mutex_lock(&epc->lock);
> +			cap_offset = epc->ops->find_ext_capability(epc, func_no,
> +								   vfunc_no, cap_offset,
> +								   PCI_EXT_CAP_ID_DOE);
> +			mutex_unlock(&epc->lock);
> +		}
> +	}
> +
> +	dev_dbg(&epc->dev, "DOE mailboxes setup complete\n");
> +	return 0;
> +}
> +
> +/**
> + * pci_epc_init_capabilities() - Initialize EPC capabilities
> + * @epc: the EPC device whose capabilities need to be initialized
> + *
> + * Invoke to initialize capabilities supported by the EPC device.

s/Invoke to initialize/Initialize/

> + */
> +static void pci_epc_init_capabilities(struct pci_epc *epc)
> +{
> +	const struct pci_epc_features *epc_features;
> +	int ret;
> +
> +	epc_features = pci_epc_get_features(epc, 0, 0);
> +	if (!epc_features)
> +		return;
> +
> +	if (IS_ENABLED(CONFIG_PCI_ENDPOINT_DOE) && epc_features->doe_capable) {
> +		ret = pci_ep_doe_init(epc);
> +		if (ret) {
> +			dev_warn(&epc->dev, "DOE initialization failed: %d\n", ret);
> +			return;
> +		}
> +
> +		ret = pci_epc_doe_setup(epc);
> +		if (ret)
> +			dev_warn(&epc->dev, "DOE setup failed: %d\n", ret);
> +	}
> +}
> +
>  /**
>   * pci_epc_init_notify() - Notify the EPF device that EPC device initialization
>   *                         is completed.
> @@ -857,6 +934,9 @@ void pci_epc_init_notify(struct pci_epc *epc)
>  	if (IS_ERR_OR_NULL(epc))
>  		return;
>  
> +	if (!epc->init_complete)
> +		pci_epc_init_capabilities(epc);
> +
>  	mutex_lock(&epc->list_lock);
>  	list_for_each_entry(epf, &epc->pci_epf, list) {
>  		mutex_lock(&epf->lock);
> @@ -890,6 +970,27 @@ void pci_epc_notify_pending_init(struct pci_epc *epc, struct pci_epf *epf)
>  }
>  EXPORT_SYMBOL_GPL(pci_epc_notify_pending_init);
>  
> +/**
> + * pci_epc_deinit_capabilities() - Cleanup EPC capabilities
> + * @epc: the EPC device whose capabilities need to be cleaned up
> + *
> + * Invoke to cleanup capabilities supported by the EPC device,
> + * and free the associated resources.

s/Invoke to cleanup/Clean up/

> + */
> +static void pci_epc_deinit_capabilities(struct pci_epc *epc)
> +{
> +	const struct pci_epc_features *epc_features;
> +
> +	epc_features = pci_epc_get_features(epc, 0, 0);
> +	if (!epc_features)
> +		return;
> +
> +	if (IS_ENABLED(CONFIG_PCI_ENDPOINT_DOE) && epc_features->doe_capable) {
> +		pci_ep_doe_destroy(epc);
> +		dev_dbg(&epc->dev, "DOE mailboxes destroyed\n");
> +	}
> +}
> +
>  /**
>   * pci_epc_deinit_notify() - Notify the EPF device about EPC deinitialization
>   * @epc: the EPC device whose deinitialization is completed
> @@ -903,6 +1004,9 @@ void pci_epc_deinit_notify(struct pci_epc *epc)
>  	if (IS_ERR_OR_NULL(epc))
>  		return;
>  
> +	if (epc->init_complete)
> +		pci_epc_deinit_capabilities(epc);
> +
>  	mutex_lock(&epc->list_lock);
>  	list_for_each_entry(epf, &epc->pci_epf, list) {
>  		mutex_lock(&epf->lock);
> diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
> index dd26294c8175..11474e337db3 100644
> --- a/include/linux/pci-epc.h
> +++ b/include/linux/pci-epc.h
> @@ -84,6 +84,8 @@ struct pci_epc_map {
>   * @start: ops to start the PCI link
>   * @stop: ops to stop the PCI link
>   * @get_features: ops to get the features supported by the EPC
> + * @find_ext_capability: ops to find extended capability offset for a function
> + *			 in endpoint controller
>   * @owner: the module owner containing the ops
>   */
>  struct pci_epc_ops {
> @@ -115,6 +117,8 @@ struct pci_epc_ops {
>  	void	(*stop)(struct pci_epc *epc);
>  	const struct pci_epc_features* (*get_features)(struct pci_epc *epc,
>  						       u8 func_no, u8 vfunc_no);
> +	u16	(*find_ext_capability)(struct pci_epc *epc, u8 func_no,
> +				       u8 vfunc_no, u16 start, u8 cap);
>  	struct module *owner;
>  };
>  
> @@ -270,6 +274,7 @@ struct pci_epc_bar_desc {
>   * @msi_capable: indicate if the endpoint function has MSI capability
>   * @msix_capable: indicate if the endpoint function has MSI-X capability
>   * @intx_capable: indicate if the endpoint can raise INTx interrupts
> + * @doe_capable: indicate if the endpoint function has DOE capability
>   * @bar: array specifying the hardware description for each BAR
>   * @align: alignment size required for BAR buffer allocation
>   */
> @@ -280,6 +285,7 @@ struct pci_epc_features {
>  	unsigned int	msi_capable : 1;
>  	unsigned int	msix_capable : 1;
>  	unsigned int	intx_capable : 1;
> +	unsigned int	doe_capable : 1;
>  	struct	pci_epc_bar_desc bar[PCI_STD_NUM_BARS];
>  	size_t	align;
>  };
> -- 
> 2.34.1
> 

^ permalink raw reply

* Re: [PATCH v3 2/4] mm/zswap: Implement proactive writeback
From: Shakeel Butt @ 2026-06-11 19:12 UTC (permalink / raw)
  To: Yosry Ahmed
  Cc: YoungJun Park, Hao Jia, Johannes Weiner, mhocko, tj, mkoutny,
	roman.gushchin, Nhat Pham, akpm, chengming.zhou, muchun.song,
	cgroups, linux-mm, linux-kernel, linux-doc, Hao Jia, chrisl,
	kasong, baoquan.he
In-Reply-To: <airzE7jD9UtyR17J@google.com>

On Thu, Jun 11, 2026 at 05:45:04PM +0000, Yosry Ahmed wrote:
> On Tue, Jun 09, 2026 at 01:19:13PM +0900, YoungJun Park wrote:
> > On Mon, Jun 08, 2026 at 03:27:07PM -0700, Yosry Ahmed wrote:
> > 
> > +Chris +Kairui +Baoquan
> > 
> > Hello
> > 
> > Thanks for inviting me to the discussion, Shakeel.
> > 
> > > > > > Youngjun is working on swap tiers. At the moment he is more interested in
> > > > > > allowing a specific swap device to a memcg or not. I can imagine in future there
> > > > > > will be use-cases where there will be a need to demote data on higher tier swap
> > > > > > to lower tier swap. What would be the appropriate interface?
> > 
> > Speaking of my work on swap tiers, I recently submitted a patch and am
> > currently considering memcg integration:
> > https://lore.kernel.org/linux-mm/20260527062247.3440692-1-youngjun.park@lge.com/
> > 
> > The future use-cases imagined above seem to align with this
> > direction. (BTW, I am currently waiting for reviews/feedback from the memcg
> > folks on this patch. Any reviews would be highly appreciated!)
> > 
> > We could potentially assign a target tier
> > for writeback within the existing memory.zswap.writeback interface. 
> > 
> > For instance, '0' could mean disabled, while non-zero values could represent
> > specific tiers, which would maintain backward compatibility with the current
> > version. Alternatively, if zswap is treated as the default top tier, 
> > the `memory.swap.tiers` interface could potentially replace `memory.zswap.writeback`.
> > 
> > > Furthermore, this could be expanded so that each swap tier can demote data,
> > > i.e. user-triggered demotion between swap tiers.
> > 
> > Based on the current patch's ideas combined with my swap tiers concept:
> > 
> > Assuming a hierarchy like:
> > zswap -> tier1 (SSD swap) -> tier2 (HDD swap) -> tier3 (Network swap)
> > 
> > We could configure the active tiers via a setting like `memory.swap.tiers`
> > (tier2 enabled, tier3 enabled).
> > 
> > For example, the concept of `echo "100M zswap_writeback_only > memory.reclaim"`
> > could be extended. A user could run `echo "100M tier2 > memory.reclaim"`
> > to explicitly trigger demotion from tier2 to tier3.
> > (BTW, if we combine these features, my personal preference for the keyword
> > format would be `<size> <demote_prefix><tier_name>`. I think it would be
> > better to explicitly indicate that it is a swap demotion by using a specific
> > prefix followed by the tier name. 
> > > Making the demote prefix a separate key is also possible.)
> 
> I am not sure if proactive demotion between swap tiers would be driven
> by memory.reclaim, I am guessing a new interface might be more suitable.
> But yes, you are right that it's very possible that
> 'zswap_writeback_only' with memory.reclaim will become obsolete once
> swap tiering matures and starts supporting things like proactive
> demotion.
> 
> Part of me wants to wait until the swap tiering interfaces are figured
> out so that we don't end up with redundant interfaces, but I also don't
> want to hold Hao's work since it doesn't directly depend on swap
> tiering.
> 
> Shakeel, how do you want to handle this? I think there's a few options:
> 
> 1. Add zswap_writeback_only now, and when we have swap tiering demotion
> it becomes a redundant interface, like memory.zswap.writeback -- or
> maybe we try to deprecate both of them at that point. It's difficult to
> remove interfaces tho, but maybe easier to stop supporting
> zswap_writeback_only.
> 
> 2. Add zswap_writeback_only behind an experimental config option, to
> unblock development but have a line of sight to dropping support once we
> have a swap tiering interface.
> 
> 3. Wait until we figure out the swap tiering interfaces and then add
> the proactive zswap writeback as part of it.
> 
> WDYT?

Is Hao's work needed for some followup work/development? The earliest Hao's
work can land is 7.3, so if we aim to figure out swap tiering interfaces in the
next couple of weeks then option 3 is the way to go. If swap tiers take more
time then we can discuss other options as well.

However, I would need help from the zswap folks (Yosry & Nhat) in figuring out
the swap tiers interfaces. Zswap is the current top tier swap usage in the real
world. I want zswap users to easily (and hopefully transparently) migrate to
swap tiers.

^ permalink raw reply

* Re: [PATCH v5 2/4] PCI: endpoint: Add DOE mailbox support for endpoint functions
From: Bjorn Helgaas @ 2026-06-11 19:11 UTC (permalink / raw)
  To: Aksh Garg
  Cc: linux-pci, linux-doc, mani, kwilczynski, bhelgaas, corbet, kishon,
	skhan, lukas, cassel, alistair, linux-arm-kernel, linux-kernel,
	s-vadapalli, danishanwar, srk
In-Reply-To: <20260610100256.1889111-3-a-garg7@ti.com>

On Wed, Jun 10, 2026 at 03:32:54PM +0530, Aksh Garg wrote:
> DOE (Data Object Exchange) is a standard PCIe extended capability
> feature introduced in the Data Object Exchange (DOE) ECN for
> PCIe r5.0. It provides a communication mechanism primarily used for
> implementing PCIe security features such as device authentication, and
> secure link establishment. Think of DOE as a sophisticated mailbox
> system built into PCIe. The root complex can send structured requests
> to the endpoint device through DOE mailboxes, and the endpoint device
> responds with appropriate data.

Please cite a spec revision and section instead of the ECN because
it's easier to find the spec than the ECN.  E.g., "PCIe r7.0, sec
6.30" or similar.

> Add the DOE support for PCIe endpoint devices, enabling endpoint
> functions to process the DOE requests from the host. The implementation
> provides framework APIs for EPC core driver and controller drivers to
> register mailboxes, and request processing with workqueues ensuring
> sequential handling per mailbox, and parallel handling across mailboxes.
> The Discovery protocol is handled internally by the DOE core.
> 
> This implementation complements the existing DOE implementation for
> root complex in drivers/pci/doe.c.
> 
> Co-developed-by: Siddharth Vadapalli <s-vadapalli@ti.com>
> Signed-off-by: Siddharth Vadapalli <s-vadapalli@ti.com>
> Signed-off-by: Aksh Garg <a-garg7@ti.com>
> ---
> 
> Changes from v4 to v5:
> - Addressed the review comments by Sashiko
> - Added refcount per DOE Mailbox to fix Use-After-Free bug
> - Change in the Abort Sequence:
>   * Instead of waiting on flush_workqueue() to clear the CANCEL flag,
>     return immediately after setting the CANCEL flag. The CANCEL flag
>     gets cleared in signal_task_complete(), allowing the mailbox to
>     accept new requests
>   * Abort sequence handling in various scenarios is updated and explained
>     in the documentation at PATCH 4/4
> 
> Changes from v3 to v4:
> - Used 'Returns' instead of 'RETURNS' in the function docstrings to
>   comply with kernel-doc format, as suggested by Manivannan Sadhasivam.
> - In pci_ep_doe_process_request(), changed the type of request buffer
>   from "const void *" to "void *", as the ownership is transferred to
>   DOE-EP framework, which is responsible to free the buffer.
> - Added "struct pci_epc *epc" to typedef "pci_ep_doe_complete_t", to be
>   used by the EPC driver.
> 
> Changes from v2 to v3:
> - Rebased on 7.1-rc1.
> 
> Changes since v1:
> - Moved the DOE-EP core file to drivers/pci/endpoint/pci-ep-doe.c, and
>   corresponding Kconfig and Makefile to match the existing naming scheme,
>   as suggested by Niklas Cassel.
> - Renamed the config from PCI_DOE_EP to PCI_ENDPOINT_DOE
> - Moved the function declarations that need not be visible outside the
>   PCI core to drivers/pci/pci.h instead to include/linux/pci-doe.h as
>   suggested by Lukas Wunner
> - Converted from synchronous to asynchronous request processing:
>   * Removed wait_for_completion() from pci_ep_doe_process_request()
>   * Function returns immediately after queuing to workqueue, hence
>     removed private data for completion in the task structure
>   * Added completion callback as an additional argument to
>     pci_ep_doe_process_request(), which takes the response and status
>     parameters as arguments (along with other required arguments), hence
>     removed task_status in the task structure
>   * Created a typedef pci_ep_doe_complete_t for completion callback
>   * Removed the pci_ep_doe_task_complete() function, as it would not be
>     required anymore with these changes
>   * Moved from INIT_WORK_ONSTACK() to INIT_WORK(), to initialize the work
>     on heap instead of stack
>   * signal_task_complete() now invokes the completion callback, once the
>     protocol handler completes its task
> - Changed from dynamic xarray-based protocol registration to static array:
>   * Removed the register/unregister protocol APIs
>   * Replaced the dynamic xarray with static array of struct pci_doe_protocol
>   * Added discovery protocol to static array, instead of treating it specially,
>     hence removed the special handling for Discovery protocol in
>     doe_ep_task_work()
>   * Updated pci_ep_doe_handle_discovery() and pci_ep_doe_find_protocol()
>     accordingly.
> - Memory Management:
>   * DOE core frees request buffer in signal_task_complete()
>     or during error handling
>   * pci_ep_doe_process_request() defines response_pl and response_pl_sz
>     as NULL and 0 respectively, whose pointer is passed to the protocol
>     handler, hence removed the arguments void **response, size_t *response_sz
>     to this function.
> - Task structure refactoring:
>   * Response buffer: void **response_pl to void *response_pl
>   * Response size: size_t *response_pl_sz to size_t response_pl_sz
>   * Changed the completion callback to type pci_ep_doe_complete_t
>   * Removed void *private and int task_status
> - Updated documentation comments of the functions according to the changes
> 
> v4: https://lore.kernel.org/all/20260522052434.802034-3-a-garg7@ti.com/
> v3: https://lore.kernel.org/all/20260427051725.223704-3-a-garg7@ti.com/
> v2: https://lore.kernel.org/all/20260401073022.215805-3-a-garg7@ti.com/
> v1: https://lore.kernel.org/all/20260213123603.420941-4-a-garg7@ti.com/
> 
>  drivers/pci/endpoint/Kconfig      |  14 +
>  drivers/pci/endpoint/Makefile     |   1 +
>  drivers/pci/endpoint/pci-ep-doe.c | 594 ++++++++++++++++++++++++++++++
>  drivers/pci/pci.h                 |  39 ++
>  include/linux/pci-doe.h           |   5 +
>  include/linux/pci-epc.h           |   3 +
>  6 files changed, 656 insertions(+)
>  create mode 100644 drivers/pci/endpoint/pci-ep-doe.c
> 
> diff --git a/drivers/pci/endpoint/Kconfig b/drivers/pci/endpoint/Kconfig
> index 8dad291be8b8..15ae16aaa58f 100644
> --- a/drivers/pci/endpoint/Kconfig
> +++ b/drivers/pci/endpoint/Kconfig
> @@ -36,6 +36,20 @@ config PCI_ENDPOINT_MSI_DOORBELL
>  	  doorbell. The RC can trigger doorbell in EP by writing data to a
>  	  dedicated BAR, which the EP maps to the controller's message address.
>  
> +config PCI_ENDPOINT_DOE
> +	bool "PCI Endpoint Data Object Exchange (DOE) support"
> +	depends on PCI_ENDPOINT
> +	help
> +	  This enables support for Data Object Exchange (DOE) protocol
> +	  on PCI Endpoint controllers. It provides a communication
> +	  mechanism through mailboxes, primarily used for PCIe security
> +	  features.
> +
> +	  Say Y here if you want be able to communicate using PCIe DOE
> +	  mailboxes.
> +
> +	  If unsure, say N.
> +
>  source "drivers/pci/endpoint/functions/Kconfig"
>  
>  endmenu
> diff --git a/drivers/pci/endpoint/Makefile b/drivers/pci/endpoint/Makefile
> index b4869d52053a..1fa176b6792b 100644
> --- a/drivers/pci/endpoint/Makefile
> +++ b/drivers/pci/endpoint/Makefile
> @@ -7,3 +7,4 @@ obj-$(CONFIG_PCI_ENDPOINT_CONFIGFS)	+= pci-ep-cfs.o
>  obj-$(CONFIG_PCI_ENDPOINT)		+= pci-epc-core.o pci-epf-core.o\
>  					   pci-epc-mem.o functions/
>  obj-$(CONFIG_PCI_ENDPOINT_MSI_DOORBELL)	+= pci-ep-msi.o
> +obj-$(CONFIG_PCI_ENDPOINT_DOE)		+= pci-ep-doe.o
> diff --git a/drivers/pci/endpoint/pci-ep-doe.c b/drivers/pci/endpoint/pci-ep-doe.c
> new file mode 100644
> index 000000000000..ea6a152461bb
> --- /dev/null
> +++ b/drivers/pci/endpoint/pci-ep-doe.c
> @@ -0,0 +1,594 @@
> +// SPDX-License-Identifier: GPL-2.0-only OR MIT
> +/*
> + * Data Object Exchange for PCIe Endpoint
> + *	PCIe r7.0, sec 6.30 DOE
> + *
> + * Copyright (C) 2026 Texas Instruments Incorporated - https://www.ti.com
> + *	Aksh Garg <a-garg7@ti.com>
> + *	Siddharth Vadapalli <s-vadapalli@ti.com>
> + */
> +
> +#define dev_fmt(fmt) "DOE EP: " fmt
> +
> +#include <linux/bitfield.h>
> +#include <linux/device.h>
> +#include <linux/pci.h>
> +#include <linux/pci-epc.h>
> +#include <linux/pci-doe.h>
> +#include <linux/refcount.h>
> +#include <linux/slab.h>
> +#include <linux/workqueue.h>
> +#include <linux/xarray.h>
> +
> +#include "../pci.h"
> +
> +/* Forward declaration of discovery protocol handler */
> +static int pci_ep_doe_handle_discovery(const void *request, size_t request_sz,
> +				       void **response, size_t *response_sz);
> +
> +/**
> + * struct pci_doe_protocol - DOE protocol handler entry
> + * @vid: Vendor ID
> + * @type: Protocol type
> + * @handler: Handler function pointer
> + */
> +struct pci_doe_protocol {
> +	u16 vid;
> +	u8 type;
> +	pci_doe_protocol_handler_t handler;
> +};
> +
> +/**
> + * struct pci_ep_doe_mb - State for a single DOE mailbox on EP
> + *
> + * This state is used to manage a single DOE mailbox capability on the
> + * endpoint side.
> + *
> + * @epc: PCI endpoint controller this mailbox belongs to
> + * @func_no: Physical function number of the function this mailbox belongs to
> + * @cap_offset: Capability offset
> + * @work_queue: Queue of work items
> + * @flags: Bit array of PCI_DOE_FLAG_* flags
> + * @refs: Refcount to manage mailbox lifetime and ensure safe cleanup
> + */
> +struct pci_ep_doe_mb {
> +	struct pci_epc *epc;
> +	u8 func_no;
> +	u16 cap_offset;
> +	struct workqueue_struct *work_queue;
> +	unsigned long flags;
> +	refcount_t refs;
> +};
> +
> +/**
> + * struct pci_ep_doe_task - Represents a single DOE request/response task
> + *
> + * @feat: DOE feature (vendor ID and type)
> + * @request_pl: Request payload
> + * @request_pl_sz: Size of request payload in bytes
> + * @response_pl: Response buffer
> + * @response_pl_sz: Size of response buffer in bytes
> + * @complete: Completion callback
> + * @work: Work structure for workqueue
> + * @doe_mb: DOE mailbox handling this task
> + */
> +struct pci_ep_doe_task {
> +	struct pci_doe_feature feat;
> +	const void *request_pl;
> +	size_t request_pl_sz;
> +	void *response_pl;
> +	size_t response_pl_sz;
> +	pci_ep_doe_complete_t complete;
> +
> +	/* Initialized by pci_ep_doe_submit_task() */
> +	struct work_struct work;
> +	struct pci_ep_doe_mb *doe_mb;
> +};
> +
> +/*
> + * Global registry of protocol handlers.
> + * When a new DOE protocol, library is added, add an entry to this array.
> + */
> +static const struct pci_doe_protocol pci_doe_protocols[] = {
> +	{
> +		.vid = PCI_VENDOR_ID_PCI_SIG,
> +		.type = PCI_DOE_FEATURE_DISCOVERY,
> +		.handler = pci_ep_doe_handle_discovery,
> +	},
> +};
> +
> +/*
> + * Combines function number and capability offset into a unique lookup key
> + * for storing/retrieving DOE mailboxes in an xarray.

s/Combines/Combine/

> + */
> +#define PCI_DOE_MB_KEY(func, offset) \
> +	(((unsigned long)(func) << 16) | (offset))
> +#define PCI_DOE_PROTOCOL_COUNT        ARRAY_SIZE(pci_doe_protocols)
> +
> +/**
> + * pci_ep_doe_init() - Initialize the DOE framework for a controller in EP mode
> + * @epc: PCI endpoint controller
> + *
> + * Initialize the DOE framework data structures. This only initializes
> + * the xarray that will hold the mailboxes.
> + *
> + * Returns: 0 on success, -errno on failure

s/Returns:/Return:/ (throughout)

Mani suggested "Returns" (from v3 to v4 above), so that's OK too.
https://origin.kernel.org/doc/html/latest/doc-guide/kernel-doc.html
includes four "Return:" examples and one "Returns:" example, and
"Return" fits better in the preferred imperative mood, so I have a
slight preference for that.

> + */
> +int pci_ep_doe_init(struct pci_epc *epc)
> +{
> +	if (!epc)
> +		return -EINVAL;

I doubt this is useful.  Obviously a bug in the caller and I'd rather
take the NULL pointer dereference, which will definitely be noticed,
than assume the buggy caller checks for failure.  The function might
as well be void then, same as pci_doe_init().

> +	xa_init(&epc->doe_mbs);
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(pci_ep_doe_init);
> +
> +/**
> + * pci_ep_doe_add_mailbox() - Add a DOE mailbox for a physical function
> + * @epc: PCI endpoint controller
> + * @func_no: Physical function number
> + * @cap_offset: Offset of the DOE capability
> + *
> + * Create and register a DOE mailbox for the specified physical function
> + * and capability offset.
> + *
> + * EPC core driver calls this for each DOE capability discovered in the config
> + * space of each endpoint function if DOE support is available for the EPC.
> + *
> + * Returns: 0 on success, -errno on failure
> + */
> +int pci_ep_doe_add_mailbox(struct pci_epc *epc, u8 func_no, u16 cap_offset)
> +{
> +	struct pci_ep_doe_mb *doe_mb;
> +	unsigned long key;
> +	int ret;
> +
> +	if (!epc)
> +		return -EINVAL;

Also doubtful about this.

> +	doe_mb = kzalloc_obj(*doe_mb, GFP_KERNEL);
> +	if (!doe_mb)
> +		return -ENOMEM;
> +
> +	doe_mb->epc = epc;
> +	doe_mb->func_no = func_no;
> +	doe_mb->cap_offset = cap_offset;
> +
> +	doe_mb->work_queue = alloc_ordered_workqueue("pci_ep_doe[%s:pf%d:offset%x]", 0,
> +						     dev_name(&epc->dev),
> +						     func_no, cap_offset);
> +	if (!doe_mb->work_queue) {
> +		dev_err(epc->dev.parent,
> +			"[pf%d:offset%x] failed to allocate work queue\n",
> +			func_no, cap_offset);
> +		ret = -ENOMEM;
> +		goto err_free;
> +	}
> +
> +	/* Add to xarray with composite key */
> +	key = PCI_DOE_MB_KEY(func_no, cap_offset);
> +	ret = xa_insert(&epc->doe_mbs, key, doe_mb, GFP_KERNEL);
> +	if (ret) {
> +		dev_err(epc->dev.parent,
> +			"[pf%d:offset%x] failed to insert mailbox: %d\n",
> +			func_no, cap_offset, ret);
> +		goto err_destroy;
> +	}
> +
> +	refcount_set(&doe_mb->refs, 1);
> +
> +	dev_dbg(epc->dev.parent,
> +		"DOE mailbox added: pf%d offset 0x%x\n",
> +		func_no, cap_offset);
> +
> +	return 0;
> +
> +err_destroy:
> +	destroy_workqueue(doe_mb->work_queue);
> +err_free:
> +	kfree(doe_mb);
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(pci_ep_doe_add_mailbox);
> +
> +/**
> + * pci_ep_doe_cancel_tasks() - Cancel all pending tasks
> + * @doe_mb: DOE mailbox
> + *
> + * Cancel all pending tasks in the mailbox. Mark the mailbox as dead
> + * so no new tasks can be submitted.
> + */
> +static void pci_ep_doe_cancel_tasks(struct pci_ep_doe_mb *doe_mb)
> +{
> +	if (!doe_mb)
> +		return;

Seems like this silently hides caller bugs without even the
possibility of checking for failure?

> +	/* Mark the mailbox as dead */
> +	set_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags);
> +
> +	/* Stop all pending work items from starting */
> +	set_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags);
> +}
> +
> +/**
> + * pci_ep_doe_get_mailbox() - Get DOE mailbox by function and offset
> + * @epc: PCI endpoint controller
> + * @func_no: Physical function number
> + * @cap_offset: Offset of the DOE capability
> + *
> + * Internal helper to look up a DOE mailbox by its function number and
> + * capability offset.
> + *
> + * Returns: Pointer to the mailbox or NULL if not found
> + */
> +static struct pci_ep_doe_mb *pci_ep_doe_get_mailbox(struct pci_epc *epc,
> +						    u8 func_no, u16 cap_offset)
> +{
> +	struct pci_ep_doe_mb *doe_mb;
> +	unsigned long key;
> +
> +	if (!epc)
> +		return NULL;

Same?

> +	key = PCI_DOE_MB_KEY(func_no, cap_offset);
> +
> +	xa_lock(&epc->doe_mbs);
> +
> +	doe_mb = xa_load(&epc->doe_mbs, key);
> +	if (doe_mb && !refcount_inc_not_zero(&doe_mb->refs))
> +		doe_mb = NULL;
> +
> +	xa_unlock(&epc->doe_mbs);
> +
> +	return doe_mb;
> +}
> +
> +/**
> + * pci_ep_doe_put_mailbox() - Release a reference to a DOE mailbox
> + * @doe_mb: The mailbox structure to release
> + *
> + * Drops the reference count. If this was the last active reference,
> + * the memory allocated for the mailbox structure is freed.

s/Drops/Drop/
s/If ... is freed/Free the ... if this was last active .../

> + */
> +static void pci_ep_doe_put_mailbox(struct pci_ep_doe_mb *doe_mb)
> +{
> +	if (!doe_mb)
> +		return;

Omit unless there's a reason for this.

> +	if (refcount_dec_and_test(&doe_mb->refs))
> +		kfree(doe_mb);
> +}
> +
> +/**
> + * pci_ep_doe_find_protocol() - Find protocol handler in static array
> + * @vendor: Vendor ID
> + * @type: Protocol type
> + *
> + * Look up a protocol handler in the static protocol array by matching vendor ID
> + * and protocol type.
> + *
> + * Returns: Handler function pointer or NULL if not found
> + */
> +static pci_doe_protocol_handler_t pci_ep_doe_find_protocol(u16 vendor, u8 type)
> +{
> +	int i;
> +
> +	/* Search static protocol array */
> +	for (i = 0; i < PCI_DOE_PROTOCOL_COUNT; i++) {
> +		if (pci_doe_protocols[i].vid == vendor &&
> +		    pci_doe_protocols[i].type == type)
> +			return pci_doe_protocols[i].handler;
> +	}
> +
> +	return NULL;
> +}
> +
> +/**
> + * pci_ep_doe_handle_discovery() - Handle Discovery protocol request
> + * @request: Request payload
> + * @request_sz: Request size
> + * @response: Output pointer for response buffer
> + * @response_sz: Output pointer for response size
> + *
> + * Handle the DOE Discovery protocol. The request contains an index specifying
> + * which protocol to query. This function creates a response containing the
> + * vendor ID and protocol type for the requested index, along with the next
> + * index value for further discovery:
> + *
> + * - next_index = 0: Signals this is the last protocol supported
> + * - next_index = n (non-zero): Signals more protocols available,
> + *   query index n next
> + *
> + * Returns: 0 on success, -errno on failure
> + */
> +static int pci_ep_doe_handle_discovery(const void *request, size_t request_sz,
> +				       void **response, size_t *response_sz)
> +{
> +	struct pci_doe_protocol protocol;
> +	u8 requested_index, next_index;
> +	u32 *response_pl;
> +	u32 request_pl;
> +	u16 vendor;
> +	u8 type;
> +
> +	if (request_sz != sizeof(u32))
> +		return -EINVAL;
> +
> +	request_pl = *(u32 *)request;
> +	requested_index = FIELD_GET(PCI_DOE_DATA_OBJECT_DISC_REQ_3_INDEX, request_pl);
> +
> +	if (requested_index >= PCI_DOE_PROTOCOL_COUNT) {
> +		/* No more protocols to report */
> +		vendor = 0;
> +		type = 0;
> +	} else {
> +		/* Get protocol from array at requested_index */
> +		protocol = pci_doe_protocols[requested_index];
> +		vendor = protocol.vid;
> +		type = protocol.type;
> +	}
> +
> +	/* Calculate next index */
> +	next_index = (requested_index + 1 < PCI_DOE_PROTOCOL_COUNT) ? requested_index + 1 : 0;
> +
> +	response_pl = kzalloc_obj(*response_pl, GFP_KERNEL);
> +	if (!response_pl)
> +		return -ENOMEM;
> +
> +	/* Build response */
> +	*response_pl = FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_RSP_3_VID, vendor) |
> +		       FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_RSP_3_TYPE, type) |
> +		       FIELD_PREP(PCI_DOE_DATA_OBJECT_DISC_RSP_3_NEXT_INDEX, next_index);
> +
> +	*response = response_pl;
> +	*response_sz = sizeof(*response_pl);
> +
> +	return 0;
> +}
> +
> +static void signal_task_complete(struct pci_ep_doe_task *task, int status)
> +{
> +	struct pci_ep_doe_mb *doe_mb = task->doe_mb;
> +
> +	task->complete(doe_mb->epc, doe_mb->func_no, doe_mb->cap_offset,
> +		       status, task->feat.vid, task->feat.type,
> +		       task->response_pl, task->response_pl_sz);
> +
> +	/* Clear the CANCEL flag for next DOE request */
> +	clear_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags);
> +
> +	kfree(task->request_pl);
> +	kfree(task);
> +
> +	/* Release the mailbox reference acquired during process_request */
> +	pci_ep_doe_put_mailbox(doe_mb);
> +}
> +
> +/**
> + * doe_ep_task_work() - Work function for processing DOE EP tasks
> + * @work: Work structure
> + *
> + * Process a DOE request by calling the appropriate protocol handler.
> + */
> +static void doe_ep_task_work(struct work_struct *work)
> +{
> +	struct pci_ep_doe_task *task = container_of(work, struct pci_ep_doe_task,
> +						    work);
> +	struct pci_ep_doe_mb *doe_mb = task->doe_mb;
> +	pci_doe_protocol_handler_t handler;
> +	int rc;
> +
> +	if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags)) {
> +		signal_task_complete(task, -EIO);
> +		return;
> +	}
> +
> +	/* Check if request was aborted */
> +	if (test_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags)) {
> +		signal_task_complete(task, -ECANCELED);
> +		return;
> +	}
> +
> +	/* Find protocol handler in the array */

Comment seems superfluous, given the function name.

> +	handler = pci_ep_doe_find_protocol(task->feat.vid, task->feat.type);
> +	if (!handler) {
> +		dev_warn_ratelimited(doe_mb->epc->dev.parent,
> +				     "[%d:%x] Unsupported protocol VID=%04x TYPE=%02x\n",
> +				     doe_mb->func_no, doe_mb->cap_offset,
> +				     task->feat.vid, task->feat.type);
> +		signal_task_complete(task, -EOPNOTSUPP);
> +		return;
> +	}
> +
> +	/* Call protocol handler */

Ditto.

> +	rc = handler(task->request_pl, task->request_pl_sz,
> +		     &task->response_pl, &task->response_pl_sz);
> +
> +	signal_task_complete(task, rc);
> +}
> +
> +/**
> + * pci_ep_doe_submit_task() - Submit a task to be processed
> + * @doe_mb: DOE mailbox
> + * @task: Task to submit
> + *
> + * Submit a DOE task to the workqueue for asynchronous processing.
> + *
> + * Returns: 0 on success, -errno on failure
> + */
> +static int pci_ep_doe_submit_task(struct pci_ep_doe_mb *doe_mb,
> +				  struct pci_ep_doe_task *task)
> +{
> +	if (test_bit(PCI_DOE_FLAG_DEAD, &doe_mb->flags))
> +		return -EIO;
> +
> +	task->doe_mb = doe_mb;
> +	INIT_WORK(&task->work, doe_ep_task_work);
> +	queue_work(doe_mb->work_queue, &task->work);
> +	return 0;
> +}
> +
> +/**
> + * pci_ep_doe_process_request() - Process DOE request on endpoint
> + * @epc: PCI endpoint controller
> + * @func_no: Physical function number
> + * @cap_offset: DOE capability offset
> + * @vendor: Vendor ID from request header
> + * @type: Protocol type from request header
> + * @request: Request payload in CPU-native format
> + * @request_sz: Size of request payload (bytes)
> + * @complete: Callback to invoke upon completion
> + *
> + * Asynchronously process a DOE request received on the endpoint. The request
> + * payload should not include the DOE header (vendor/type/length). Ownership
> + * of the request buffer is transferred to DOE EP core, which frees the buffer
> + * either on error or after the completion callback fires. The protocol handler
> + * will allocate the response buffer, which the caller (controller driver) must
> + * free after use.

I guess signal_task_complete() is where the request buffer is freed
after completion?  Maybe mention the function name directly instead of
just "completion callback", e.g., "by signal_task_complete(), the
completion callback"?

> + * This function returns immediately after queuing the request. The completion
> + * callback will be invoked asynchronously from workqueue context once the
> + * request is processed. The callback receives the function number and capability
> + * offset to identify the mailbox, along with a status code (0 on success, -errno
> + * on failure), and other required arguments.

Wrap to fit in 80 columns.

> + * As per DOE specification, a mailbox processes one request at a time.
> + * Therefore, this function will never be called concurrently for the same
> + * mailbox by different callers.
> + *
> + * The caller is responsible for the conversion of the received DOE request
> + * with le32_to_cpu() before calling this function.
> + * Similarly, it is responsible for converting the response payload with
> + * cpu_to_le32() before sending it back over the DOE mailbox.

Wrap to fill 78-80 columns (or add blank line if you want a new
paragraph, but this looks like all one paragraph).

> + * The caller is also responsible for ensuring that the request size
> + * is within the limits defined by PCI_DOE_MAX_LENGTH.
> + *
> + * Returns: 0 if the request was successfully queued, -errno on failure
> + */
> +int pci_ep_doe_process_request(struct pci_epc *epc, u8 func_no, u16 cap_offset,
> +			       u16 vendor, u8 type, void *request, size_t request_sz,
> +			       pci_ep_doe_complete_t complete)

Wrap to fit in 80 columns.

> +{
> +	struct pci_ep_doe_mb *doe_mb;
> +	struct pci_ep_doe_task *task;
> +	int rc;
> +
> +	doe_mb = pci_ep_doe_get_mailbox(epc, func_no, cap_offset);
> +	if (!doe_mb) {
> +		kfree(request);
> +		return -ENODEV;
> +	}
> +
> +	task = kzalloc_obj(*task, GFP_ATOMIC);
> +	if (!task) {
> +		kfree(request);
> +		pci_ep_doe_put_mailbox(doe_mb);
> +		return -ENOMEM;
> +	}
> +
> +	task->feat.vid = vendor;
> +	task->feat.type = type;
> +	task->request_pl = request;
> +	task->request_pl_sz = request_sz;
> +	task->response_pl = NULL;
> +	task->response_pl_sz = 0;
> +	task->complete = complete;
> +
> +	rc = pci_ep_doe_submit_task(doe_mb, task);
> +	if (rc) {
> +		kfree(request);
> +		kfree(task);
> +		pci_ep_doe_put_mailbox(doe_mb);
> +		return rc;

Good candidate for error path ladder, as you did in
pci_ep_doe_add_mailbox().

> +	}
> +
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(pci_ep_doe_process_request);
> +
> +/**
> + * pci_ep_doe_abort() - Abort DOE operations on a mailbox
> + * @epc: PCI endpoint controller
> + * @func_no: Physical function number
> + * @cap_offset: DOE capability offset
> + *
> + * Abort the queued or in-flight DOE operation for the specified mailbox.
> + * This function is called by the EP controller driver when the RC sets the
> + * ABORT bit in the DOE Control register, and the BUSY bit is set in the
> + * DOE Status Register.
> + *
> + * The function sets the CANCEL flag on the mailbox to prevent queued requests
> + * from starting, and returns immediately. The CANCEL flag gets cleared in
> + * signal_task_complete(), allowing the mailbox to accept new requests.

s/The function sets .../Set .../
s/and returns/and return/

> + *
> + * Returns: 0 on success, -errno on failure
> + */
> +int pci_ep_doe_abort(struct pci_epc *epc, u8 func_no, u16 cap_offset)
> +{
> +	struct pci_ep_doe_mb *doe_mb;
> +
> +	if (!epc)
> +		return -EINVAL;

?

> +	doe_mb = pci_ep_doe_get_mailbox(epc, func_no, cap_offset);
> +	if (!doe_mb)
> +		return -ENODEV;
> +
> +	/* Set CANCEL flag - worker will abort queued requests */
> +	set_bit(PCI_DOE_FLAG_CANCEL, &doe_mb->flags);
> +
> +	dev_dbg_ratelimited(epc->dev.parent,
> +			    "DOE mailbox abort initialized: PF%d offset 0x%x\n",
> +			    func_no, cap_offset);
> +
> +	pci_ep_doe_put_mailbox(doe_mb);
> +	return 0;
> +}
> +EXPORT_SYMBOL_GPL(pci_ep_doe_abort);
> +
> +/**
> + * pci_ep_doe_destroy_mb() - Destroy a single DOE mailbox
> + * @doe_mb: DOE mailbox to destroy
> + *
> + * Internal function to destroy a mailbox and free its resources.
> + */
> +static void pci_ep_doe_destroy_mb(struct pci_ep_doe_mb *doe_mb)
> +{
> +	if (!doe_mb)
> +		return;

?

> +	pci_ep_doe_cancel_tasks(doe_mb);
> +
> +	if (doe_mb->work_queue)
> +		destroy_workqueue(doe_mb->work_queue);
> +
> +	pci_ep_doe_put_mailbox(doe_mb);
> +}
> +
> +/**
> + * pci_ep_doe_destroy() - Destroy all DOE mailboxes
> + * @epc: PCI endpoint controller
> + *
> + * Destroy all DOE mailboxes and free associated resources.
> + *
> + * The EPC core driver calls this to free all DOE resources,
> + * if DOE support is available for the EPC.
> + */
> +void pci_ep_doe_destroy(struct pci_epc *epc)
> +{
> +	struct pci_ep_doe_mb *doe_mb;
> +	unsigned long index;
> +
> +	if (!epc)
> +		return;

?

> +	xa_for_each(&epc->doe_mbs, index, doe_mb) {
> +		xa_erase(&epc->doe_mbs, index);
> +		pci_ep_doe_destroy_mb(doe_mb);
> +	}
> +
> +	xa_destroy(&epc->doe_mbs);
> +}
> +EXPORT_SYMBOL_GPL(pci_ep_doe_destroy);
> diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
> index 5844deee2b5f..c4a0e25625e3 100644
> --- a/drivers/pci/pci.h
> +++ b/drivers/pci/pci.h
> @@ -692,6 +692,13 @@ struct pci_doe_feature {
>  	u8 type;
>  };
>  
> +struct pci_epc;
> +
> +typedef void (*pci_ep_doe_complete_t)(struct pci_epc *epc, u8 func_no,
> +				      u16 cap_offset, int status,
> +				      u16 vendor, u8 type,
> +				      void *response_pl, size_t response_pl_sz);
> +
>  #ifdef CONFIG_PCI_DOE
>  void pci_doe_init(struct pci_dev *pdev);
>  void pci_doe_destroy(struct pci_dev *pdev);
> @@ -702,6 +709,38 @@ static inline void pci_doe_destroy(struct pci_dev *pdev) { }
>  static inline void pci_doe_disconnected(struct pci_dev *pdev) { }
>  #endif
>  
> +#ifdef CONFIG_PCI_ENDPOINT_DOE
> +int pci_ep_doe_init(struct pci_epc *epc);
> +int pci_ep_doe_add_mailbox(struct pci_epc *epc, u8 func_no, u16 cap_offset);
> +int pci_ep_doe_process_request(struct pci_epc *epc, u8 func_no, u16 cap_offset,
> +			       u16 vendor, u8 type, void *request,
> +			       size_t request_sz, pci_ep_doe_complete_t complete);
> +int pci_ep_doe_abort(struct pci_epc *epc, u8 func_no, u16 cap_offset);
> +void pci_ep_doe_destroy(struct pci_epc *epc);
> +#else
> +static inline int pci_ep_doe_init(struct pci_epc *epc) { return -EOPNOTSUPP; }
> +static inline int pci_ep_doe_add_mailbox(struct pci_epc *epc, u8 func_no,
> +					 u16 cap_offset)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline int pci_ep_doe_process_request(struct pci_epc *epc, u8 func_no,
> +					     u16 cap_offset, u16 vendor, u8 type,
> +					     void *request, size_t request_sz,
> +					     pci_ep_doe_complete_t complete)
> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline int pci_ep_doe_abort(struct pci_epc *epc, u8 func_no, u16 cap_offset)

Wrap to fit in 80 columns.

> +{
> +	return -EOPNOTSUPP;
> +}
> +
> +static inline void pci_ep_doe_destroy(struct pci_epc *epc) { }
> +#endif
> +
>  #ifdef CONFIG_PCI_NPEM
>  void pci_npem_create(struct pci_dev *dev);
>  void pci_npem_remove(struct pci_dev *dev);
> diff --git a/include/linux/pci-doe.h b/include/linux/pci-doe.h
> index abb9b7ae8029..c46e42f3ce78 100644
> --- a/include/linux/pci-doe.h
> +++ b/include/linux/pci-doe.h
> @@ -22,6 +22,11 @@ struct pci_doe_mb;
>  /* Max data object length is 2^18 dwords */
>  #define PCI_DOE_MAX_LENGTH		(1 << 18)
>  
> +typedef int (*pci_doe_protocol_handler_t)(const void *request,
> +					  size_t request_sz,
> +					  void **response,
> +					  size_t *response_sz);
> +
>  struct pci_doe_mb *pci_find_doe_mailbox(struct pci_dev *pdev, u16 vendor,
>  					u8 type);
>  
> diff --git a/include/linux/pci-epc.h b/include/linux/pci-epc.h
> index 1eca1264815b..dd26294c8175 100644
> --- a/include/linux/pci-epc.h
> +++ b/include/linux/pci-epc.h
> @@ -182,6 +182,9 @@ struct pci_epc {
>  	unsigned long			function_num_map;
>  	int				domain_nr;
>  	bool				init_complete;
> +#ifdef CONFIG_PCI_ENDPOINT_DOE
> +	struct xarray			doe_mbs;
> +#endif
>  };
>  
>  /**
> -- 
> 2.34.1
> 

^ permalink raw reply

* Re: [RFC V2 1/3] lib/vsprintf: Add support for pgtable entries
From: Andy Shevchenko @ 2026-06-11 18:59 UTC (permalink / raw)
  To: Anshuman Khandual
  Cc: Usama Arif, linux-mm, Rasmus Villemoes, Sergey Senozhatsky,
	Petr Mladek, Steven Rostedt, Jonathan Corbet, Andrew Morton,
	David Hildenbrand, linux-kernel, linux-doc, David Hildenbrand,
	Lorenzo Stoakes, Andy Whitcroft
In-Reply-To: <d41a24c1-592e-495f-8adf-dd538b769904@arm.com>

On Thu, Jun 11, 2026 at 03:20:13PM +0530, Anshuman Khandual wrote:
> On 11/06/26 12:47 PM, Andy Shevchenko wrote:
> > On Thu, Jun 11, 2026 at 10:45:01AM +0530, Anshuman Khandual wrote:
> >> On 10/06/26 4:43 PM, Usama Arif wrote:
> >>> On Wed, 10 Jun 2026 05:35:43 +0100 Anshuman Khandual <anshuman.khandual@arm.com> wrote:

...

> >>>> +		static_assert(sizeof(pte_t) == 4 ||
> >>>> +			      sizeof(pte_t) == 8,
> >>>> +			      "pte_t size must be 4 or 8 bytes");
> > 
> > Besides occupying too many lines, why are these static asserts hidden here and
> > not declared in the global space? The broader question is why they are
> > needed at all.
> 
> Sure, will move these static_assert just above pxd_pointer()
> These asserts ensure
> 
> - Platforms have either 32 bit or 64 bit pgtable descriptors
> - special_hex_number() can be used to print such descriptors

I understand that. My question is: do we actually _need_ them? In other words,
when might this not be satisfied? Is there any real (non-theoretical) example?

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply

* Re: [RFC PATCH v1 00/13] exec: add spawn templates for repeated executable startup
From: John Ericson @ 2026-06-11 18:53 UTC (permalink / raw)
  To: Mateusz Guzik, Li Chen
  Cc: Andy Lutomirski, Christian Brauner, Kees Cook, Al Viro,
	linux-fsdevel, linux-api, LKML, linux-mm, linux-arch, linux-doc,
	linux-kselftest, x86, Arnd Bergmann, Thomas Gleixner, Ingo Molnar,
	Borislav Petkov, Dave Hansen, H. Peter Anvin, Jan Kara,
	Jonathan Corbet, Shuah Khan
In-Reply-To: <hd3i6pxxohsjesyid7nhuic6ppp6nyoxxpwa4mny6riqvpyqec@mylfprni2yaw>

On Wed, Jun 10, 2026, at 7:40 PM, Mateusz Guzik wrote:
> [...]
>
> As I tried to explain in my previous e-mail this approach does not cut
> it because of NUMA.
>
> Suppose you have a machine with 2 nodes. The parent-to-be is running
> on node 0 and the child is intended to exec something on node 1.
>
> When the parent-to-be allocates and populates stuff, it takes place with
> memory backed by node 0. If you allocate task_struct, the file table and
> other frequently used (and modified!) objs in this way, you are
> guaranteeing performance loss due to interconnect traffic to access it.
>
> Trying to add plumbing so that all allocations respect numa placement is
> probably too cumbersome.

Are we sure that last part is true?

Let's also assume when this stuff was initially implemented, we didn't
have it. If the basic thrust of this work is to replace functions that
previously only worked on the current thread with those that worked on
either arbitrary (not yet started) threads or the current thread, would
that not prepare us for slowly migrating the allocation choice to
reflect the node of the target task (new parameter) rather than the node
of the current task over time?

(This assumes the task is pre-placed on a node before it is actually run
there, and that pre-placement happens as early in the allocation process
as possible, so subsequent allocations can read off the
partially-initialized task's node.)

"Slowly migrating" is good here! It doesn't need to be the fastest thing
out of the gate, but if this new proper spawning API gets popular as I
think it would, and there is a clear path to optimizing it per the
above, then I am confident that over the years it will happen.

> The primary example for that is looking up the binary to exec in the
> first place.
>
> userspace likes to pass paths which don't exist, meaning checking for
> the binary before any hard work is a useful optimization. Suppose the
> binary to be executed is in a container bound with a taskset using
> node 1 and the content of the fs part of the container is currently
> fully uncached.
>
> When you perform the lookup on node 0, you are populating a bunch of
> metadata (inode, dentry) using memory from that domain. But the intended
> user will only execute on node 1, again resulting in a performance loss.
>
> In order to not do it you would need to convince VFS to allocate memory
> elsewhere.

One thing I don't get about this: isn't the cost the work of searching the
PATH for the directories where the executable *doesn't* exist? In the case
of something like a shell that is going to spawn a lot of processes, I would
think it is *good* to keep all the VFS state populated by that PATH crawl on
the shell's node, rather than on the child processes' nodes.

It is only the executable itself, the final step of the VFS crawl, that
should be loaded into the other NUMA nodes. Insofar as (unless I am
missing something) creating the process means finding the inode for the
executable but not loading those pages, aren't we OK here? Only when the
new process is actually scheduled and run must the ELF be paged into
memory, and then that will happen on the correct node.

> So I stand by my previous claim that ultimately a pristine child has to
> be created (like in this patch), but which also has to do the work on
> its own.

I have not been a kernel dev, so my apologies if I am missing things.
But my conclusion is that the FS and other resource access patterns of
*creating a process* vs *that process itself running* do not necessarily
coincide. What you are describing as definitely a problem might actually
be a *good thing*, if they are in fact quite different.

> Suppose there is no explicit placement requested anywhere. Even in that
> case there are legitimate workloads which will eventually be forced to
> exec stuff on another node. Even these have a better chance retaining
> full locality if the child process does all the work.
>
> Per my previous message I don't see a clean interface to do it.
> something quasi-posix_spawn is probably the least bad way out, it will
> also allow userspace to easily wrap the new thing with posix_spawn
> itself.
>
> Also note there is another issue with the fd-based approach: the fd will
> get inherited on fork and will hang out in the child afterwards unless
> explicitly closed. Suppose you have a multithreaded program which likes
> to both fork(+no exec) and fork+exec. With the fd-based approach you
> have no means of stopping another thread from grabbing your state thanks
> to unix defaulting to copying everything. There was an attempt to fix
> this aspect with O_CLOFORK, but this got rejected.

I would think we don't need to worry about clone/fork very much, right?
I think the premise of your emails, and just about everyone else's in
this thread too, is that we agree fork+exec is bad, and the problem of
unnecessarily sharing resources is inherent to fork. Furthermore, I
think we all agree that while `O_CLOEXEC` and `O_CLOFORK` may help, both
are unsatisfying solutions because they are opt-out not opt-in, and
global to the parent process / preexec state (respectively) rather than
local to the specific fork / exec in question.

pidfds encounter these problems no more than any other
file-descriptor-based UAPI, right? And I don't think it is good to blame
any such file-descriptor-based UAPI when fork/exec are at fault.

Maybe during the transition, when some things use fork and some things
use this new API, stuff will be awkward, but I would rather that just be
an incentive to complete the transition away from fork, not a reason to
second-guess the plan.

Once the transition is complete, and everyone is diligently assembling
their child processes from scratch as is proposed, `O_CLOEXEC` and
`O_CLOFORK` are both unneeded, and oversharing privileges will be much
less common simply because "lazy coding"/"minimal typing" will only
share what is needed --- anything else is more code/keystrokes!

> Whatever exactly happens, NUMA is a sad fact of computing and needs to
> be accounted for. The approach as proposed not only does not do it, but
> it actively hinders such deployments.

Despite everything I said, I want to be clear that I do agree that NUMA
performance should be accounted for. Even if the first version isn't as
great as it could be on that metric, there should be a clear plan for
how future work can conclusively address it.

Cheers,

John

^ permalink raw reply

* Re: [PATCH iproute2-next 7/7] devlink: add scope filter to resource show
From: David Ahern @ 2026-06-11 18:53 UTC (permalink / raw)
  To: Tariq Toukan, Stephen Hemminger, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, Andrew Lunn, David S. Miller
  Cc: Donald Hunter, Simon Horman, Jiri Pirko, Jonathan Corbet,
	Shuah Khan, Saeed Mahameed, Leon Romanovsky, Mark Bloch,
	Shuah Khan, Matthieu Baerts (NGI0), Chuck Lever, Or Har-Toov,
	Carolina Jubran, Moshe Shemesh, Shay Drori, Dragos Tatulea,
	Daniel Zahka, Shahar Shitrit, Jacob Keller, Cosmin Ratiu,
	Parav Pandit, Kees Cook, Adithya Jayachandran, Daniel Jurgens,
	netdev, linux-kernel, linux-doc, linux-rdma, linux-kselftest,
	Gal Pressman, Ido Schimmel, Jiri Pirko, Petr Machata
In-Reply-To: <20260609053953.487152-8-tariqt@nvidia.com>

On 6/8/26 11:39 PM, Tariq Toukan wrote:
> @@ -9010,13 +9029,29 @@ static int cmd_resource_show(struct dl *dl)
>  	uint16_t flags = NLM_F_REQUEST | NLM_F_ACK;
>  	struct nlmsghdr *nlh;
>  	struct resource_ctx resource_ctx = {};
> +	struct dl_opts *opts = &dl->opts;
>  	int err;
>  
> -	err = dl_argv_parse_with_selector(dl, &flags, DEVLINK_CMD_RESOURCE_DUMP,
> -					  DL_OPT_HANDLE | DL_OPT_HANDLEP,
> -					  0, 0, 0);
> -	if (err)
> -		return err;
> +	if (dl_argv_match(dl, "scope")) {
> +		const char *scopestr;
> +
> +		dl_arg_inc(dl);
> +		err = dl_argv_str(dl, &scopestr);
> +		if (err)
> +			return err;
> +		err = resource_scope_get(scopestr, &opts->resource_scope_mask);
> +		if (err)
> +			return err;
> +		opts->present |= DL_OPT_RESOURCE_SCOPE;

Comment from Claude that seems legit:

Issue found: In cmd_resource_show, the scope path sets opts->present |=
DL_OPT_RESOURCE_SCOPE without first clearing opts->present. In batch
mode, dl->opts is shared across commands, and the non-scope path
correctly resets opts->present via dl_argv_parse(). But the scope path
bypasses dl_argv_parse(), so stale bits (e.g. DL_OPT_HANDLE from a
previous dev show) remain. When dl_opts_put() runs, it writes the stale
DEVLINK_ATTR_BUS_NAME/DEV_NAME attributes into the dump request,
silently filtering to a single device instead of all devices. Fix: use =
instead of |=

Are you ok with the suggested resolution?


^ permalink raw reply

* Re: [PATCH v6 02/11] x86/virt/tdx: Allocate page bitmap for Dynamic PAMT
From: Vishal Annapurve @ 2026-06-11 18:47 UTC (permalink / raw)
  To: Rick Edgecombe
  Cc: bp, dave.hansen, hpa, kas, kvm, linux-coco, linux-doc,
	linux-kernel, mingo, nik.borisov, pbonzini, seanjc, tglx, x86,
	chao.gao, yan.y.zhao, kai.huang, Kirill A. Shutemov, Binbin Wu
In-Reply-To: <20260526023515.288829-3-rick.p.edgecombe@intel.com>

On Mon, May 25, 2026 at 7:35 PM Rick Edgecombe
<rick.p.edgecombe@intel.com> wrote:
>
> From: "Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>
>
> The TDX Physical Address Metadata Table (PAMT) holds data about the
> physical memory used by TDX, and must be allocated by the kernel during
> TDX module initialization.
>
> The exact size of the required PAMT memory is determined by the TDX module
> and may vary between TDX module versions. Currently it is approximately
> 0.4% of the system memory. This is a significant commitment, especially if
> it is not known upfront whether the machine will run any TDX guests.
>
> Each memory region that the TDX module might use needs three separate PAMT
> allocations. One for each supported page size (1GB, 2MB, 4KB). The
> TDX module supports a new feature designed to reduce PAMT overhead called
> Dynamic PAMT. At a high level, Dynamic PAMT still has the 1GB and 2MB
> levels allocated on TDX module initialization, but the 4KB level is
> allocated dynamically during runtime.
>
> However, in the details, Dynamic PAMT still needs some smaller per 4KB
> page scoped data (currently it is 1 bit per page). The TDX module exposes
> the number of bits as a separate piece of metadata than the 4KB static
> allocation for regular PAMT. Although the size is enumerated differently,
> it is handed to the TDX module in the same way the 4KB page size PAMT
> allocation is for regular, non-dynamic PAMT.
>
> Begin to implement Dynamic PAMT in the kernel by reading the bits-per-page
> needed for Dynamic PAMT. Calculate the size needed for the bitmap,
> and use it instead of the 4KB size determined for normal PAMT, in the case
> of Dynamic PAMT.
>
> Unlike the existing metadata reading code, this code is not generated by a
> script. So adjust the comment to be more generic. Also, start to adopt a
> more normal kernel code style without the ternary statements and
> if-conditional assignments that the auto-generated code has.
>
> Assisted-by: Sashiko:claude-opus-4-6
> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
> Co-developed-by: Rick Edgecombe <rick.p.edgecombe@intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>

Kirill's comment makes sense to me.

Reviewed-by: Vishal Annapurve <vannapurve@google.com>

^ permalink raw reply

* [PATCH] Documentation: arch: fix brackets
From: Manuel Ebner @ 2026-06-11 18:35 UTC (permalink / raw)
  To: Vineet Gupta, Jonathan Corbet, Shuah Khan, Krzysztof Kozlowski,
	Peter Griffin, Alim Akhtar, Catalin Marinas, Will Deacon,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Christophe Leroy, open list:SYNOPSYS ARC ARCHITECTURE,
	open list:DOCUMENTATION, open list,
	moderated list:ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES,
	open list:ARM/SAMSUNG S3C, S5P AND EXYNOS ARM ARCHITECTURES,
	open list:LINUX FOR POWERPC (32-BIT AND 64-BIT)
  Cc: Manuel Ebner

Add missing and remove needless parentheses, brackets and curly braces.

Signed-off-by: Manuel Ebner <manuelebner@mailbox.org>
---
 Documentation/arch/arc/arc.rst                 |  2 +-
 .../arm/samsung/clksrc-change-registers.awk    |  2 +-
 Documentation/arch/arm/vlocks.rst              |  4 ++--
 .../arch/arm64/memory-tagging-extension.rst    |  2 +-
 Documentation/arch/powerpc/vas-api.rst         |  2 +-
 Documentation/arch/sparc/oradax/dax-hv-api.txt | 18 +++++++++---------
 Documentation/arch/sparc/oradax/oracle-dax.rst |  3 ++-
 7 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/Documentation/arch/arc/arc.rst b/Documentation/arch/arc/arc.rst
index 6c4d978f3f4e..553851f43be7 100644
--- a/Documentation/arch/arc/arc.rst
+++ b/Documentation/arch/arc/arc.rst
@@ -36,7 +36,7 @@ Important note on ARC processors configurability
 
 ARC processors are highly configurable and several configurable options
 are supported in Linux. Some options are transparent to software
-(i.e cache geometries, some can be detected at runtime and configured
+(i.e cache geometries), some can be detected at runtime and configured
 and used accordingly, while some need to be explicitly selected or configured
 in the kernel's configuration utility (AKA "make menuconfig").
 
diff --git a/Documentation/arch/arm/samsung/clksrc-change-registers.awk b/Documentation/arch/arm/samsung/clksrc-change-registers.awk
index 7be1b8aa7cd9..48464397088c 100755
--- a/Documentation/arch/arm/samsung/clksrc-change-registers.awk
+++ b/Documentation/arch/arm/samsung/clksrc-change-registers.awk
@@ -163,4 +163,4 @@ BEGIN {
     }
 }
 
-// && ! /clksrc_clk.*=.*{/ { print $0 }
+// && ! /clksrc_clk.*=.*{/ { print $0 }}
diff --git a/Documentation/arch/arm/vlocks.rst b/Documentation/arch/arm/vlocks.rst
index 737aa8661a21..b0ac33263086 100644
--- a/Documentation/arch/arm/vlocks.rst
+++ b/Documentation/arch/arm/vlocks.rst
@@ -102,10 +102,10 @@ Features and limitations
 	if (I_won) {
 		/* we won the town election, let's go for the state */
 		my_state = states[(this_cpu >> 8) & 0xf];
-		I_won = vlock_lock(my_state, this_cpu & 0xf));
+		I_won = vlock_lock(my_state, this_cpu & 0xf);
 		if (I_won) {
 			/* and so on */
-			I_won = vlock_lock(the_whole_country, this_cpu & 0xf];
+			I_won = vlock_lock(the_whole_country, this_cpu & 0xf);
 			if (I_won) {
 				/* ... */
 			}
diff --git a/Documentation/arch/arm64/memory-tagging-extension.rst b/Documentation/arch/arm64/memory-tagging-extension.rst
index 679725030731..e6fe428f0e2a 100644
--- a/Documentation/arch/arm64/memory-tagging-extension.rst
+++ b/Documentation/arch/arm64/memory-tagging-extension.rst
@@ -222,7 +222,7 @@ programs should not retry in case of a non-zero system call return.
 address ABI control and MTE configuration of a process as per the
 ``prctl()`` options described in
 Documentation/arch/arm64/tagged-address-abi.rst and above. The corresponding
-``regset`` is 1 element of 8 bytes (``sizeof(long))``).
+``regset`` is 1 element of 8 bytes (``sizeof(long)``).
 
 Core dump support
 -----------------
diff --git a/Documentation/arch/powerpc/vas-api.rst b/Documentation/arch/powerpc/vas-api.rst
index a9625a2fa0c6..1d0d055356e3 100644
--- a/Documentation/arch/powerpc/vas-api.rst
+++ b/Documentation/arch/powerpc/vas-api.rst
@@ -293,7 +293,7 @@ Simple example
 				//Format CRB request with compression or
 				//uncompression
 				// Refer tests for vas_copy/vas_paste
-				vas_copy((&crb, 0, 1);
+				vas_copy(&crb, 0, 1);
 				vas_paste(addr, 0, 1);
 				// Poll on csb.flags with timeout
 				// csb address is listed in CRB
diff --git a/Documentation/arch/sparc/oradax/dax-hv-api.txt b/Documentation/arch/sparc/oradax/dax-hv-api.txt
index ef1a4c2bf08b..ef6088aeaa66 100644
--- a/Documentation/arch/sparc/oradax/dax-hv-api.txt
+++ b/Documentation/arch/sparc/oradax/dax-hv-api.txt
@@ -457,7 +457,7 @@ bits set, and terminate at a CCB that has the Conditional bit set, but not the P
 Offset   Size   Field Description
                 Bits         Field Description
                 [15:14]      Secondary Input Element Size (see Section 36.2.1.1.4,
-                             “Secondary Input Element Size”
+                             “Secondary Input Element Size”)
                 [13:10]      Output Format (see Section 36.2.1.1.6, “Output Format”)
                 [9]          Padding Direction selector: A value of 1 causes padding bytes
                              to be added to the left side of output elements. A value of 0
@@ -656,7 +656,7 @@ Offset         Size            Field Description
                                [18:16]      Secondary Input Starting Offset (see Section 36.2.1.1.5, “Input
                                             Element Offsets”)
                                [15:14]      Secondary Input Element Size (see Section 36.2.1.1.4,
-                                            “Secondary Input Element Size”
+                                            “Secondary Input Element Size”)
                                [13:10]      Output Format (see Section 36.2.1.1.6, “Output Format”)
                                [9:5]        Operand size for first scan criteria value. In a scan value
                                             operation, this is one of two potential exact match values.
@@ -793,13 +793,13 @@ Offset   Size   Field Description
                 [18:16]      Secondary Input Starting Offset (see Section 36.2.1.1.5, “Input
                              Element Offsets”)
                 [15:14]      Secondary Input Element Size (see Section 36.2.1.1.4,
-                             “Secondary Input Element Size”
+                             “Secondary Input Element Size”)
                 [13:10]      Output Format (see Section 36.2.1.1.6, “Output Format”)
                 [9]          Reserved
                 [8:0]        Test value used for comparison against the most significant bits
                              in the input values, when using 2 or 3 byte input elements.
-8        8      Completion (same fields as Section 36.2.1.2, “Extract command”
-16       8      Primary Input (same fields as Section 36.2.1.2, “Extract command”
+8        8      Completion (same fields as Section 36.2.1.2, “Extract command”)
+16       8      Primary Input (same fields as Section 36.2.1.2, “Extract command”)
 24       8      Data Access Control (same fields as Section 36.2.1.2, “Extract command”,
                 except Primary Input Length Format may not use the 0x0 value)
 32       8      Secondary Input, if used by Primary Input Format. Same fields as Primary
@@ -880,7 +880,7 @@ Offset   Size   Field Description
                                        [18:16]     Secondary Input Starting Offset (see Section 36.2.1.1.5, “Input
                                                    Element Offsets”)
                                        [15:14]     Secondary Input Element Size (see Section 36.2.1.1.4,
-                                                   “Secondary Input Element Size”
+                                                   “Secondary Input Element Size”)
 
 
                                                       524
@@ -895,8 +895,8 @@ Offset   Size   Field Description
                                                     causes padding bytes to be added to the right side of output
                                                     elements.
                                        [8:0]        Reserved
-        8              8               Completion (same fields as Section 36.2.1.2, “Extract command”
-        16             8               Primary Input (same fields as Section 36.2.1.2, “Extract command”
+        8              8               Completion (same fields as Section 36.2.1.2, “Extract command”)
+        16             8               Primary Input (same fields as Section 36.2.1.2, “Extract command”)
         24             8               Data Access Control (same fields as Section 36.2.1.2, “Extract command”)
         32             8               Secondary Bit Vector Input. Same fields as Primary Input.
         40             8               Reserved
@@ -949,7 +949,7 @@ Offset   Size   Field Description
                                    [31]        If set, this CCB functions as a Sync command. If clear, this
                                                CCB functions as a No-op command.
                                    [30:0]      Reserved
-       8             8             Completion (same fields as Section 36.2.1.2, “Extract command”
+       8             8             Completion (same fields as Section 36.2.1.2, “Extract command”)
        16            46            Reserved
 
 36.2.2. CCB Completion Area
diff --git a/Documentation/arch/sparc/oradax/oracle-dax.rst b/Documentation/arch/sparc/oradax/oracle-dax.rst
index d1e14d572918..67867ea7be40 100644
--- a/Documentation/arch/sparc/oradax/oracle-dax.rst
+++ b/Documentation/arch/sparc/oradax/oracle-dax.rst
@@ -438,7 +438,7 @@ that in user land::
 The output bitmap is ready for consumption immediately after the
 completion status indicates success.
 
-Excer[t from UltraSPARC Virtual Machine Specification
+Excerpt from UltraSPARC Virtual Machine Specification
 =====================================================
 
  .. include:: dax-hv-api.txt
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH v6 01/11] x86/virt/tdx: Simplify tdmr_get_pamt_sz()
From: Vishal Annapurve @ 2026-06-11 18:25 UTC (permalink / raw)
  To: Rick Edgecombe
  Cc: bp, dave.hansen, hpa, kas, kvm, linux-coco, linux-doc,
	linux-kernel, mingo, nik.borisov, pbonzini, seanjc, tglx, x86,
	chao.gao, yan.y.zhao, kai.huang, Binbin Wu
In-Reply-To: <20260526023515.288829-2-rick.p.edgecombe@intel.com>

On Mon, May 25, 2026 at 7:35 PM Rick Edgecombe
<rick.p.edgecombe@intel.com> wrote:
>
> For each memory region that the TDX module might use (called TDMR), three
> separate traditional PAMT allocations are needed. One for each supported
> page size (1GB, 2MB, 4KB). These store information on each page in the
> TDMR. In Linux, they are allocated out of one physically contiguous block,
> in order to more efficiently use some internal TDX module book keeping
> resources. So some simple math is needed to break the single large
> allocation into three smaller allocations for each page size.
>
> There are some commonalities in the math needed to calculate the base and
> size for each smaller allocation, and so an effort was made to share logic
> across the three. Unfortunately doing this turned out unnaturally tortured,
> with a loop iterating over the three page sizes, only to call into a
> function with a case statement for each page size. In the future Dynamic
> PAMT will add more logic that is special to the 4KB page size, making the
> benefit of the math sharing even more questionable.
>
> Three is not a very high number, so get rid of the loop and just duplicate
> the small calculation three times. In doing so, setup for future Dynamic
> PAMT changes.
>
> Since the loop that iterates over it is gone, further simplify the code by
> dropping the array of intermediate size and base storage. Just store the
> values to their final locations. Accept the small complication of having
> to clear tdmr->pamt_4k_base in the error path, so that tdmr_do_pamt_func()
> will not try to operate on the TDMR struct when attempting to free it.
>
> Assisted-by: GitHub Copilot:claude-opus-4-6 Claude:claude-opus-4-7
> Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
> Signed-off-by: Rick Edgecombe <rick.p.edgecombe@intel.com>

Reviewed-by: Vishal Annapurve <vannapurve@google.com>

^ permalink raw reply

* Re: [PATCH net-next 2/3] docs: net: tls-offload: document tls_dev_del, tls_dev_resync, and rekey
From: Randy Dunlap @ 2026-06-11 18:13 UTC (permalink / raw)
  To: Jakub Kicinski, Sabrina Dubroca
  Cc: davem, netdev, edumazet, pabeni, andrew+netdev, horms, corbet,
	linux-doc, bpf, john.fastabend, skhan
In-Reply-To: <20260611101817.18964bd9@kernel.org>



On 6/11/26 10:18 AM, Jakub Kicinski wrote:
> On Wed, 10 Jun 2026 23:06:44 +0200 Sabrina Dubroca wrote:
>>> +The third TLS device callback is :c:member:`tls_dev_resync`, called by the core
>>> +to synchronize the TCP stream with the record boundaries:
>>> +
>>> +.. code-block:: c
>>> +
>>> +	int (*tls_dev_resync)(struct net_device *netdev,
>>> +			      struct sock *sk, u32 seq, u8 *rcd_sn,
>>> +			      enum tls_offload_ctx_dir direction);
>>> +
>>> +See the `Resync handling`_ section for details.  
>>
>> Hmm, this callback is not mentioned at all in the "Resync handling"
>> section. I think it'd be good to add at least a quick note there about
>> how/when it's invoked, and what the arguments mean (at least the two
>> types of sequence numbers, since the rest is identical to the other
>> driver CBs).
> 
> Something like this, you mean?
> 
> --- a/Documentation/networking/tls-offload.rst
> +++ b/Documentation/networking/tls-offload.rst
> @@ -278,9 +278,9 @@ sequence number (as it will be updated from a different context).
>    bool tls_offload_tx_resync_pending(struct sock *sk)
>  
>  Next time ``ktls`` pushes a record it will first send its TCP sequence number
> -and TLS record number to the driver. Stack will also make sure that
> -the new record will start on a segment boundary (like it does when
> -the connection is initially added).
> +and TLS record number to the driver via the ``tls_dev_resync`` callback.
> +Stack will also make sure that the new record will start on a segment boundary

Preferably "The stack ..."

> +(like it does when the connection is initially added).
>  
>  RX
>  --


-- 
~Randy


^ permalink raw reply

* Re: [PATCH net-next 2/3] docs: net: tls-offload: document tls_dev_del, tls_dev_resync, and rekey
From: Sabrina Dubroca @ 2026-06-11 17:55 UTC (permalink / raw)
  To: Jakub Kicinski
  Cc: davem, netdev, edumazet, pabeni, andrew+netdev, horms, corbet,
	linux-doc, bpf, john.fastabend, skhan
In-Reply-To: <20260611101817.18964bd9@kernel.org>

2026-06-11, 10:18:17 -0700, Jakub Kicinski wrote:
> On Wed, 10 Jun 2026 23:06:44 +0200 Sabrina Dubroca wrote:
> > > +The third TLS device callback is :c:member:`tls_dev_resync`, called by the core
> > > +to synchronize the TCP stream with the record boundaries:
> > > +
> > > +.. code-block:: c
> > > +
> > > +	int (*tls_dev_resync)(struct net_device *netdev,
> > > +			      struct sock *sk, u32 seq, u8 *rcd_sn,
> > > +			      enum tls_offload_ctx_dir direction);
> > > +
> > > +See the `Resync handling`_ section for details.  
> > 
> > Hmm, this callback is not mentioned at all in the "Resync handling"
> > section. I think it'd be good to add at least a quick note there about
> > how/when it's invoked, and what the arguments mean (at least the two
> > types of sequence numbers, since the rest is identical to the other
> > driver CBs).
> 
> Something like this, you mean?

Yeah, exactly.

-- 
Sabrina

^ permalink raw reply

* [PATCH v6 20/20] nfsd: add support to CB_NOTIFY for dir attribute changes
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

If the client requested dir attribute change notifications, send those
alongside any set of add/remove/rename events. Note that the server will
still recall the delegation on a SETATTR, so these are only sent for
changes to child dirents.

The child filehandle returned in these notifications is composed by
setup_notify_fhandle() without going through fh_compose(), so it does
not get a MAC appended. On exports configured with NFSEXP_SIGN_FH the
client would then get back an unsigned filehandle that fh_verify()
rejects as stale. Pass the delegation's export down to
setup_notify_fhandle() and append the MAC with fh_append_mac() when the
export requires signed filehandles; if signing fails, drop the
filehandle attribute rather than handing out an unusable one.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 25 ++++++++++++++++--
 fs/nfsd/nfs4xdr.c   | 73 +++++++++++++++++++++++++++++++++++++++++++++--------
 fs/nfsd/xdr4.h      |  2 ++
 3 files changed, 88 insertions(+), 12 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 12627afb604f..e394278fb92e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -3503,10 +3503,15 @@ nfsd4_cb_notify_prepare(struct nfsd4_callback *cb)
 	struct nfsd_notify_event *events[NOTIFY4_EVENT_QUEUE_SIZE];
 	struct xdr_buf xdr = { .buflen = PAGE_SIZE * NOTIFY4_PAGE_ARRAY_SIZE,
 			       .pages  = ncn->ncn_pages };
+	int limit = NOTIFY4_EVENT_QUEUE_SIZE;
 	struct xdr_stream stream;
 	struct nfsd_file *nf;
-	int count, i;
 	bool error = false;
+	int count, i;
+
+	/* Save a slot for dir attr update if requested */
+	if (dp->dl_notify_mask & BIT(NOTIFY4_CHANGE_DIR_ATTRS))
+		--limit;
 
 	xdr_init_encode_pages(&stream, &xdr);
 
@@ -3520,7 +3525,7 @@ nfsd4_cb_notify_prepare(struct nfsd4_callback *cb)
 	}
 
 	/* we can't keep up! */
-	if (count > NOTIFY4_EVENT_QUEUE_SIZE) {
+	if (count > limit) {
 		spin_unlock(&ncn->ncn_lock);
 		goto out_recall;
 	}
@@ -3567,6 +3572,22 @@ nfsd4_cb_notify_prepare(struct nfsd4_callback *cb)
 		nfsd_notify_event_put(nne);
 	}
 	if (!error) {
+		if (dp->dl_notify_mask & BIT(NOTIFY4_CHANGE_DIR_ATTRS)) {
+			u32 *maskp = (u32 *)xdr_reserve_space(&stream, sizeof(*maskp));
+
+			if (maskp) {
+				u8 *p = nfsd4_encode_dir_attr_change(&stream, dp, nf);
+
+				if (p) {
+					*maskp = BIT(NOTIFY4_CHANGE_DIR_ATTRS);
+					ncn->ncn_nf[count].notify_mask.count = 1;
+					ncn->ncn_nf[count].notify_mask.element = maskp;
+					ncn->ncn_nf[count].notify_vals.data = p;
+					ncn->ncn_nf[count].notify_vals.len = (u8 *)stream.p - p;
+					++count;
+				}
+			}
+		}
 		ncn->ncn_nf_cnt = count;
 		nfsd_file_put(nf);
 		return true;
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 1e3c360c06cd..7dd8476028d6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4199,7 +4199,8 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 
 static bool
 setup_notify_fhandle(struct dentry *dentry, struct nfs4_file *fi,
-		     struct nfsd_file *nf, struct nfsd4_fattr_args *args)
+		     struct nfsd_file *nf, struct svc_export *exp,
+		     struct nfsd4_fattr_args *args)
 {
 	int fileid_type, fsid_len, maxsize, flags = 0;
 	struct knfsd_fh *fhp = &args->fhandle;
@@ -4227,6 +4228,17 @@ setup_notify_fhandle(struct dentry *dentry, struct nfs4_file *fi,
 
 	fhp->fh_fileid_type = fileid_type;
 	fhp->fh_size += maxsize * 4;
+
+	/*
+	 * fh_compose() appends a MAC to filehandles on signed exports; this
+	 * hand-rolled filehandle must do the same or the client will get back
+	 * an unsigned filehandle that fh_verify() later rejects as stale.
+	 * If we can't sign it, don't hand it out at all.
+	 */
+	if (exp && (exp->ex_flags & NFSEXP_SIGN_FH))
+		if (!fh_append_mac(fhp, NFS4_FHSIZE, exp->cd->net))
+			return false;
+
 	return true;
 }
 
@@ -4240,11 +4252,11 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 			  struct nfsd_file *nf, char *name, u32 namelen)
 {
 	struct nfs4_file *fi = dp->dl_stid.sc_file;
-	struct path path =  { .mnt = nf->nf_file->f_path.mnt,
-			      .dentry = dentry };
+	struct path path = nf->nf_file->f_path;
 	struct nfsd4_fattr_args args = { };
 	uint32_t *attrmask;
 	__be32 status;
+	bool parent;
 	int ret;
 
 	/* Reserve space for attrmask */
@@ -4256,6 +4268,9 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 	ne->ne_file.len = namelen;
 	ne->ne_attrs.attrmask.element = attrmask;
 
+	parent = (dentry == path.dentry);
+	path.dentry = dentry;
+
 	/* FIXME: d_find_alias for inode ? */
 	if (!path.dentry || !d_inode(path.dentry))
 		goto noattrs;
@@ -4271,15 +4286,21 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 
 	args.change_attr = nfsd4_change_attribute(&args.stat);
 
-	attrmask[0] = dp->dl_child_attrs[0];
-	attrmask[1] = dp->dl_child_attrs[1];
-	attrmask[2] = 0;
+	if (parent) {
+		attrmask[0] = dp->dl_dir_attrs[0];
+		attrmask[1] = dp->dl_dir_attrs[1];
+	} else {
+		attrmask[0] = dp->dl_child_attrs[0];
+		attrmask[1] = dp->dl_child_attrs[1];
 
-	if (!setup_notify_fhandle(dentry, fi, nf, &args))
-		attrmask[0] &= ~FATTR4_WORD0_FILEHANDLE;
+		if (!setup_notify_fhandle(dentry, fi, nf,
+					  dp->dl_stid.sc_export, &args))
+			attrmask[0] &= ~FATTR4_WORD0_FILEHANDLE;
 
-	if (!(args.stat.result_mask & STATX_BTIME))
-		attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
+		if (!(args.stat.result_mask & STATX_BTIME))
+			attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
+	}
+	attrmask[2] = 0;
 
 	ne->ne_attrs.attrmask.count = 2;
 	ne->ne_attrs.attr_vals.data = (u8 *)xdr->p;
@@ -4392,6 +4413,38 @@ u8 *nfsd4_encode_notify_event(struct xdr_stream *xdr, struct nfsd_notify_event *
 	return NULL;
 }
 
+/**
+ * nfsd4_encode_dir_attr_change - encode a dir attr change event
+ * @xdr: stream to which to encode the fattr4
+ * @dp: delegation where the event occurred
+ * @nf: nfsd_file opened on the directory
+ *
+ * Encode a dir attr change event.
+ */
+u8 *nfsd4_encode_dir_attr_change(struct xdr_stream *xdr, struct nfs4_delegation *dp,
+				 struct nfsd_file *nf)
+{
+	struct dentry *dentry = nf->nf_file->f_path.dentry;
+	struct notify_attr4 na = { };
+	bool ret;
+	u8 *p = NULL;
+
+	if (!(dp->dl_notify_mask & BIT(NOTIFY4_CHANGE_DIR_ATTRS)))
+		return NULL;
+
+	/* RFC 8881 s10.4.3: ne_file must be a zero-length string for dir attrs */
+	ret = nfsd4_setup_notify_entry4(&na.na_changed_entry, xdr,
+					dentry, dp, nf, "", 0);
+
+	/* Don't bother with the event if we're not encoding attrs */
+	if (ret && na.na_changed_entry.ne_attrs.attr_vals.len) {
+		p = (u8 *)xdr->p;
+		if (!xdrgen_encode_notify_attr4(xdr, &na))
+			p = NULL;
+	}
+	return p;
+}
+
 static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr,
 				struct xdr_buf *buf, __be32 *p, int bytes)
 {
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 62ac790428be..805c7122eb93 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -973,6 +973,8 @@ __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
 u8 *nfsd4_encode_notify_event(struct xdr_stream *xdr, struct nfsd_notify_event *nne,
 			      struct nfs4_delegation *dd, struct nfsd_file *nf,
 			      u32 *notify_mask);
+u8 *nfsd4_encode_dir_attr_change(struct xdr_stream *xdr, struct nfs4_delegation *dp,
+				 struct nfsd_file *nf);
 extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, union nfsd4_op_u *u);
 extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 19/20] nfsd: track requested dir attributes
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

Track the intersection of requested and supported dir attributes in the
delegation. In a later patch this will be used to ensure that we
only encode the attributes in that intersection when sending
add/remove/rename updates.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4proc.c  |  9 ++++++---
 fs/nfsd/nfs4state.c | 20 ++++++++++++++++----
 fs/nfsd/state.h     |  2 ++
 3 files changed, 24 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index caec82e77081..9e86f5907f06 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2530,9 +2530,10 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return status == nfserr_same ? nfs_ok : status;
 }
 
-#define SUPPORTED_NOTIFY_MASK	(BIT(NOTIFY4_REMOVE_ENTRY) |	\
-				 BIT(NOTIFY4_ADD_ENTRY) |	\
-				 BIT(NOTIFY4_RENAME_ENTRY) |	\
+#define SUPPORTED_NOTIFY_MASK	(BIT(NOTIFY4_CHANGE_DIR_ATTRS) |	\
+				 BIT(NOTIFY4_REMOVE_ENTRY) |		\
+				 BIT(NOTIFY4_ADD_ENTRY) |		\
+				 BIT(NOTIFY4_RENAME_ENTRY) |		\
 				 BIT(NOTIFY4_GFLAG_EXTEND))
 
 static __be32
@@ -2579,6 +2580,8 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
 	memcpy(&gdd->gddr_stateid, &dd->dl_stid.sc_stateid, sizeof(gdd->gddr_stateid));
 	gdd->gddr_child_attributes[0] = dd->dl_child_attrs[0];
 	gdd->gddr_child_attributes[1] = dd->dl_child_attrs[1];
+	gdd->gddr_dir_attributes[0] = dd->dl_dir_attrs[0];
+	gdd->gddr_dir_attributes[1] = dd->dl_dir_attrs[1];
 	nfs4_put_stid(&dd->dl_stid);
 	return nfs_ok;
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0e6e008c121e..12627afb604f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -9945,6 +9945,15 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry,
 				 FATTR4_WORD1_TIME_MODIFY |	\
 				 FATTR4_WORD1_TIME_CREATE)
 
+#define GDD_WORD0_DIR_ATTRS	(FATTR4_WORD0_CHANGE |		\
+				 FATTR4_WORD0_SIZE)
+
+#define GDD_WORD1_DIR_ATTRS	(FATTR4_WORD1_NUMLINKS |	\
+				 FATTR4_WORD1_SPACE_USED |	\
+				 FATTR4_WORD1_TIME_ACCESS |	\
+				 FATTR4_WORD1_TIME_METADATA |	\
+				 FATTR4_WORD1_TIME_MODIFY)
+
 /**
  * nfsd_get_dir_deleg - attempt to get a directory delegation
  * @cstate: compound state
@@ -10013,14 +10022,17 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
 		dp->dl_stid.sc_export =
 			exp_get(cstate->current_fh.fh_export);
 
-	dp->dl_child_attrs[0] = gdd->gdda_child_attributes[0] & GDD_WORD0_CHILD_ATTRS;
-	dp->dl_child_attrs[1] = gdd->gdda_child_attributes[1] & GDD_WORD1_CHILD_ATTRS;
-
 	/*
 	 * NB: gddr_notification[0] represents the notifications that
 	 * will be granted to the client
 	 */
-	fl = nfs4_alloc_init_lease(dp, gdd->gddr_notification[0]);
+	dp->dl_notify_mask = gdd->gddr_notification[0];
+	dp->dl_child_attrs[0] = gdd->gdda_child_attributes[0] & GDD_WORD0_CHILD_ATTRS;
+	dp->dl_child_attrs[1] = gdd->gdda_child_attributes[1] & GDD_WORD1_CHILD_ATTRS;
+	dp->dl_dir_attrs[0] = gdd->gdda_dir_attributes[0] & GDD_WORD0_DIR_ATTRS;
+	dp->dl_dir_attrs[1] = gdd->gdda_dir_attributes[1] & GDD_WORD1_DIR_ATTRS;
+
+	fl = nfs4_alloc_init_lease(dp, dp->dl_notify_mask);
 	if (!fl)
 		goto out_put_stid;
 
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 0763893bfd48..17be4011740d 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -299,7 +299,9 @@ struct nfs4_delegation {
 	struct timespec64	dl_ctime;
 
 	/* For dir delegations */
+	uint32_t		dl_notify_mask;
 	uint32_t		dl_child_attrs[2];
+	uint32_t		dl_dir_attrs[2];
 };
 
 static inline bool deleg_is_read(u32 dl_type)

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 18/20] nfsd: properly track requested child attributes
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

Track the intersection of requested and supported child attributes in the
delegation, and only encode the attributes in that intersection when sending
add/remove/rename updates.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4proc.c  |  2 ++
 fs/nfsd/nfs4state.c | 18 ++++++++++++++++++
 fs/nfsd/nfs4xdr.c   | 15 ++++++---------
 fs/nfsd/state.h     |  3 +++
 4 files changed, 29 insertions(+), 9 deletions(-)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 29f7339dc220..caec82e77081 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2577,6 +2577,8 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
 
 	gdd->gddrnf_status = GDD4_OK;
 	memcpy(&gdd->gddr_stateid, &dd->dl_stid.sc_stateid, sizeof(gdd->gddr_stateid));
+	gdd->gddr_child_attributes[0] = dd->dl_child_attrs[0];
+	gdd->gddr_child_attributes[1] = dd->dl_child_attrs[1];
 	nfs4_put_stid(&dd->dl_stid);
 	return nfs_ok;
 }
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index aa99783ce901..0e6e008c121e 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -9930,6 +9930,21 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry,
 	return status;
 }
 
+#define GDD_WORD0_CHILD_ATTRS	(FATTR4_WORD0_TYPE |		\
+				 FATTR4_WORD0_CHANGE |		\
+				 FATTR4_WORD0_SIZE |		\
+				 FATTR4_WORD0_FILEID |		\
+				 FATTR4_WORD0_FILEHANDLE)
+
+#define GDD_WORD1_CHILD_ATTRS	(FATTR4_WORD1_MODE |		\
+				 FATTR4_WORD1_NUMLINKS |	\
+				 FATTR4_WORD1_RAWDEV |		\
+				 FATTR4_WORD1_SPACE_USED |	\
+				 FATTR4_WORD1_TIME_ACCESS |	\
+				 FATTR4_WORD1_TIME_METADATA |	\
+				 FATTR4_WORD1_TIME_MODIFY |	\
+				 FATTR4_WORD1_TIME_CREATE)
+
 /**
  * nfsd_get_dir_deleg - attempt to get a directory delegation
  * @cstate: compound state
@@ -9998,6 +10013,9 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
 		dp->dl_stid.sc_export =
 			exp_get(cstate->current_fh.fh_export);
 
+	dp->dl_child_attrs[0] = gdd->gdda_child_attributes[0] & GDD_WORD0_CHILD_ATTRS;
+	dp->dl_child_attrs[1] = gdd->gdda_child_attributes[1] & GDD_WORD1_CHILD_ATTRS;
+
 	/*
 	 * NB: gddr_notification[0] represents the notifications that
 	 * will be granted to the client
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15ccd54ffdb6..1e3c360c06cd 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4271,18 +4271,15 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 
 	args.change_attr = nfsd4_change_attribute(&args.stat);
 
-	attrmask[0] = FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE |
-		      FATTR4_WORD0_SIZE | FATTR4_WORD0_FILEID;
-	attrmask[1] = FATTR4_WORD1_MODE | FATTR4_WORD1_NUMLINKS | FATTR4_WORD1_RAWDEV |
-		      FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS |
-		      FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY;
+	attrmask[0] = dp->dl_child_attrs[0];
+	attrmask[1] = dp->dl_child_attrs[1];
 	attrmask[2] = 0;
 
-	if (setup_notify_fhandle(dentry, fi, nf, &args))
-		attrmask[0] |= FATTR4_WORD0_FILEHANDLE;
+	if (!setup_notify_fhandle(dentry, fi, nf, &args))
+		attrmask[0] &= ~FATTR4_WORD0_FILEHANDLE;
 
-	if (args.stat.result_mask & STATX_BTIME)
-		attrmask[1] |= FATTR4_WORD1_TIME_CREATE;
+	if (!(args.stat.result_mask & STATX_BTIME))
+		attrmask[1] &= ~FATTR4_WORD1_TIME_CREATE;
 
 	ne->ne_attrs.attrmask.count = 2;
 	ne->ne_attrs.attr_vals.data = (u8 *)xdr->p;
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index d912e3d04dd7..0763893bfd48 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -297,6 +297,9 @@ struct nfs4_delegation {
 	struct timespec64	dl_atime;
 	struct timespec64	dl_mtime;
 	struct timespec64	dl_ctime;
+
+	/* For dir delegations */
+	uint32_t		dl_child_attrs[2];
 };
 
 static inline bool deleg_is_read(u32 dl_type)

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 17/20] nfsd: add the filehandle to returned attributes in CB_NOTIFY
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

nfsd's usual fh_compose routine requires a svc_export and fills out a
svc_fh. In the context of a CB_NOTIFY there is no such export to
consult.

Add a new routine that composes a filehandle with only a parent
filehandle and nfs4_file. Use that to fill out the fhandle field in the
nfsd4_fattr_args.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4xdr.c | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 7b19248b1503..15ccd54ffdb6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4197,6 +4197,39 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	goto out;
 }
 
+static bool
+setup_notify_fhandle(struct dentry *dentry, struct nfs4_file *fi,
+		     struct nfsd_file *nf, struct nfsd4_fattr_args *args)
+{
+	int fileid_type, fsid_len, maxsize, flags = 0;
+	struct knfsd_fh *fhp = &args->fhandle;
+	struct inode *inode = d_inode(dentry);
+	struct inode *parent = NULL;
+	struct fid *fid;
+
+	fsid_len = key_len(fi->fi_fhandle.fh_fsid_type);
+	fhp->fh_size = 4 + fsid_len;
+
+	/* Copy first 4 bytes + fsid */
+	memcpy(&fhp->fh_raw, &fi->fi_fhandle.fh_raw, fhp->fh_size);
+
+	fid = (struct fid *)(fh_fsid(fhp) + fsid_len/4);
+	maxsize = (NFS4_FHSIZE - fhp->fh_size)/4;
+
+	if (fi->fi_connectable && !S_ISDIR(inode->i_mode)) {
+		parent = d_inode(nf->nf_file->f_path.dentry);
+		flags = EXPORT_FH_CONNECTABLE;
+	}
+
+	fileid_type = exportfs_encode_inode_fh(inode, fid, &maxsize, parent, flags);
+	if (fileid_type < 0 || fileid_type == FILEID_INVALID)
+		return false;
+
+	fhp->fh_fileid_type = fileid_type;
+	fhp->fh_size += maxsize * 4;
+	return true;
+}
+
 #define CB_NOTIFY_STATX_REQUEST_MASK (STATX_BASIC_STATS   | \
 				      STATX_BTIME	  | \
 				      STATX_CHANGE_COOKIE)
@@ -4206,6 +4239,7 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 			  struct dentry *dentry, struct nfs4_delegation *dp,
 			  struct nfsd_file *nf, char *name, u32 namelen)
 {
+	struct nfs4_file *fi = dp->dl_stid.sc_file;
 	struct path path =  { .mnt = nf->nf_file->f_path.mnt,
 			      .dentry = dentry };
 	struct nfsd4_fattr_args args = { };
@@ -4244,6 +4278,9 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 		      FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY;
 	attrmask[2] = 0;
 
+	if (setup_notify_fhandle(dentry, fi, nf, &args))
+		attrmask[0] |= FATTR4_WORD0_FILEHANDLE;
+
 	if (args.stat.result_mask & STATX_BTIME)
 		attrmask[1] |= FATTR4_WORD1_TIME_CREATE;
 

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 16/20] nfsd: add a fi_connectable flag to struct nfs4_file
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

When encoding a filehandle for a CB_NOTIFY, there is no svc_export
available, but the server needs to know whether to encode a connectable
filehandle. Add a flag to the nfs4_file that tells whether the
svc_export under which a directory delegation was acquired has subtree
checking enabled, in which case it needs connectable filehandles.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 1 +
 fs/nfsd/state.h     | 1 +
 2 files changed, 2 insertions(+)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 513cbc1a583f..aa99783ce901 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -5231,6 +5231,7 @@ static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp)
 	memset(fp->fi_access, 0, sizeof(fp->fi_access));
 	fp->fi_aliased = false;
 	fp->fi_inode = d_inode(fh->fh_dentry);
+	fp->fi_connectable = !(fh->fh_export->ex_flags & NFSEXP_NOSUBTREECHECK);
 #ifdef CONFIG_NFSD_PNFS
 	INIT_LIST_HEAD(&fp->fi_lo_states);
 	atomic_set(&fp->fi_lo_recalls, 0);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index f8457e0f2b57..d912e3d04dd7 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -761,6 +761,7 @@ struct nfs4_file {
 	int			fi_delegees;
 	struct knfsd_fh		fi_fhandle;
 	bool			fi_had_conflict;
+	bool			fi_connectable;
 #ifdef CONFIG_NFSD_PNFS
 	struct list_head	fi_lo_states;
 	atomic_t		fi_lo_recalls;

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 15/20] nfsd: allow encoding a filehandle into fattr4 without a svc_fh
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

The current fattr4 encoder requires a svc_fh in order to encode the
filehandle. This is not available in a CB_NOTIFY callback. Add a new
"fhandle" field to struct nfsd4_fattr_args and copy the filehandle into
there from the svc_fh. CB_NOTIFY will populate it via other means.

A filehandle composed this way may still need a MAC appended on signed
exports, so generalize fh_append_mac() to operate on a bare knfsd_fh
(plus its maximum size and net) rather than a svc_fh.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4xdr.c | 36 +++++++++++++++++++++---------------
 fs/nfsd/nfsfh.c   | 10 +++++-----
 fs/nfsd/nfsfh.h   |  1 +
 3 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 4fb61d05a4a7..7b19248b1503 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2715,7 +2715,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
 }
 
 static __be32 nfsd4_encode_nfs_fh4(struct xdr_stream *xdr,
-				   struct knfsd_fh *fh_handle)
+				   const struct knfsd_fh *fh_handle)
 {
 	return nfsd4_encode_opaque(xdr, fh_handle->fh_raw, fh_handle->fh_size);
 }
@@ -3158,6 +3158,7 @@ struct nfsd4_fattr_args {
 	struct svc_fh		*fhp;
 	struct svc_export	*exp;
 	struct dentry		*dentry;
+	struct knfsd_fh		fhandle;
 	struct kstat		stat;
 	struct kstatfs		statfs;
 	struct nfs4_acl		*acl;
@@ -3402,7 +3403,7 @@ static __be32 nfsd4_encode_fattr4_homogeneous(struct xdr_stream *xdr,
 static __be32 nfsd4_encode_fattr4_filehandle(struct xdr_stream *xdr,
 					     const struct nfsd4_fattr_args *args)
 {
-	return nfsd4_encode_nfs_fh4(xdr, &args->fhp->fh_handle);
+	return nfsd4_encode_nfs_fh4(xdr, &args->fhandle);
 }
 
 static __be32 nfsd4_encode_fattr4_fileid(struct xdr_stream *xdr,
@@ -4015,19 +4016,24 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 		if (err)
 			goto out_nfserr;
 	}
-	if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID)) &&
-	    !fhp) {
-		tempfh = kmalloc_obj(struct svc_fh);
-		status = nfserr_jukebox;
-		if (!tempfh)
-			goto out;
-		fh_init(tempfh, NFS4_FHSIZE);
-		status = fh_compose(tempfh, exp, dentry, NULL);
-		if (status)
-			goto out;
-		args.fhp = tempfh;
-	} else
-		args.fhp = fhp;
+
+	args.fhp = fhp;
+	if ((attrmask[0] & (FATTR4_WORD0_FILEHANDLE | FATTR4_WORD0_FSID))) {
+		if (!args.fhp) {
+			tempfh = kmalloc_obj(struct svc_fh);
+			status = nfserr_jukebox;
+			if (!tempfh)
+				goto out;
+			fh_init(tempfh, NFS4_FHSIZE);
+			status = fh_compose(tempfh, exp, dentry, NULL);
+			if (status)
+				goto out;
+			args.fhp = tempfh;
+		}
+		if (args.fhp)
+			fh_copy_shallow(&args.fhandle, &args.fhp->fh_handle);
+	}
+
 	if (attrmask[0] & (FATTR4_WORD0_CASE_INSENSITIVE |
 			   FATTR4_WORD0_CASE_PRESERVING)) {
 		/*
diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c
index b36915401758..3b29cd70d4a1 100644
--- a/fs/nfsd/nfsfh.c
+++ b/fs/nfsd/nfsfh.c
@@ -144,16 +144,15 @@ static inline __be32 check_pseudo_root(struct dentry *dentry,
 /* Size of a file handle MAC, in 4-octet words */
 #define FH_MAC_WORDS (sizeof(__le64) / 4)
 
-static bool fh_append_mac(struct svc_fh *fhp, struct net *net)
+bool fh_append_mac(struct knfsd_fh *fh, int fh_maxsize, struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
-	struct knfsd_fh *fh = &fhp->fh_handle;
 	siphash_key_t *fh_key = nn->fh_key;
 	__le64 hash;
 
 	if (!fh_key)
 		goto out_no_key;
-	if (fh->fh_size + sizeof(hash) > fhp->fh_maxsize)
+	if (fh->fh_size + sizeof(hash) > fh_maxsize)
 		goto out_no_space;
 
 	hash = cpu_to_le64(siphash(&fh->fh_raw, fh->fh_size, fh_key));
@@ -167,7 +166,7 @@ static bool fh_append_mac(struct svc_fh *fhp, struct net *net)
 
 out_no_space:
 	pr_warn_ratelimited("NFSD: unable to sign filehandles, fh_size %zu would be greater than fh_maxsize %d.\n",
-			    fh->fh_size + sizeof(hash), fhp->fh_maxsize);
+			    fh->fh_size + sizeof(hash), fh_maxsize);
 	return false;
 }
 
@@ -566,7 +565,8 @@ static void _fh_update(struct svc_fh *fhp, struct svc_export *exp,
 		fhp->fh_handle.fh_size += maxsize * 4;
 
 		if (exp->ex_flags & NFSEXP_SIGN_FH)
-			if (!fh_append_mac(fhp, exp->cd->net))
+			if (!fh_append_mac(&fhp->fh_handle, fhp->fh_maxsize,
+					   exp->cd->net))
 				fhp->fh_handle.fh_fileid_type = FILEID_INVALID;
 	} else {
 		fhp->fh_handle.fh_fileid_type = FILEID_ROOT;
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 5ef7191f8ad8..5dc10b442d6c 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -226,6 +226,7 @@ __be32	fh_getattr(const struct svc_fh *fhp, struct kstat *stat);
 __be32	fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *);
 __be32	fh_update(struct svc_fh *);
 void	fh_put(struct svc_fh *);
+bool	fh_append_mac(struct knfsd_fh *fh, int fh_maxsize, struct net *net);
 
 static __inline__ struct svc_fh *
 fh_copy(struct svc_fh *dst, const struct svc_fh *src)

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 14/20] nfsd: send basic file attributes in CB_NOTIFY
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

In addition to the filename, send attributes about the inode in a
CB_NOTIFY event. This patch just adds the basic inode information that
can be acquired via GETATTR.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4xdr.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 18adab1d7ca2..4fb61d05a4a7 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4191,12 +4191,21 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	goto out;
 }
 
+#define CB_NOTIFY_STATX_REQUEST_MASK (STATX_BASIC_STATS   | \
+				      STATX_BTIME	  | \
+				      STATX_CHANGE_COOKIE)
+
 static bool
 nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 			  struct dentry *dentry, struct nfs4_delegation *dp,
 			  struct nfsd_file *nf, char *name, u32 namelen)
 {
+	struct path path =  { .mnt = nf->nf_file->f_path.mnt,
+			      .dentry = dentry };
+	struct nfsd4_fattr_args args = { };
 	uint32_t *attrmask;
+	__be32 status;
+	int ret;
 
 	/* Reserve space for attrmask */
 	attrmask = xdr_reserve_space(xdr, 3 * sizeof(uint32_t));
@@ -4207,6 +4216,41 @@ nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
 	ne->ne_file.len = namelen;
 	ne->ne_attrs.attrmask.element = attrmask;
 
+	/* FIXME: d_find_alias for inode ? */
+	if (!path.dentry || !d_inode(path.dentry))
+		goto noattrs;
+
+	/*
+	 * It is possible that the client was granted a delegation when a file
+	 * was created. Note that we don't issue a CB_GETATTR here since stale
+	 * attributes are presumably ok.
+	 */
+	ret = vfs_getattr(&path, &args.stat, CB_NOTIFY_STATX_REQUEST_MASK, AT_STATX_SYNC_AS_STAT);
+	if (ret)
+		goto noattrs;
+
+	args.change_attr = nfsd4_change_attribute(&args.stat);
+
+	attrmask[0] = FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE |
+		      FATTR4_WORD0_SIZE | FATTR4_WORD0_FILEID;
+	attrmask[1] = FATTR4_WORD1_MODE | FATTR4_WORD1_NUMLINKS | FATTR4_WORD1_RAWDEV |
+		      FATTR4_WORD1_SPACE_USED | FATTR4_WORD1_TIME_ACCESS |
+		      FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY;
+	attrmask[2] = 0;
+
+	if (args.stat.result_mask & STATX_BTIME)
+		attrmask[1] |= FATTR4_WORD1_TIME_CREATE;
+
+	ne->ne_attrs.attrmask.count = 2;
+	ne->ne_attrs.attr_vals.data = (u8 *)xdr->p;
+
+	status = nfsd4_encode_attr_vals(xdr, attrmask, &args);
+	if (status != nfs_ok)
+		goto noattrs;
+
+	ne->ne_attrs.attr_vals.len = (u8 *)xdr->p - ne->ne_attrs.attr_vals.data;
+	return true;
+noattrs:
 	attrmask[0] = 0;
 	attrmask[1] = 0;
 	attrmask[2] = 0;

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 13/20] nfsd: allow nfsd4_encode_fattr4_change() to work with no export
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

In the context of a CB_NOTIFY callback, we may not have easy access to
a svc_export. nfsd will not currently grant a delegation on the V4 root
however, so this should be safe.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4xdr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 7d162e5fb6ec..18adab1d7ca2 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3273,7 +3273,7 @@ static __be32 nfsd4_encode_fattr4_change(struct xdr_stream *xdr,
 {
 	const struct svc_export *exp = args->exp;
 
-	if (unlikely(exp->ex_flags & NFSEXP_V4ROOT)) {
+	if (exp && unlikely(exp->ex_flags & NFSEXP_V4ROOT)) {
 		u32 flush_time = convert_to_wallclock(exp->cd->flush_time);
 
 		if (xdr_stream_encode_u32(xdr, flush_time) != XDR_UNIT)

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 12/20] nfsd: add helper to marshal a fattr4 from completed args
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

Break the loop that encodes the actual attr_vals field into a separate
function.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4xdr.c | 44 +++++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 19 deletions(-)

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index c6f92ddeb449..7d162e5fb6ec 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3895,6 +3895,22 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
 #endif
 };
 
+static __be32
+nfsd4_encode_attr_vals(struct xdr_stream *xdr, u32 *attrmask, struct nfsd4_fattr_args *args)
+{
+	DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
+	unsigned long bit;
+	__be32 status;
+
+	bitmap_from_arr32(attr_bitmap, attrmask, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
+	for_each_set_bit(bit, attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) {
+		status = nfsd4_enc_fattr4_encode_ops[bit](xdr, args);
+		if (status != nfs_ok)
+			return status;
+	}
+	return nfs_ok;
+}
+
 /*
  * Note: @fhp can be NULL; in this case, we might have to compose the filehandle
  * ourselves. @case_cache is NULL for callers that encode a single dentry
@@ -3908,7 +3924,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 		    int ignore_crossmnt,
 		    struct nfsd_case_attrs_cache *case_cache)
 {
-	DECLARE_BITMAP(attr_bitmap, ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
 	struct nfs4_delegation *dp = NULL;
 	struct nfsd4_fattr_args args;
 	struct svc_fh *tempfh = NULL;
@@ -3923,7 +3938,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 		.mnt	= exp->ex_path.mnt,
 		.dentry	= dentry,
 	};
-	unsigned long bit;
 
 	WARN_ON_ONCE(bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1);
 	WARN_ON_ONCE(!nfsd_attrs_supported(minorversion, bmval));
@@ -4137,27 +4151,22 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 #endif /* CONFIG_NFSD_V4_POSIX_ACLS */
 
 	/* attrmask */
-	status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1],
-				      attrmask[2]);
+	status = nfsd4_encode_bitmap4(xdr, attrmask[0], attrmask[1], attrmask[2]);
 	if (status)
 		goto out;
 
 	/* attr_vals */
 	attrlen_offset = xdr->buf->len;
-	if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT)))
-		goto out_resource;
-	bitmap_from_arr32(attr_bitmap, attrmask,
-			  ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops));
-	for_each_set_bit(bit, attr_bitmap,
-			 ARRAY_SIZE(nfsd4_enc_fattr4_encode_ops)) {
-		status = nfsd4_enc_fattr4_encode_ops[bit](xdr, &args);
-		if (status != nfs_ok)
-			goto out;
+	if (unlikely(!xdr_reserve_space(xdr, XDR_UNIT))) {
+		status = nfserr_resource;
+		goto out;
 	}
-	attrlen = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);
-	write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, XDR_UNIT);
-	status = nfs_ok;
 
+	status = nfsd4_encode_attr_vals(xdr, attrmask, &args);
+	if (status == nfs_ok) {
+		attrlen = cpu_to_be32(xdr->buf->len - attrlen_offset - XDR_UNIT);
+		write_bytes_to_xdr_buf(xdr->buf, attrlen_offset, &attrlen, XDR_UNIT);
+	}
 out:
 #ifdef CONFIG_NFSD_V4_POSIX_ACLS
 	if (args.dpacl)
@@ -4180,9 +4189,6 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 out_nfserr:
 	status = nfserrno(err);
 	goto out;
-out_resource:
-	status = nfserr_resource;
-	goto out;
 }
 
 static bool

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 11/20] nfsd: apply the notify mask to the delegation when requested
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

If the client requests a directory delegation with notifications
enabled, set the appropriate return mask in gddr_notification[0]. This
will ensure the lease acquisition sets the appropriate ignore mask.

If the client doesn't set NOTIFY4_GFLAG_EXTEND, then don't offer any
notifications, as nfsd won't provide directory offset information, and
"classic" notifications require them.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4proc.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 0c37d7c6d28c..29f7339dc220 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -2530,12 +2530,18 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	return status == nfserr_same ? nfs_ok : status;
 }
 
+#define SUPPORTED_NOTIFY_MASK	(BIT(NOTIFY4_REMOVE_ENTRY) |	\
+				 BIT(NOTIFY4_ADD_ENTRY) |	\
+				 BIT(NOTIFY4_RENAME_ENTRY) |	\
+				 BIT(NOTIFY4_GFLAG_EXTEND))
+
 static __be32
 nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
 			 struct nfsd4_compound_state *cstate,
 			 union nfsd4_op_u *u)
 {
 	struct nfsd4_get_dir_delegation *gdd = &u->get_dir_delegation;
+	u32 requested = gdd->gdda_notification_types[0];
 	struct nfs4_delegation *dd;
 	struct nfsd_file *nf;
 	__be32 status;
@@ -2544,6 +2550,12 @@ nfsd4_get_dir_delegation(struct svc_rqst *rqstp,
 	if (status != nfs_ok)
 		return status;
 
+	/* No notifications if you don't set NOTIFY4_GFLAG_EXTEND! */
+	if (!(requested & BIT(NOTIFY4_GFLAG_EXTEND)))
+		requested = 0;
+
+	gdd->gddr_notification[0] = requested & SUPPORTED_NOTIFY_MASK;
+
 	/*
 	 * RFC 8881, section 18.39.3 says:
 	 *

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 10/20] nfsd: add notification handlers for dir events
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

Add the necessary parts to accept a fsnotify callback for a directory
change event and create a CB_NOTIFY request for it. When a dir nfsd_file
is created, set a handle_event callback to handle the notification.

Use that to allocate a nfsd_notify_event object and then hand off a
reference to each delegation's CB_NOTIFY. If anything fails along the
way, recall any affected delegations.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/filecache.c    |  70 ++++++++---
 fs/nfsd/nfs4callback.c |  54 ++++++++-
 fs/nfsd/nfs4state.c    | 322 +++++++++++++++++++++++++++++++++++++++++++++----
 fs/nfsd/nfs4xdr.c      | 117 ++++++++++++++++++
 fs/nfsd/state.h        |  20 ++-
 fs/nfsd/trace.h        |  23 ++++
 fs/nfsd/xdr4.h         |   3 +
 7 files changed, 564 insertions(+), 45 deletions(-)

diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index c5f2c5768324..b9548eb17c77 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -78,6 +78,7 @@ static struct kmem_cache		*nfsd_file_mark_slab;
 static struct list_lru			nfsd_file_lru;
 static unsigned long			nfsd_file_flags;
 static struct fsnotify_group		*nfsd_file_fsnotify_group;
+static struct fsnotify_group		*nfsd_dir_fsnotify_group;
 static struct delayed_work		nfsd_filecache_laundrette;
 static struct rhltable			nfsd_file_rhltable
 						____cacheline_aligned_in_smp;
@@ -153,7 +154,7 @@ static void
 nfsd_file_mark_put(struct nfsd_file_mark *nfm)
 {
 	if (refcount_dec_and_test(&nfm->nfm_ref)) {
-		fsnotify_destroy_mark(&nfm->nfm_mark, nfsd_file_fsnotify_group);
+		fsnotify_destroy_mark(&nfm->nfm_mark, nfm->nfm_mark.group);
 		fsnotify_put_mark(&nfm->nfm_mark);
 	}
 }
@@ -161,35 +162,37 @@ nfsd_file_mark_put(struct nfsd_file_mark *nfm)
 static struct nfsd_file_mark *
 nfsd_file_mark_find_or_create(struct inode *inode)
 {
-	int			err;
-	struct fsnotify_mark	*mark;
 	struct nfsd_file_mark	*nfm = NULL, *new;
+	struct fsnotify_group	*group;
+	struct fsnotify_mark	*mark;
+	int			err;
+
+	group = S_ISDIR(inode->i_mode) ? nfsd_dir_fsnotify_group : nfsd_file_fsnotify_group;
 
 	do {
-		fsnotify_group_lock(nfsd_file_fsnotify_group);
-		mark = fsnotify_find_inode_mark(inode,
-						nfsd_file_fsnotify_group);
+		fsnotify_group_lock(group);
+		mark = fsnotify_find_inode_mark(inode, group);
 		if (mark) {
 			nfm = nfsd_file_mark_get(container_of(mark,
 						 struct nfsd_file_mark,
 						 nfm_mark));
-			fsnotify_group_unlock(nfsd_file_fsnotify_group);
+			fsnotify_group_unlock(group);
 			if (nfm) {
 				fsnotify_put_mark(mark);
 				break;
 			}
 			/* Avoid soft lockup race with nfsd_file_mark_put() */
-			fsnotify_destroy_mark(mark, nfsd_file_fsnotify_group);
+			fsnotify_destroy_mark(mark, group);
 			fsnotify_put_mark(mark);
 		} else {
-			fsnotify_group_unlock(nfsd_file_fsnotify_group);
+			fsnotify_group_unlock(group);
 		}
 
 		/* allocate a new nfm */
 		new = kmem_cache_alloc(nfsd_file_mark_slab, GFP_KERNEL);
 		if (!new)
 			return NULL;
-		fsnotify_init_mark(&new->nfm_mark, nfsd_file_fsnotify_group);
+		fsnotify_init_mark(&new->nfm_mark, group);
 		new->nfm_mark.mask = FS_ATTRIB|FS_DELETE_SELF;
 		refcount_set(&new->nfm_ref, 1);
 		mutex_init(&new->nfm_recalc_mutex);
@@ -830,12 +833,36 @@ nfsd_file_fsnotify_handle_event(struct fsnotify_mark *mark, u32 mask,
 	return 0;
 }
 
+#ifdef CONFIG_NFSD_V4
+static int
+nfsd_dir_fsnotify_handle_event(struct fsnotify_group *group, u32 mask,
+			       const void *data, int data_type, struct inode *dir,
+			       const struct qstr *name, u32 cookie,
+			       struct fsnotify_iter_info *iter_info)
+{
+	return nfsd_handle_dir_event(mask, dir, data, data_type, name);
+}
+#else
+static int
+nfsd_dir_fsnotify_handle_event(struct fsnotify_group *group, u32 mask,
+			       const void *data, int data_type, struct inode *dir,
+			       const struct qstr *name, u32 cookie,
+			       struct fsnotify_iter_info *iter_info)
+{
+	return 0;
+}
+#endif
 
 static const struct fsnotify_ops nfsd_file_fsnotify_ops = {
 	.handle_inode_event = nfsd_file_fsnotify_handle_event,
 	.free_mark = nfsd_file_mark_free,
 };
 
+static const struct fsnotify_ops nfsd_dir_fsnotify_ops = {
+	.handle_event = nfsd_dir_fsnotify_handle_event,
+	.free_mark = nfsd_file_mark_free,
+};
+
 int
 nfsd_file_cache_init(void)
 {
@@ -887,8 +914,7 @@ nfsd_file_cache_init(void)
 		goto out_shrinker;
 	}
 
-	nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops,
-							0);
+	nfsd_file_fsnotify_group = fsnotify_alloc_group(&nfsd_file_fsnotify_ops, 0);
 	if (IS_ERR(nfsd_file_fsnotify_group)) {
 		pr_err("nfsd: unable to create fsnotify group: %ld\n",
 			PTR_ERR(nfsd_file_fsnotify_group));
@@ -897,11 +923,23 @@ nfsd_file_cache_init(void)
 		goto out_notifier;
 	}
 
+	nfsd_dir_fsnotify_group = fsnotify_alloc_group(&nfsd_dir_fsnotify_ops, 0);
+	if (IS_ERR(nfsd_dir_fsnotify_group)) {
+		pr_err("nfsd: unable to create fsnotify group: %ld\n",
+			PTR_ERR(nfsd_dir_fsnotify_group));
+		ret = PTR_ERR(nfsd_dir_fsnotify_group);
+		nfsd_dir_fsnotify_group = NULL;
+		goto out_notify_group;
+	}
+
 	INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker);
 out:
 	if (ret)
 		clear_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags);
 	return ret;
+out_notify_group:
+	fsnotify_put_group(nfsd_file_fsnotify_group);
+	nfsd_file_fsnotify_group = NULL;
 out_notifier:
 	lease_unregister_notifier(&nfsd_file_lease_notifier);
 out_shrinker:
@@ -1019,6 +1057,8 @@ nfsd_file_cache_shutdown(void)
 	rcu_barrier();
 	fsnotify_put_group(nfsd_file_fsnotify_group);
 	nfsd_file_fsnotify_group = NULL;
+	fsnotify_put_group(nfsd_dir_fsnotify_group);
+	nfsd_dir_fsnotify_group = NULL;
 	kmem_cache_destroy(nfsd_file_slab);
 	nfsd_file_slab = NULL;
 	fsnotify_wait_marks_destroyed();
@@ -1223,10 +1263,8 @@ nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net,
 open_file:
 	trace_nfsd_file_alloc(nf);
 
-	if (type == S_IFREG)
-		nf->nf_mark = nfsd_file_mark_find_or_create(inode);
-
-	if (type != S_IFREG || nf->nf_mark) {
+	nf->nf_mark = nfsd_file_mark_find_or_create(inode);
+	if (nf->nf_mark) {
 		if (file && (file->f_mode & FMODE_OPENED)) {
 			get_file(file);
 			nf->nf_file = file;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index ca4dd2f969eb..59378751d596 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -892,11 +892,15 @@ static void nfs4_xdr_enc_cb_notify(struct rpc_rqst *req,
 				   const void *data)
 {
 	const struct nfsd4_callback *cb = data;
+	struct nfsd4_cb_notify *ncn = container_of(cb, struct nfsd4_cb_notify, ncn_cb);
+	struct nfs4_delegation *dp = container_of(ncn, struct nfs4_delegation, dl_cb_notify);
 	struct nfs4_cb_compound_hdr hdr = {
 		.ident = 0,
 		.minorversion = cb->cb_clp->cl_minorversion,
 	};
-	struct CB_NOTIFY4args args = { };
+	struct CB_NOTIFY4args args;
+	unsigned int start;
+	__be32 *p;
 
 	WARN_ON_ONCE(hdr.minorversion == 0);
 
@@ -904,13 +908,45 @@ static void nfs4_xdr_enc_cb_notify(struct rpc_rqst *req,
 	encode_cb_sequence4args(xdr, cb, &hdr);
 
 	/*
-	 * FIXME: get stateid and fh from delegation. Inline the cna_changes
-	 * buffer, and zero it.
+	 * nfsd4_cb_notify_prepare() sized the payload against a single page,
+	 * but did not account for the compound, sequence, stateid, and
+	 * filehandle encoded here. If the variable-length encode overflows the
+	 * backchannel send buffer, roll back to before the operation so that a
+	 * truncated CB_NOTIFY is never placed on the wire.
 	 */
-	xdrgen_encode_CB_NOTIFY4args(xdr, &args);
+	start = xdr_stream_pos(xdr);
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		goto out_err;
+	*p = cpu_to_be32(OP_CB_NOTIFY);
+
+	args.cna_stateid.seqid = dp->dl_stid.sc_stateid.si_generation;
+	memcpy(&args.cna_stateid.other, &dp->dl_stid.sc_stateid.si_opaque,
+	       ARRAY_SIZE(args.cna_stateid.other));
+	args.cna_fh.len = dp->dl_stid.sc_file->fi_fhandle.fh_size;
+	args.cna_fh.data = dp->dl_stid.sc_file->fi_fhandle.fh_raw;
+	args.cna_changes.count = ncn->ncn_nf_cnt;
+	args.cna_changes.element = ncn->ncn_nf;
+	if (!xdrgen_encode_CB_NOTIFY4args(xdr, &args))
+		goto out_err;
 
 	hdr.nops++;
 	encode_cb_nops(&hdr);
+	return;
+
+out_err:
+	/*
+	 * Drop the CB_NOTIFY op and emit a valid CB_SEQUENCE-only compound so
+	 * the client still advances its slot. Flag the failure so the done
+	 * handler recalls the delegation and the missed notification is not
+	 * silently lost. The flag is written here in the transmit path and read
+	 * in the done handler; the two are serialized phases of the same
+	 * rpc_task, so no additional barrier is needed.
+	 */
+	ncn->ncn_encode_err = true;
+	xdr_truncate_encode(xdr, start);
+	encode_cb_nops(&hdr);
 }
 
 static int nfs4_xdr_dec_cb_notify(struct rpc_rqst *rqstp,
@@ -1408,6 +1444,16 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
 	else
 		clear_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags);
 
+	/*
+	 * Order the clear of NFSD4_CALLBACK_RUNNING above before the ->release()
+	 * callback below. A release op may re-check producer-side state to decide
+	 * whether to requeue itself (see nfsd4_cb_notify_release()), and that
+	 * check must not be reordered ahead of the clear. The plain clear_bit()
+	 * path carries no ordering; clear_and_wake_up_bit() already issues this
+	 * barrier internally, so the extra one is harmless there.
+	 */
+	smp_mb__after_atomic();
+
 	if (cb->cb_ops && cb->cb_ops->release)
 		cb->cb_ops->release(cb);
 	nfsd41_cb_inflight_end(clp);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 0a15d7f3b543..513cbc1a583f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -55,6 +55,7 @@
 #include "netns.h"
 #include "pnfs.h"
 #include "filecache.h"
+#include "nfs4xdr_gen.h"
 #include "trace.h"
 
 #define NFSDDBG_FACILITY                NFSDDBG_PROC
@@ -3471,19 +3472,146 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
 	nfs4_put_stid(&dp->dl_stid);
 }
 
+static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
+{
+	bool queued;
+
+	if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags))
+		return;
+
+	/*
+	 * We're assuming the state code never drops its reference
+	 * without first removing the lease.  Since we're in this lease
+	 * callback (and since the lease code is serialized by the
+	 * flc_lock) we know the server hasn't removed the lease yet, and
+	 * we know it's safe to take a reference.
+	 */
+	refcount_inc(&dp->dl_stid.sc_count);
+	queued = nfsd4_run_cb(&dp->dl_recall);
+	WARN_ON_ONCE(!queued);
+	if (!queued) {
+		refcount_dec(&dp->dl_stid.sc_count);
+		clear_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags);
+	}
+}
+
+static bool
+nfsd4_cb_notify_prepare(struct nfsd4_callback *cb)
+{
+	struct nfsd4_cb_notify *ncn = container_of(cb, struct nfsd4_cb_notify, ncn_cb);
+	struct nfs4_delegation *dp = container_of(ncn, struct nfs4_delegation, dl_cb_notify);
+	struct nfsd_notify_event *events[NOTIFY4_EVENT_QUEUE_SIZE];
+	struct xdr_buf xdr = { .buflen = PAGE_SIZE * NOTIFY4_PAGE_ARRAY_SIZE,
+			       .pages  = ncn->ncn_pages };
+	struct xdr_stream stream;
+	struct nfsd_file *nf;
+	int count, i;
+	bool error = false;
+
+	xdr_init_encode_pages(&stream, &xdr);
+
+	spin_lock(&ncn->ncn_lock);
+	count = ncn->ncn_evt_cnt;
+
+	/* spurious queueing? */
+	if (count == 0) {
+		spin_unlock(&ncn->ncn_lock);
+		return false;
+	}
+
+	/* we can't keep up! */
+	if (count > NOTIFY4_EVENT_QUEUE_SIZE) {
+		spin_unlock(&ncn->ncn_lock);
+		goto out_recall;
+	}
+
+	memcpy(events, ncn->ncn_evt, sizeof(*events) * count);
+	ncn->ncn_evt_cnt = 0;
+	spin_unlock(&ncn->ncn_lock);
+
+	rcu_read_lock();
+	nf = nfsd_file_get(rcu_dereference(dp->dl_stid.sc_file->fi_deleg_file));
+	rcu_read_unlock();
+	if (!nf) {
+		for (i = 0; i < count; ++i)
+			nfsd_notify_event_put(events[i]);
+		goto out_recall;
+	}
+
+	for (i = 0; i < count; ++i) {
+		struct nfsd_notify_event *nne = events[i];
+
+		if (!error) {
+			u32 *maskp = (u32 *)xdr_reserve_space(&stream, sizeof(*maskp));
+			u8 *p;
+
+			if (!maskp) {
+				error = true;
+				goto put_event;
+			}
+
+			p = nfsd4_encode_notify_event(&stream, nne, dp, nf, maskp);
+			if (!p) {
+				pr_notice("Could not generate CB_NOTIFY from fsnotify mask 0x%x\n",
+					  nne->ne_mask);
+				error = true;
+				goto put_event;
+			}
+
+			ncn->ncn_nf[i].notify_mask.count = 1;
+			ncn->ncn_nf[i].notify_mask.element = maskp;
+			ncn->ncn_nf[i].notify_vals.data = p;
+			ncn->ncn_nf[i].notify_vals.len = (u8 *)stream.p - p;
+		}
+put_event:
+		nfsd_notify_event_put(nne);
+	}
+	if (!error) {
+		ncn->ncn_nf_cnt = count;
+		nfsd_file_put(nf);
+		return true;
+	}
+	nfsd_file_put(nf);
+out_recall:
+	nfsd_break_one_deleg(dp);
+	return false;
+}
+
 static int
 nfsd4_cb_notify_done(struct nfsd4_callback *cb,
 				struct rpc_task *task)
 {
+	struct nfsd4_cb_notify *ncn = container_of(cb, struct nfsd4_cb_notify, ncn_cb);
+	struct nfs4_delegation *dp = container_of(ncn, struct nfs4_delegation, dl_cb_notify);
+
+	if (dp->dl_stid.sc_status)
+		return 1;
+
+	/*
+	 * The CB_NOTIFY op overflowed the send buffer and was dropped from the
+	 * compound. The notification is lost, so recall the delegation rather
+	 * than leaving the client unaware of the directory change.
+	 */
+	if (ncn->ncn_encode_err) {
+		nfsd_break_one_deleg(dp);
+		return 1;
+	}
+
 	switch (task->tk_status) {
 	case -NFS4ERR_DELAY:
 		rpc_delay(task, 2 * HZ);
 		return 0;
 	default:
+		/* For any other hard error, recall the deleg */
+		nfsd_break_one_deleg(dp);
+		fallthrough;
+	case 0:
 		return 1;
 	}
 }
 
+static void nfsd4_run_cb_notify(struct nfsd4_cb_notify *ncn);
+
 static void
 nfsd4_cb_notify_release(struct nfsd4_callback *cb)
 {
@@ -3492,6 +3620,9 @@ nfsd4_cb_notify_release(struct nfsd4_callback *cb)
 	struct nfs4_delegation *dp =
 			container_of(ncn, struct nfs4_delegation, dl_cb_notify);
 
+	/* Drain events that arrived while this callback was in flight */
+	if (READ_ONCE(ncn->ncn_evt_cnt) > 0)
+		nfsd4_run_cb_notify(ncn);
 	nfs4_put_stid(&dp->dl_stid);
 }
 
@@ -3508,6 +3639,7 @@ static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = {
 };
 
 static const struct nfsd4_callback_ops nfsd4_cb_notify_ops = {
+	.prepare	= nfsd4_cb_notify_prepare,
 	.done		= nfsd4_cb_notify_done,
 	.release	= nfsd4_cb_notify_release,
 	.opcode		= OP_CB_NOTIFY,
@@ -5767,29 +5899,6 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
 	.opcode		= OP_CB_RECALL,
 };
 
-static void nfsd_break_one_deleg(struct nfs4_delegation *dp)
-{
-	bool queued;
-
-	if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags))
-		return;
-
-	/*
-	 * We're assuming the state code never drops its reference
-	 * without first removing the lease.  Since we're in this lease
-	 * callback (and since the lease code is serialized by the
-	 * flc_lock) we know the server hasn't removed the lease yet, and
-	 * we know it's safe to take a reference.
-	 */
-	refcount_inc(&dp->dl_stid.sc_count);
-	queued = nfsd4_run_cb(&dp->dl_recall);
-	WARN_ON_ONCE(!queued);
-	if (!queued) {
-		refcount_dec(&dp->dl_stid.sc_count);
-		clear_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags);
-	}
-}
-
 /* Called from break_lease() with flc_lock held. */
 static bool
 nfsd_break_deleg_cb(struct file_lease *fl)
@@ -9969,3 +10078,170 @@ void nfsd_update_cmtime_attr(struct file *f, unsigned int flags)
 				      MINOR(inode->i_sb->s_dev),
 				      inode->i_ino, ret);
 }
+
+static void
+nfsd4_run_cb_notify(struct nfsd4_cb_notify *ncn)
+{
+	struct nfs4_delegation *dp = container_of(ncn, struct nfs4_delegation, dl_cb_notify);
+
+	if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ncn->ncn_cb.cb_flags))
+		return;
+
+	if (!refcount_inc_not_zero(&dp->dl_stid.sc_count))
+		clear_bit(NFSD4_CALLBACK_RUNNING, &ncn->ncn_cb.cb_flags);
+	else
+		nfsd4_run_cb(&ncn->ncn_cb);
+}
+
+static struct nfsd_notify_event *
+alloc_nfsd_notify_event(u32 mask, const struct qstr *q, struct dentry *dentry,
+			struct inode *target)
+{
+	struct nfsd_notify_event *ne;
+	struct name_snapshot newname;
+	u32 newnamelen = 0;
+
+	/*
+	 * For a rename, @q is the old name and the live dentry carries the new
+	 * name. Snapshot the new name now, while it is guaranteed to describe
+	 * this event: the dentry can be renamed again before the CB_NOTIFY work
+	 * runs, which would corrupt a late read in nfsd4_encode_notify_event().
+	 */
+	if (mask & FS_RENAME) {
+		take_dentry_name_snapshot(&newname, dentry);
+		newnamelen = newname.name.len;
+	}
+
+	ne = kmalloc(struct_size(ne, ne_name, q->len + 1 +
+				 (newnamelen ? newnamelen + 1 : 0)), GFP_NOFS);
+	if (!ne)
+		goto out;
+
+	memcpy(ne->ne_name, q->name, q->len);
+	ne->ne_name[q->len] = '\0';
+	ne->ne_namelen = q->len;
+
+	ne->ne_newnamelen = newnamelen;
+	if (newnamelen) {
+		char *p = nfsd_notify_event_newname(ne);
+
+		memcpy(p, newname.name.name, newnamelen);
+		p[newnamelen] = '\0';
+	}
+
+	refcount_set(&ne->ne_ref, 1);
+	ne->ne_mask = mask;
+	ne->ne_dentry = dget(dentry);
+	ne->ne_target = target;
+	if (ne->ne_target)
+		ihold(ne->ne_target);
+out:
+	if (mask & FS_RENAME)
+		release_dentry_name_snapshot(&newname);
+	return ne;
+}
+
+static bool
+should_notify_deleg(u32 mask, struct file_lease *fl)
+{
+	/* Don't notify the client generating the event */
+	if (nfsd_breaker_owns_lease(fl))
+		return false;
+
+	/* Skip unless the lease is flagged to ignore this event type */
+	if ((mask & FS_DELETE) && !(fl->c.flc_flags & FL_IGN_DIR_DELETE))
+		return false;
+	if ((mask & FS_CREATE) && !(fl->c.flc_flags & FL_IGN_DIR_CREATE))
+		return false;
+	if ((mask & FS_RENAME) && !(fl->c.flc_flags & FL_IGN_DIR_RENAME))
+		return false;
+
+	return true;
+}
+
+static void
+nfsd_recall_all_dir_delegs(const struct inode *dir)
+{
+	struct file_lock_context *ctx = locks_inode_context(dir);
+	struct file_lock_core *flc;
+
+	spin_lock(&ctx->flc_lock);
+	list_for_each_entry(flc, &ctx->flc_lease, flc_list) {
+		struct file_lease *fl = container_of(flc, struct file_lease, c);
+
+		if (fl->fl_lmops == &nfsd_lease_mng_ops)
+			nfsd_break_deleg_cb(fl);
+	}
+	spin_unlock(&ctx->flc_lock);
+}
+
+int
+nfsd_handle_dir_event(u32 mask, const struct inode *dir, const void *data,
+		      int data_type, const struct qstr *name)
+{
+	struct dentry *dentry = fsnotify_data_dentry(data, data_type);
+	struct inode *target = fsnotify_data_rename_target(data, data_type);
+	struct file_lock_context *ctx;
+	struct file_lock_core *flc;
+	struct nfsd_notify_event *evt;
+
+	trace_nfsd_handle_dir_event(mask, dir, name);
+
+	/* Normalize cross-dir rename events to create/delete */
+	if (mask & FS_MOVED_FROM) {
+		mask &= ~FS_MOVED_FROM;
+		mask |= FS_DELETE;
+	}
+	if (mask & FS_MOVED_TO) {
+		mask &= ~FS_MOVED_TO;
+		mask |= FS_CREATE;
+	}
+
+	/*
+	 * FS_RENAME fires on the source directory even for a cross-dir
+	 * rename, where the moved entry now lives under a different parent.
+	 * NOTIFY4_RENAME_ENTRY describes an in-place rename, so reporting it
+	 * here would advertise a name absent from this directory.
+	 */
+	if ((mask & FS_RENAME) && dentry && d_inode(dentry->d_parent) != dir)
+		mask &= ~FS_RENAME;
+
+	/* Don't do anything if this is not an expected event */
+	if (!(mask & (FS_CREATE|FS_DELETE|FS_RENAME)))
+		return 0;
+
+	ctx = locks_inode_context(dir);
+	if (!ctx || list_empty(&ctx->flc_lease))
+		return 0;
+
+	evt = alloc_nfsd_notify_event(mask, name, dentry, target);
+	if (!evt) {
+		nfsd_recall_all_dir_delegs(dir);
+		return 0;
+	}
+
+	spin_lock(&ctx->flc_lock);
+	list_for_each_entry(flc, &ctx->flc_lease, flc_list) {
+		struct file_lease *fl = container_of(flc, struct file_lease, c);
+		struct nfs4_delegation *dp = flc->flc_owner;
+		struct nfsd4_cb_notify *ncn = &dp->dl_cb_notify;
+
+		if (!should_notify_deleg(mask, fl))
+			continue;
+
+		spin_lock(&ncn->ncn_lock);
+		if (ncn->ncn_evt_cnt >= NOTIFY4_EVENT_QUEUE_SIZE) {
+			/* We're generating notifications too fast. Recall. */
+			spin_unlock(&ncn->ncn_lock);
+			nfsd_break_deleg_cb(fl);
+			continue;
+		}
+		ncn->ncn_evt[ncn->ncn_evt_cnt++] = nfsd_notify_event_get(evt);
+		spin_unlock(&ncn->ncn_lock);
+
+		nfsd4_run_cb_notify(ncn);
+	}
+	spin_unlock(&ctx->flc_lock);
+	nfsd_notify_event_put(evt);
+	return 0;
+}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index b9037d99b564..c6f92ddeb449 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -4185,6 +4185,123 @@ nfsd4_encode_fattr4(struct svc_rqst *rqstp, struct xdr_stream *xdr,
 	goto out;
 }
 
+static bool
+nfsd4_setup_notify_entry4(struct notify_entry4 *ne, struct xdr_stream *xdr,
+			  struct dentry *dentry, struct nfs4_delegation *dp,
+			  struct nfsd_file *nf, char *name, u32 namelen)
+{
+	uint32_t *attrmask;
+
+	/* Reserve space for attrmask */
+	attrmask = xdr_reserve_space(xdr, 3 * sizeof(uint32_t));
+	if (!attrmask)
+		return false;
+
+	ne->ne_file.data = name;
+	ne->ne_file.len = namelen;
+	ne->ne_attrs.attrmask.element = attrmask;
+
+	attrmask[0] = 0;
+	attrmask[1] = 0;
+	attrmask[2] = 0;
+	ne->ne_attrs.attr_vals.data = NULL;
+	ne->ne_attrs.attr_vals.len = 0;
+	ne->ne_attrs.attrmask.count = 1;
+	return true;
+}
+
+/**
+ * nfsd4_encode_notify_event - encode an nfsd_notify_event
+ * @xdr: stream to which to encode the event
+ * @nne: nfsd_notify_event to encode
+ * @dp: delegation where the event occurred
+ * @nf: nfsd_file on which event occurred
+ * @notify_mask: pointer to word where notification mask should be set
+ *
+ * Encode @nne into @xdr. The matching bit in @notify_mask is set on
+ * success.
+ *
+ * Return: pointer to the start of the encoded event, or NULL if the
+ * event could not be encoded.
+ */
+u8 *nfsd4_encode_notify_event(struct xdr_stream *xdr, struct nfsd_notify_event *nne,
+			      struct nfs4_delegation *dp, struct nfsd_file *nf,
+			      u32 *notify_mask)
+{
+	u8 *p = NULL;
+
+	*notify_mask = 0;
+
+	if (nne->ne_mask & FS_DELETE) {
+		struct notify_remove4 nr = { };
+
+		if (!nfsd4_setup_notify_entry4(&nr.nrm_old_entry, xdr, nne->ne_dentry, dp,
+					       nf, nne->ne_name, nne->ne_namelen))
+			goto out_err;
+		p = (u8 *)xdr->p;
+		if (!xdrgen_encode_notify_remove4(xdr, &nr))
+			goto out_err;
+		*notify_mask |= BIT(NOTIFY4_REMOVE_ENTRY);
+	} else if (nne->ne_mask & FS_CREATE) {
+		struct notify_add4 na = { };
+		struct notify_remove4 old = { };
+
+		if (!nfsd4_setup_notify_entry4(&na.nad_new_entry, xdr, nne->ne_dentry, dp,
+					       nf, nne->ne_name, nne->ne_namelen))
+			goto out_err;
+
+		/* If a file was overwritten, report it in nad_old_entry */
+		if (nne->ne_target) {
+			if (!nfsd4_setup_notify_entry4(&old.nrm_old_entry, xdr,
+						       NULL, dp, nf,
+						       nne->ne_name, nne->ne_namelen))
+				goto out_err;
+			na.nad_old_entry.count = 1;
+			na.nad_old_entry.element = &old;
+		}
+
+		p = (u8 *)xdr->p;
+		if (!xdrgen_encode_notify_add4(xdr, &na))
+			goto out_err;
+
+		*notify_mask |= BIT(NOTIFY4_ADD_ENTRY);
+	} else if (nne->ne_mask & FS_RENAME) {
+		struct notify_rename4 nr = { };
+		struct notify_remove4 old = { };
+		char *newname = nfsd_notify_event_newname(nne);
+
+		/* Don't send any attributes in the old_entry since they're the same in new */
+		if (!nfsd4_setup_notify_entry4(&nr.nrn_old_entry.nrm_old_entry, xdr,
+					       NULL, dp, nf, nne->ne_name,
+					       nne->ne_namelen))
+			goto out_err;
+
+		if (!nfsd4_setup_notify_entry4(&nr.nrn_new_entry.nad_new_entry, xdr,
+					       nne->ne_dentry, dp, nf, newname,
+					       nne->ne_newnamelen))
+			goto out_err;
+
+		/* If a file was overwritten, report it in nad_old_entry */
+		if (nne->ne_target) {
+			if (!nfsd4_setup_notify_entry4(&old.nrm_old_entry, xdr,
+						       NULL, dp, nf, newname,
+						       nne->ne_newnamelen))
+				goto out_err;
+			nr.nrn_new_entry.nad_old_entry.count = 1;
+			nr.nrn_new_entry.nad_old_entry.element = &old;
+		}
+
+		p = (u8 *)xdr->p;
+		if (!xdrgen_encode_notify_rename4(xdr, &nr))
+			goto out_err;
+		*notify_mask |= BIT(NOTIFY4_RENAME_ENTRY);
+	}
+	return p;
+out_err:
+	pr_warn("nfsd: unable to marshal notify event to xdr stream\n");
+	return NULL;
+}
+
 static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr,
 				struct xdr_buf *buf, __be32 *p, int bytes)
 {
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index ac9dd798ea22..f8457e0f2b57 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -201,10 +201,23 @@ struct nfsd_notify_event {
 	refcount_t	ne_ref;		// refcount
 	u32		ne_mask;	// FS_* mask from fsnotify callback
 	struct dentry	*ne_dentry;	// dentry reference to target
-	u32		ne_namelen;	// length of ne_name
-	char		ne_name[];	// name of dentry being changed
+	struct inode	*ne_target;	// inode overwritten by rename, or NULL
+	u32		ne_namelen;	// length of ne_name (old name for a rename)
+	u32		ne_newnamelen;	// length of new name (rename only), else 0
+	char		ne_name[];	// entry name, then new name (rename only)
 };
 
+/*
+ * For a rename, the new name is snapshotted at event-alloc time and stored
+ * immediately after the (NUL-terminated) old name in ne_name[]. ne_dentry can
+ * be renamed again before the CB_NOTIFY work runs, so the new name must not be
+ * read from the live dentry at encode time.
+ */
+static inline char *nfsd_notify_event_newname(struct nfsd_notify_event *ne)
+{
+	return ne->ne_name + ne->ne_namelen + 1;
+}
+
 static inline struct nfsd_notify_event *nfsd_notify_event_get(struct nfsd_notify_event *ne)
 {
 	refcount_inc(&ne->ne_ref);
@@ -214,6 +227,7 @@ static inline struct nfsd_notify_event *nfsd_notify_event_get(struct nfsd_notify
 static inline void nfsd_notify_event_put(struct nfsd_notify_event *ne)
 {
 	if (refcount_dec_and_test(&ne->ne_ref)) {
+		iput(ne->ne_target);
 		dput(ne->ne_dentry);
 		kfree(ne);
 	}
@@ -901,6 +915,8 @@ void nfsd_update_cmtime_attr(struct file *f, unsigned int flags);
 extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(struct xdr_netobj name,
 				struct xdr_netobj princhash, struct nfsd_net *nn);
 extern bool nfs4_has_reclaimed_state(struct xdr_netobj name, struct nfsd_net *nn);
+int nfsd_handle_dir_event(u32 mask, const struct inode *dir, const void *data,
+			  int data_type, const struct qstr *name);
 
 void put_nfs4_file(struct nfs4_file *fi);
 extern void nfs4_put_cpntf_state(struct nfsd_net *nn,
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 171e8fdbafb6..db0a0dc70660 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -12,6 +12,7 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/xprt.h>
 #include <trace/misc/fs.h>
+#include <trace/misc/fsnotify.h>
 #include <trace/misc/nfs.h>
 #include <trace/misc/sunrpc.h>
 
@@ -1377,6 +1378,28 @@ TRACE_EVENT(nfsd_file_fsnotify_handle_event,
 			__entry->nlink, __entry->mode, __entry->mask)
 );
 
+TRACE_EVENT(nfsd_handle_dir_event,
+	TP_PROTO(u32 mask, const struct inode *dir, const struct qstr *name),
+	TP_ARGS(mask, dir, name),
+	TP_STRUCT__entry(
+		__field(u32, mask)
+		__field(dev_t, s_dev)
+		__field(u64, i_ino)
+		__string_len(name, name ? name->name : NULL,
+				   name ? name->len : 0)
+	),
+	TP_fast_assign(
+		__entry->mask = mask;
+		__entry->s_dev = dir ? dir->i_sb->s_dev : 0;
+		__entry->i_ino = dir ? dir->i_ino : 0;
+		__assign_str(name);
+	),
+	TP_printk("inode=0x%x:0x%x:0x%llx mask=%s name=%s",
+			MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
+			__entry->i_ino, show_fsnotify_mask(__entry->mask),
+			__get_str(name))
+);
+
 DECLARE_EVENT_CLASS(nfsd_file_gc_class,
 	TP_PROTO(
 		const struct nfsd_file *nf
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 85574b2a139a..62ac790428be 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -970,6 +970,9 @@ __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
 		struct svc_fh *fhp, struct svc_export *exp,
 		struct dentry *dentry,
 		u32 *bmval, struct svc_rqst *, int ignore_crossmnt);
+u8 *nfsd4_encode_notify_event(struct xdr_stream *xdr, struct nfsd_notify_event *nne,
+			      struct nfs4_delegation *dd, struct nfsd_file *nf,
+			      u32 *notify_mask);
 extern __be32 nfsd4_setclientid(struct svc_rqst *rqstp,
 		struct nfsd4_compound_state *, union nfsd4_op_u *u);
 extern __be32 nfsd4_setclientid_confirm(struct svc_rqst *rqstp,

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 09/20] nfsd: add data structures for handling CB_NOTIFY
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

Add the data structures, allocation helpers, and callback operations
needed for directory delegation CB_NOTIFY support:

- struct nfsd_notify_event: carries fsnotify events for CB_NOTIFY
- struct nfsd4_cb_notify: per-delegation state for notification handling
- Union dl_cb_fattr with dl_cb_notify in nfs4_delegation since a
  delegation is either a regular file delegation or a directory
  delegation, never both

Refactor alloc_init_deleg() into a common __alloc_init_deleg() base
with a pluggable sc_free callback, and add alloc_init_dir_deleg() which
allocates the page array and notify4 buffer needed for CB_NOTIFY
encoding.

Add skeleton nfsd4_cb_notify_ops with done/release handlers that will
be filled in when the notification path is wired up.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4state.c | 121 ++++++++++++++++++++++++++++++++++++++++++++++------
 fs/nfsd/state.h     |  47 +++++++++++++++++++-
 2 files changed, 152 insertions(+), 16 deletions(-)

diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 18e81c7f9d19..0a15d7f3b543 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -126,6 +126,7 @@ static void free_session(struct nfsd4_session *);
 static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
 static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
 static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_notify_ops;
 
 static struct workqueue_struct *laundry_wq;
 
@@ -1123,29 +1124,31 @@ static void block_delegations(struct knfsd_fh *fh)
 }
 
 static struct nfs4_delegation *
-alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
-		 struct nfs4_clnt_odstate *odstate, u32 dl_type)
+__alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
+		   struct nfs4_clnt_odstate *odstate, u32 dl_type,
+		   void (*sc_free)(struct nfs4_stid *))
 {
 	struct nfs4_delegation *dp;
 	struct nfs4_stid *stid;
 	long n;
 
-	dprintk("NFSD alloc_init_deleg\n");
+	if (delegation_blocked(&fp->fi_fhandle))
+		return NULL;
+
 	n = atomic_long_inc_return(&num_delegations);
 	if (n < 0 || n > max_delegations)
 		goto out_dec;
-	if (delegation_blocked(&fp->fi_fhandle))
-		goto out_dec;
-	stid = nfs4_alloc_stid(clp, deleg_slab, nfs4_free_deleg);
+
+	stid = nfs4_alloc_stid(clp, deleg_slab, sc_free);
 	if (stid == NULL)
 		goto out_dec;
-	dp = delegstateid(stid);
 
 	/*
 	 * delegation seqid's are never incremented.  The 4.1 special
 	 * meaning of seqid 0 isn't meaningful, really, but let's avoid
-	 * 0 anyway just for consistency and use 1:
+	 * 0 anyway just for consistency and use 1.
 	 */
+	dp = delegstateid(stid);
 	dp->dl_stid.sc_stateid.si_generation = 1;
 	INIT_LIST_HEAD(&dp->dl_perfile);
 	INIT_LIST_HEAD(&dp->dl_perclnt);
@@ -1155,19 +1158,79 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
 	dp->dl_type = dl_type;
 	dp->dl_retries = 1;
 	dp->dl_recalled = false;
-	nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
-		      &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
-	nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client,
-			&nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR);
-	dp->dl_cb_fattr.ncf_file_modified = false;
 	get_nfs4_file(fp);
 	dp->dl_stid.sc_file = fp;
+	nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client,
+		      &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL);
 	return dp;
 out_dec:
 	atomic_long_dec(&num_delegations);
 	return NULL;
 }
 
+static struct nfs4_delegation *
+alloc_init_deleg(struct nfs4_client *clp, struct nfs4_file *fp,
+		 struct nfs4_clnt_odstate *odstate, u32 dl_type)
+{
+	struct nfs4_delegation *dp;
+
+	dp = __alloc_init_deleg(clp, fp, odstate, dl_type, nfs4_free_deleg);
+	if (!dp)
+		return NULL;
+
+	nfsd4_init_cb(&dp->dl_cb_fattr.ncf_getattr, dp->dl_stid.sc_client,
+			&nfsd4_cb_getattr_ops, NFSPROC4_CLNT_CB_GETATTR);
+	dp->dl_cb_fattr.ncf_file_modified = false;
+	return dp;
+}
+
+static void nfs4_free_dir_deleg(struct nfs4_stid *stid)
+{
+	struct nfs4_delegation	*dp = delegstateid(stid);
+	struct nfsd4_cb_notify *ncn = &dp->dl_cb_notify;
+	int i;
+
+	for (i = 0; i < ncn->ncn_evt_cnt; ++i)
+		nfsd_notify_event_put(ncn->ncn_evt[i]);
+	kfree(ncn->ncn_nf);
+	for (i = 0; i < NOTIFY4_PAGE_ARRAY_SIZE; i++) {
+		if (!ncn->ncn_pages[i])
+			break;
+		put_page(ncn->ncn_pages[i]);
+	}
+	nfs4_free_deleg(stid);
+}
+
+static struct nfs4_delegation *
+alloc_init_dir_deleg(struct nfs4_client *clp, struct nfs4_file *fp)
+{
+	struct nfs4_delegation *dp;
+	struct nfsd4_cb_notify *ncn;
+	int npages;
+
+	dp = __alloc_init_deleg(clp, fp, NULL, NFS4_OPEN_DELEGATE_READ, nfs4_free_dir_deleg);
+	if (!dp)
+		return NULL;
+
+	ncn = &dp->dl_cb_notify;
+
+	npages = alloc_pages_bulk(GFP_KERNEL, NOTIFY4_PAGE_ARRAY_SIZE, ncn->ncn_pages);
+	if (npages != NOTIFY4_PAGE_ARRAY_SIZE) {
+		nfs4_put_stid(&dp->dl_stid);
+		return NULL;
+	}
+
+	ncn->ncn_nf = kcalloc(NOTIFY4_EVENT_QUEUE_SIZE, sizeof(*ncn->ncn_nf), GFP_KERNEL);
+	if (!ncn->ncn_nf) {
+		nfs4_put_stid(&dp->dl_stid);
+		return NULL;
+	}
+	spin_lock_init(&ncn->ncn_lock);
+	nfsd4_init_cb(&ncn->ncn_cb, dp->dl_stid.sc_client,
+			&nfsd4_cb_notify_ops, NFSPROC4_CLNT_CB_NOTIFY);
+	return dp;
+}
+
 void
 nfs4_put_stid(struct nfs4_stid *s)
 {
@@ -3408,6 +3471,30 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb)
 	nfs4_put_stid(&dp->dl_stid);
 }
 
+static int
+nfsd4_cb_notify_done(struct nfsd4_callback *cb,
+				struct rpc_task *task)
+{
+	switch (task->tk_status) {
+	case -NFS4ERR_DELAY:
+		rpc_delay(task, 2 * HZ);
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+static void
+nfsd4_cb_notify_release(struct nfsd4_callback *cb)
+{
+	struct nfsd4_cb_notify *ncn =
+			container_of(cb, struct nfsd4_cb_notify, ncn_cb);
+	struct nfs4_delegation *dp =
+			container_of(ncn, struct nfs4_delegation, dl_cb_notify);
+
+	nfs4_put_stid(&dp->dl_stid);
+}
+
 static const struct nfsd4_callback_ops nfsd4_cb_recall_any_ops = {
 	.done		= nfsd4_cb_recall_any_done,
 	.release	= nfsd4_cb_recall_any_release,
@@ -3420,6 +3507,12 @@ static const struct nfsd4_callback_ops nfsd4_cb_getattr_ops = {
 	.opcode		= OP_CB_GETATTR,
 };
 
+static const struct nfsd4_callback_ops nfsd4_cb_notify_ops = {
+	.done		= nfsd4_cb_notify_done,
+	.release	= nfsd4_cb_notify_release,
+	.opcode		= OP_CB_NOTIFY,
+};
+
 static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf)
 {
 	struct nfs4_delegation *dp =
@@ -9788,7 +9881,7 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
 
 	/* Try to set up the lease */
 	status = -ENOMEM;
-	dp = alloc_init_deleg(clp, fp, NULL, NFS4_OPEN_DELEGATE_READ);
+	dp = alloc_init_dir_deleg(clp, fp);
 	if (!dp)
 		goto out_delegees;
 	if (cstate->current_fh.fh_export)
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4fca0537ca8b..ac9dd798ea22 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -197,6 +197,45 @@ struct nfs4_cb_fattr {
 #define NOTIFY4_EVENT_QUEUE_SIZE	3
 #define NOTIFY4_PAGE_ARRAY_SIZE		1
 
+struct nfsd_notify_event {
+	refcount_t	ne_ref;		// refcount
+	u32		ne_mask;	// FS_* mask from fsnotify callback
+	struct dentry	*ne_dentry;	// dentry reference to target
+	u32		ne_namelen;	// length of ne_name
+	char		ne_name[];	// name of dentry being changed
+};
+
+static inline struct nfsd_notify_event *nfsd_notify_event_get(struct nfsd_notify_event *ne)
+{
+	refcount_inc(&ne->ne_ref);
+	return ne;
+}
+
+static inline void nfsd_notify_event_put(struct nfsd_notify_event *ne)
+{
+	if (refcount_dec_and_test(&ne->ne_ref)) {
+		dput(ne->ne_dentry);
+		kfree(ne);
+	}
+}
+
+/*
+ * Represents a directory delegation. The callback is for handling CB_NOTIFYs.
+ * As notifications from fsnotify come in, allocate a new event, take the ncn_lock,
+ * and add it to the ncn_evt queue. The CB_NOTIFY prepare handler will take the
+ * lock, clean out the list and process it.
+ */
+struct nfsd4_cb_notify {
+	spinlock_t			ncn_lock;	// protects the evt queue and count
+	int				ncn_evt_cnt;	// count of events in ncn_evt
+	int				ncn_nf_cnt;	// count of valid entries in ncn_nf
+	struct nfsd_notify_event	*ncn_evt[NOTIFY4_EVENT_QUEUE_SIZE]; // list of events
+	struct page			*ncn_pages[NOTIFY4_PAGE_ARRAY_SIZE]; // for encoding
+	struct notify4			*ncn_nf;	// array of notify4's to be sent
+	bool				ncn_encode_err;	// did encoding fail?
+	struct nfsd4_callback		ncn_cb;		// notify4 callback
+};
+
 /*
  * Represents a delegation stateid. The nfs4_client holds references to these
  * and they are put when it is being destroyed or when the delegation is
@@ -233,8 +272,12 @@ struct nfs4_delegation {
 	bool			dl_written;
 	bool			dl_setattr;
 
-	/* for CB_GETATTR */
-	struct nfs4_cb_fattr    dl_cb_fattr;
+	union {
+		/* for CB_GETATTR */
+		struct nfs4_cb_fattr    dl_cb_fattr;
+		/* for CB_NOTIFY */
+		struct nfsd4_cb_notify	dl_cb_notify;
+	};
 
 	/* For delegated timestamps */
 	struct timespec64	dl_atime;

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 08/20] nfsd: use RCU to protect fi_deleg_file
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

fi_deleg_file can be NULLed by put_deleg_file() when fi_delegees drops
to zero during delegation teardown (e.g. DELEGRETURN). Concurrent
accesses from workqueue callbacks -- such as CB_NOTIFY -- can
dereference a NULL pointer if they race with this teardown.

Annotate fi_deleg_file with __rcu and convert all accessors to use
proper RCU primitives:

- rcu_assign_pointer() / RCU_INIT_POINTER() for stores
- rcu_dereference_protected() for reads under fi_lock or where
  fi_delegees > 0 guarantees stability

This prepares for a subsequent patch that will use rcu_read_lock +
rcu_dereference + nfsd_file_get to safely acquire a reference from
the CB_NOTIFY callback path without holding fi_lock.

The error-path lease teardown in nfsd_get_dir_deleg() is one of these
accessors, and it must drop the lease against fi_deleg_file->nf_file
rather than this client's nf->nf_file. The lease's flc_file is
fi_deleg_file (set in nfs4_alloc_init_lease()), which differs from nf
when an earlier client already holds a delegation on the same directory.
generic_delete_lease() matches on flc_file, so unlocking the wrong file
would fail to remove the lease, leaking it on the inode and then freeing
its owning stid underneath it -- a use-after-free once the leaked lease
is later broken. Read fi_deleg_file there with rcu_dereference_protected()
like the other accessors, and recalculate the fsnotify mask after
dropping the lease to match the success path.

Assisted-by: Claude:claude-opus-4-6
Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4layouts.c |  7 ++++---
 fs/nfsd/nfs4state.c   | 51 ++++++++++++++++++++++++++++++++++-----------------
 fs/nfsd/state.h       |  2 +-
 3 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 4c3f253c7d07..22bcb6d09f70 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -248,12 +248,13 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
 			NFSPROC4_CLNT_CB_LAYOUT);
 
 	if (parent->sc_type == SC_TYPE_DELEG) {
-		spin_lock(&fp->fi_lock);
-		ls->ls_file = nfsd_file_get(fp->fi_deleg_file);
-		spin_unlock(&fp->fi_lock);
+		rcu_read_lock();
+		ls->ls_file = nfsd_file_get(rcu_dereference(fp->fi_deleg_file));
+		rcu_read_unlock();
 	} else {
 		ls->ls_file = find_any_file(fp);
 	}
+
 	if (!ls->ls_file) {
 		nfs4_put_stid(stp);
 		return NULL;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 1ff954a18f93..18e81c7f9d19 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1212,7 +1212,9 @@ static void put_deleg_file(struct nfs4_file *fp)
 
 	spin_lock(&fp->fi_lock);
 	if (--fp->fi_delegees == 0) {
-		swap(nf, fp->fi_deleg_file);
+		nf = rcu_dereference_protected(fp->fi_deleg_file,
+					       lockdep_is_held(&fp->fi_lock));
+		RCU_INIT_POINTER(fp->fi_deleg_file, NULL);
 		swap(rnf, fp->fi_rdeleg_file);
 	}
 	spin_unlock(&fp->fi_lock);
@@ -1250,7 +1252,7 @@ static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct f
 static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp)
 {
 	struct nfs4_file *fp = dp->dl_stid.sc_file;
-	struct nfsd_file *nf = fp->fi_deleg_file;
+	struct nfsd_file *nf = rcu_dereference_protected(fp->fi_deleg_file, 1);
 
 	WARN_ON_ONCE(!fp->fi_delegees);
 
@@ -3186,7 +3188,8 @@ static int nfs4_show_deleg(struct seq_file *s, struct nfs4_stid *st)
 	/* XXX: lease time, whether it's being recalled. */
 
 	spin_lock(&nf->fi_lock);
-	file = nf->fi_deleg_file;
+	file = rcu_dereference_protected(nf->fi_deleg_file,
+					 lockdep_is_held(&nf->fi_lock));
 	if (file) {
 		seq_puts(s, ", ");
 		nfs4_show_superblock(s, file);
@@ -4995,7 +4998,7 @@ static void nfsd4_file_init(const struct svc_fh *fh, struct nfs4_file *fp)
 	INIT_LIST_HEAD(&fp->fi_delegations);
 	INIT_LIST_HEAD(&fp->fi_clnt_odstate);
 	fh_copy_shallow(&fp->fi_fhandle, &fh->fh_handle);
-	fp->fi_deleg_file = NULL;
+	RCU_INIT_POINTER(fp->fi_deleg_file, NULL);
 	fp->fi_rdeleg_file = NULL;
 	fp->fi_had_conflict = false;
 	fp->fi_share_deny = 0;
@@ -6149,7 +6152,7 @@ static struct file_lease *nfs4_alloc_init_lease(struct nfs4_delegation *dp, u32
 	fl->c.flc_type = deleg_is_read(dp->dl_type) ? F_RDLCK : F_WRLCK;
 	fl->c.flc_owner = (fl_owner_t)dp;
 	fl->c.flc_pid = current->tgid;
-	fl->c.flc_file = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
+	fl->c.flc_file = rcu_dereference_protected(dp->dl_stid.sc_file->fi_deleg_file, 1)->nf_file;
 	return fl;
 }
 
@@ -6157,7 +6160,7 @@ static int nfsd4_check_conflicting_opens(struct nfs4_client *clp,
 					 struct nfs4_file *fp)
 {
 	struct nfs4_ol_stateid *st;
-	struct file *f = fp->fi_deleg_file->nf_file;
+	struct file *f = rcu_dereference_protected(fp->fi_deleg_file, 1)->nf_file;
 	struct inode *ino = file_inode(f);
 	int writes;
 
@@ -6234,7 +6237,7 @@ nfsd4_verify_deleg_dentry(struct nfsd4_open *open, struct nfs4_file *fp,
 
 	exp_put(exp);
 	dput(child);
-	if (child != file_dentry(fp->fi_deleg_file->nf_file))
+	if (child != file_dentry(rcu_dereference_protected(fp->fi_deleg_file, 1)->nf_file))
 		return -EAGAIN;
 
 	return 0;
@@ -6340,8 +6343,9 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 		status = -EAGAIN;
 	else if (nfsd4_verify_setuid_write(open, nf))
 		status = -EAGAIN;
-	else if (!fp->fi_deleg_file) {
-		fp->fi_deleg_file = nf;
+	else if (!rcu_dereference_protected(fp->fi_deleg_file,
+					    lockdep_is_held(&fp->fi_lock))) {
+		rcu_assign_pointer(fp->fi_deleg_file, nf);
 		/* increment early to prevent fi_deleg_file from being
 		 * cleared */
 		fp->fi_delegees = 1;
@@ -6366,7 +6370,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 	if (!fl)
 		goto out_clnt_odstate;
 
-	status = kernel_setlease(fp->fi_deleg_file->nf_file,
+	status = kernel_setlease(rcu_dereference_protected(fp->fi_deleg_file, 1)->nf_file,
 				      fl->c.flc_type, &fl, NULL);
 	if (fl)
 		locks_free_lease(fl);
@@ -6387,7 +6391,7 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 	 * Now that the deleg is set, check again to ensure that nothing
 	 * raced in and changed the mode while we weren't looking.
 	 */
-	status = nfsd4_verify_setuid_write(open, fp->fi_deleg_file);
+	status = nfsd4_verify_setuid_write(open, rcu_dereference_protected(fp->fi_deleg_file, 1));
 	if (status)
 		goto out_unlock;
 
@@ -6408,7 +6412,8 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp,
 
 	return dp;
 out_unlock:
-	kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
+	kernel_setlease(rcu_dereference_protected(fp->fi_deleg_file, 1)->nf_file,
+			F_UNLCK, NULL, (void **)&dp);
 out_clnt_odstate:
 	put_clnt_odstate(dp->dl_clnt_odstate);
 	nfs4_put_stid(&dp->dl_stid);
@@ -6565,8 +6570,9 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open,
 	memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid));
 
 	if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) {
-		struct file *f = dp->dl_stid.sc_file->fi_deleg_file->nf_file;
+		struct file *f;
 
+		f = rcu_dereference_protected(dp->dl_stid.sc_file->fi_deleg_file, 1)->nf_file;
 		if (!nfsd4_add_rdaccess_to_wrdeleg(rqstp, open, fh, stp) ||
 				!nfs4_delegation_stat(dp, currentfh, &stat)) {
 			nfs4_put_stid(&dp->dl_stid);
@@ -9765,8 +9771,9 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
 	/* existing delegation? */
 	if (nfs4_delegation_exists(clp, fp)) {
 		status = -EAGAIN;
-	} else if (!fp->fi_deleg_file) {
-		fp->fi_deleg_file = nfsd_file_get(nf);
+	} else if (!rcu_dereference_protected(fp->fi_deleg_file,
+					      lockdep_is_held(&fp->fi_lock))) {
+		rcu_assign_pointer(fp->fi_deleg_file, nfsd_file_get(nf));
 		fp->fi_delegees = 1;
 	} else {
 		++fp->fi_delegees;
@@ -9822,8 +9829,18 @@ nfsd_get_dir_deleg(struct nfsd4_compound_state *cstate,
 		return dp;
 	}
 
-	/* Something failed. Drop the lease and clean up the stid */
-	kernel_setlease(fp->fi_deleg_file->nf_file, F_UNLCK, NULL, (void **)&dp);
+	/*
+	 * Something failed after the lease was set. Drop the lease and clean
+	 * up the stid. The lease's flc_file is the fi_deleg_file (see
+	 * nfs4_alloc_init_lease()), which is not necessarily this client's
+	 * @nf when an earlier client already holds a delegation on @fp.
+	 * generic_delete_lease() matches on flc_file, so unlock against
+	 * fi_deleg_file or the lease is leaked on the inode while its owning
+	 * stid is freed: a use-after-free once the leaked lease is broken.
+	 */
+	kernel_setlease(rcu_dereference_protected(fp->fi_deleg_file, 1)->nf_file,
+			F_UNLCK, NULL, (void **)&dp);
+	nfsd_fsnotify_recalc_mask(nf);
 out_put_stid:
 	nfs4_put_stid(&dp->dl_stid);
 out_delegees:
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 9f321e9ed76d..4fca0537ca8b 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -699,7 +699,7 @@ struct nfs4_file {
 	 */
 	atomic_t		fi_access[2];
 	u32			fi_share_deny;
-	struct nfsd_file	*fi_deleg_file;
+	struct nfsd_file __rcu	*fi_deleg_file;
 	struct nfsd_file	*fi_rdeleg_file;
 	int			fi_delegees;
 	struct knfsd_fh		fi_fhandle;

-- 
2.54.0


^ permalink raw reply related

* [PATCH v6 07/20] nfsd: add callback encoding and decoding linkages for CB_NOTIFY
From: Jeff Layton @ 2026-06-11 17:50 UTC (permalink / raw)
  To: NeilBrown, Olga Kornievskaia, Dai Ngo, Tom Talpey,
	Trond Myklebust, Anna Schumaker, Jonathan Corbet, Shuah Khan,
	Chuck Lever
  Cc: Steven Rostedt, Alexander Aring, Amir Goldstein, Jan Kara,
	Alexander Viro, Christian Brauner, Calum Mackay, linux-kernel,
	linux-doc, linux-nfs, Jeff Layton
In-Reply-To: <20260611-dir-deleg-v6-0-4c45080e5f3f@kernel.org>

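Add routines for encoding and decoding CB_NOTIFY messages. The encoder
calls into the code generated by xdrgen to encode the CB_NOTIFY
arguments; the decoder parses the compound header, the CB_SEQUENCE
result and the CB_NOTIFY operation status.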

For now, the encoder is a stub. Later patches will flesh out the payload
encoding.

Signed-off-by: Jeff Layton <jlayton@kernel.org>
---
 fs/nfsd/nfs4callback.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/nfsd/state.h        |  8 ++++++++
 fs/nfsd/xdr4cb.h       | 12 ++++++++++++
 3 files changed, 66 insertions(+)

diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index a3c46905fd47..ca4dd2f969eb 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -887,6 +887,51 @@ static void encode_stateowner(struct xdr_stream *xdr, struct nfs4_stateowner *so
 	xdr_encode_opaque(p, so->so_owner.data, so->so_owner.len);
 }
 
+static void nfs4_xdr_enc_cb_notify(struct rpc_rqst *req,
+				   struct xdr_stream *xdr,
+				   const void *data)
+{
+	const struct nfsd4_callback *cb = data;
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = 0,
+		.minorversion = cb->cb_clp->cl_minorversion,
+	};
+	struct CB_NOTIFY4args args = { };
+
+	WARN_ON_ONCE(hdr.minorversion == 0);
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+
+	/*
+	 * FIXME: get stateid and fh from delegation. Inline the cna_changes
+	 * buffer, and zero it.
+	 */
+	xdrgen_encode_CB_NOTIFY4args(xdr, &args);
+
+	hdr.nops++;
+	encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_notify(struct rpc_rqst *rqstp,
+				  struct xdr_stream *xdr,
+				  void *data)
+{
+	struct nfsd4_callback *cb = data;
+	struct nfs4_cb_compound_hdr hdr;
+	int status;
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		return status;
+
+	status = decode_cb_sequence4res(xdr, cb);
+	if (unlikely(status || cb->cb_seq_status))
+		return status;
+
+	return decode_cb_op_status(xdr, OP_CB_NOTIFY, &cb->cb_status);
+}
+
 static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req,
 					struct xdr_stream *xdr,
 					const void *data)
@@ -1048,6 +1093,7 @@ static const struct rpc_procinfo nfs4_cb_procedures[] = {
 #ifdef CONFIG_NFSD_PNFS
 	PROC(CB_LAYOUT,	COMPOUND,	cb_layout,	cb_layout),
 #endif
+	PROC(CB_NOTIFY,		COMPOUND,	cb_notify,	cb_notify),
 	PROC(CB_NOTIFY_LOCK,	COMPOUND,	cb_notify_lock,	cb_notify_lock),
 	PROC(CB_OFFLOAD,	COMPOUND,	cb_offload,	cb_offload),
 	PROC(CB_RECALL_ANY,	COMPOUND,	cb_recall_any,	cb_recall_any),
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 4c6765a4cf22..9f321e9ed76d 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -190,6 +190,13 @@ struct nfs4_cb_fattr {
 	u64 ncf_cur_fsize;
 };
 
+/*
+ * FIXME: the current backchannel encoder can't handle a send buffer longer
+ *        than a single page (see bc_malloc/bc_free).
+ */
+#define NOTIFY4_EVENT_QUEUE_SIZE	3
+#define NOTIFY4_PAGE_ARRAY_SIZE		1
+
 /*
  * Represents a delegation stateid. The nfs4_client holds references to these
  * and they are put when it is being destroyed or when the delegation is
@@ -776,6 +783,7 @@ enum nfsd4_cb_op {
 	NFSPROC4_CLNT_CB_NOTIFY_LOCK,
 	NFSPROC4_CLNT_CB_RECALL_ANY,
 	NFSPROC4_CLNT_CB_GETATTR,
+	NFSPROC4_CLNT_CB_NOTIFY,
 };
 
 /* Returns true iff a is later than b: */
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index f4e29c0c701c..b06d0170d7c4 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -33,6 +33,18 @@
 					cb_sequence_dec_sz +            \
 					op_dec_sz)
 
+#define NFS4_enc_cb_notify_sz		(cb_compound_enc_hdr_sz +       \
+					cb_sequence_enc_sz +            \
+					1 + enc_stateid_sz +            \
+					enc_nfs4_fh_sz +		\
+					1 +				\
+					NOTIFY4_EVENT_QUEUE_SIZE *	\
+					(2 + (NFS4_OPAQUE_LIMIT >> 2)))
+
+#define NFS4_dec_cb_notify_sz		(cb_compound_dec_hdr_sz  +      \
+					cb_sequence_dec_sz +            \
+					op_dec_sz)
+
 #define NFS4_enc_cb_notify_lock_sz	(cb_compound_enc_hdr_sz +        \
 					cb_sequence_enc_sz +             \
 					2 + 1 +				 \

-- 
2.54.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox