All of lore.kernel.org
 help / color / mirror / Atom feed
* Re: [paulmckrcu:puranjay.2026.06.24a 54/65] tree.c:undefined reference to `rcu_preempt_blocked_readers_cgp_ndqs'
From: Paul E. McKenney @ 2026-06-24 21:39 UTC (permalink / raw)
  To: kernel test robot; +Cc: oe-kbuild-all
In-Reply-To: <202606250524.NuoTnzfd-lkp@intel.com>

On Thu, Jun 25, 2026 at 05:26:28AM +0800, kernel test robot wrote:
> tree:   https://github.com/paulmckrcu/linux puranjay.2026.06.24a
> head:   9f9e05ef4454de1724d65dcc1d45018d0457bd7c
> commit: ab15f8d23a687736b674b3b8669a744dd709d97f [54/65] rcu: Make rcu_gp_cleanup() account for ->dqs_blkd_tasks
> config: arm64-allnoconfig (https://download.01.org/0day-ci/archive/20260625/202606250524.NuoTnzfd-lkp@intel.com/config)
> compiler: aarch64-linux-gcc (GCC) 16.1.0
> reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260625/202606250524.NuoTnzfd-lkp@intel.com/reproduce)
> 
> If you fix the issue in a separate patch/commit (i.e. not just a new version of
> the same patch/commit), kindly add following tags
> | Reported-by: kernel test robot <lkp@intel.com>
> | Closes: https://lore.kernel.org/oe-kbuild-all/202606250524.NuoTnzfd-lkp@intel.com/
> 
> All error/warnings (new ones prefixed by >>):
> 
>    In file included from kernel/rcu/tree.c:69:
> >> kernel/rcu/tree.h:498:12: warning: 'rcu_preempt_blocked_readers_cgp_ndqs' used but never defined
>      498 | static int rcu_preempt_blocked_readers_cgp_ndqs(struct rcu_node *rnp);
>          |            ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
> --
>    aarch64-linux-ld: Unexpected GOT/PLT entries detected!
>    aarch64-linux-ld: Unexpected run-time procedure linkages detected!
>    aarch64-linux-ld: kernel/rcu/tree.o: in function `rcu_gp_cleanup':
> >> tree.c:(.text+0x66c0): undefined reference to `rcu_preempt_blocked_readers_cgp_ndqs'

Puranjay is an innocent bystander on this one.  The fix is at:

7c6c1219be4d ("fixup! rcu: Make rcu_gp_cleanup() account for ->dqs_blkd_tasks")

							Thanx, Paul

^ permalink raw reply

* Re: [PATCH v2 3/3] rtc: ds1307: Add driver for Epson RX8901CE
From: Alexandre Belloni @ 2026-06-24 21:38 UTC (permalink / raw)
  To: Fredrik M Olsson
  Cc: Rob Herring, Krzysztof Kozlowski, Conor Dooley, Nobuhiro Iwamatsu,
	linux-rtc, devicetree, linux-kernel, kernel
In-Reply-To: <20260520-ds1307-rx8901-add-v2-3-e069ea32e1db@axis.com>

On 20/05/2026 16:48:55+0200, Fredrik M Olsson wrote:
> +static int do_trickle_setup_rx8901(struct ds1307 *ds1307, u32 ohms __always_unused, bool diode)
> +{
> +	int ret;
> +	unsigned int setup;
> +
> +	ret = regmap_read(ds1307->regmap, RX8901_REG_PWSW_CFG, &setup);
> +	if (ret) {
> +		dev_err(ds1307->dev, "Failed to read PWSW_CFG register\n");
> +		return ret;
> +	}
> +
> +	/* Enable low battery voltage detection */
> +	setup |= RX8901_REG_PWSW_CFG_VBATLDETEN;
> +
> +	if (diode)
> +		setup |= RX8901_REG_PWSW_CFG_CHGEN;

When diode is false, you need to explicitly clear
RX8901_REG_PWSW_CFG_CHGEN so it is possible to actually disable trickle
charging once it has been enabled, as the register is battery backed.


-- 
Alexandre Belloni, co-owner and COO, Bootlin
Embedded Linux and Kernel engineering
https://bootlin.com

^ permalink raw reply

* [paulmckrcu:puranjay.2026.06.24a 54/65] kernel/rcu/tree.h:498:12: warning: function 'rcu_preempt_blocked_readers_cgp_ndqs' has internal linkage but is not defined
From: kernel test robot @ 2026-06-24 21:37 UTC (permalink / raw)
  To: Paul E. McKenney; +Cc: llvm, oe-kbuild-all

tree:   https://github.com/paulmckrcu/linux puranjay.2026.06.24a
head:   9f9e05ef4454de1724d65dcc1d45018d0457bd7c
commit: ab15f8d23a687736b674b3b8669a744dd709d97f [54/65] rcu: Make rcu_gp_cleanup() account for ->dqs_blkd_tasks
config: s390-allnoconfig (https://download.01.org/0day-ci/archive/20260625/202606250529.ZnQlmwsD-lkp@intel.com/config)
compiler: clang version 23.0.0git (https://github.com/llvm/llvm-project a9b492db3d50683e446cd1a5c9ffaf4e92cb77a7)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260625/202606250529.ZnQlmwsD-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202606250529.ZnQlmwsD-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from kernel/rcu/tree.c:69:
>> kernel/rcu/tree.h:498:12: warning: function 'rcu_preempt_blocked_readers_cgp_ndqs' has internal linkage but is not defined [-Wundefined-internal]
     498 | static int rcu_preempt_blocked_readers_cgp_ndqs(struct rcu_node *rnp);
         |            ^
   kernel/rcu/tree.c:2223:20: note: used here
    2223 |                 if (WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp_ndqs(rnp)))
         |                                  ^
   1 warning generated.


vim +/rcu_preempt_blocked_readers_cgp_ndqs +498 kernel/rcu/tree.h

   493	
   494	/* Forward declarations for tree_plugin.h */
   495	static void rcu_bootup_announce(void);
   496	static void rcu_qs(void);
   497	static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
 > 498	static int rcu_preempt_blocked_readers_cgp_ndqs(struct rcu_node *rnp);
   499	static int rcu_print_task_exp_stall(struct rcu_node *rnp);
   500	static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
   501	static void rcu_flavor_sched_clock_irq(int user);
   502	static void dump_blkd_tasks(struct rcu_node *rnp, int ncheck);
   503	static void rcu_preempt_deferred_qs_init(struct rcu_data *rdp);
   504	static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
   505	static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
   506	static bool rcu_is_callbacks_kthread(struct rcu_data *rdp);
   507	static void rcu_cpu_kthread_setup(unsigned int cpu);
   508	static void rcu_spawn_one_boost_kthread(struct rcu_node *rnp);
   509	static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
   510	static bool rcu_preempt_has_tasks_ndqs(struct rcu_node *rnp);
   511	static bool rcu_preempt_need_deferred_qs(struct task_struct *t);
   512	static void zero_cpu_stall_ticks(struct rcu_data *rdp);
   513	static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp);
   514	static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
   515	static void rcu_init_one_nocb(struct rcu_node *rnp);
   516	static bool wake_nocb_gp(struct rcu_data *rdp);
   517	static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
   518					  unsigned long j, bool lazy);
   519	static void call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *head,
   520				  rcu_callback_t func, unsigned long flags, bool lazy);
   521	static void __maybe_unused __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
   522							unsigned long flags);
   523	static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level);
   524	static bool do_nocb_deferred_wakeup(struct rcu_data *rdp);
   525	static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp);
   526	static void rcu_spawn_cpu_nocb_kthread(int cpu);
   527	static void show_rcu_nocb_state(struct rcu_data *rdp);
   528	static void rcu_nocb_lock(struct rcu_data *rdp);
   529	static void rcu_nocb_unlock(struct rcu_data *rdp);
   530	static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
   531					       unsigned long flags);
   532	static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp);
   533	#ifdef CONFIG_RCU_NOCB_CPU
   534	static void __init rcu_organize_nocb_kthreads(void);
   535	

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* [jic23-iio:testing 23/62] Warning: lib/kstrtox.c:59 function parameter 'init' not described in '_parse_integer_limit'
From: kernel test robot @ 2026-06-24 21:37 UTC (permalink / raw)
  To: Rodrigo Alencar; +Cc: oe-kbuild-all, Jonathan Cameron, Andy Shevchenko

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/jic23/iio.git testing
head:   7667a80340e99fd45357d0c90ae05813b01bbfef
commit: 521b4ae3b5a47cf4ef5826016eecde08e8740bef [23/62] lib: kstrtox: add initial value to _parse_integer_limit()
config: nios2-allnoconfig (https://download.01.org/0day-ci/archive/20260625/202606250505.W2lYv9PS-lkp@intel.com/config)
compiler: nios2-linux-gcc (GCC) 11.5.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260625/202606250505.W2lYv9PS-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202606250505.W2lYv9PS-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> Warning: lib/kstrtox.c:59 function parameter 'init' not described in '_parse_integer_limit'
>> Warning: lib/kstrtox.c:59 function parameter 'init' not described in '_parse_integer_limit'

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* [arnd-playground:abi-test 6/34] ./usr/include/linux/uhid.h:197:9: error: padding struct size to alignment boundary with 4 bytes
From: kernel test robot @ 2026-06-24 21:37 UTC (permalink / raw)
  To: oe-kbuild; +Cc: lkp

:::::: 
:::::: Manual check reason: "only suspicious fbc files changed"
:::::: 

BCC: lkp@intel.com
CC: oe-kbuild-all@lists.linux.dev
TO: Arnd Bergmann <arnd@arndb.de>

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/arnd/playground.git abi-test
head:   741e95c4b46803e113402223592189848b0ac9c8
commit: 3dfdb63c0bf9cefe7e3d914f595429779695f95f [6/34] kbuild: uapi: check for -Wpadded errors
:::::: branch date: 13 hours ago
:::::: commit date: 2 days ago
config: parisc-allnoconfig (https://download.01.org/0day-ci/archive/20260625/202606250535.DwPmfA3c-lkp@intel.com/config)
compiler: hppa-linux-gcc (GCC) 16.1.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260625/202606250535.DwPmfA3c-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/r/202606250535.DwPmfA3c-lkp@intel.com/

All errors (new ones prefixed by >>):

   In file included from ./usr/include/linux/uhid.h:23,
                    from <command-line>:
   usr/include/linux/input.h:413:15: error: padding struct to align 'custom_len' [-Werror=padded]
     413 |         __u32 custom_len;
         |               ^~~~~~~~~~
   usr/include/linux/input.h:443:15: error: padding struct to align 'intensity' [-Werror=padded]
     443 |         __u16 intensity;
         |               ^~~~~~~~~
   usr/include/linux/input.h:485:11: error: padding struct to align 'u' [-Werror=padded]
     485 |         } u;
         |           ^
>> ./usr/include/linux/uhid.h:197:9: error: padding struct size to alignment boundary with 4 bytes [-Werror=padded]
     197 |         } u;
         |         ^
   cc1: all warnings being treated as errors

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply

* Re: [PATCH v3 4/4] edac: xilinx: Add EDAC support for Versal XilSem
From: sashiko-bot @ 2026-06-24 21:37 UTC (permalink / raw)
  To: Rama devi Veggalam; +Cc: devicetree, conor+dt, robh
In-Reply-To: <20260624212545.2850787-5-rama.devi.veggalam@amd.com>

Thank you for your contribution! Sashiko AI review found 13 potential issue(s) to consider:
- [Critical] Passing a 64-bit `dma_addr_t` to a 32-bit firmware API parameter truncates the physical address, causing memory corruption.
- [High] Missing 'semrtca' resource breaks driver probe for older Device Trees.
- [High] The sysfs `store` functions use the virtual EDAC device for DMA allocations instead of the platform device.
- [High] Error paths introduced in `mc_probe` fail to unregister the `VERSAL_EVENT_ERROR_PMC_ERR1` PM notifier, causing a Use-After-Free.
- [High] The error event handler uses an `if - else if` chain for bitmask evaluation, silently dropping uncorrectable errors.
- [Medium] Hexadecimal prefixes mistakenly used for decimal bit-shift macros cause undefined behavior and break CRAM frame decoding.
- [Medium] The `xsem_total_cframes_ssit_show` function mutates the driver's cached CRAM frames state in-place, corrupting data on subsequent reads.
- [Medium] Sysfs `store` functions unconditionally truncate the last character of user input if it is not newline-terminated.
- [Medium] XilSEM fabric and interconnect errors are incorrectly attributed to DDR memory DIMM 0.
- [Medium] Failure to handle an unrecognized `family_code` causes the driver to register an event with uninitialized parameters.
- [Medium] Lack of synchronization in sysfs callbacks causes data races on shared buffers.
- [Medium] The `xsem_err_callback` event handler performs unprotected read-modify-write operations on shared error counters.
- [Low] Copy-paste error in `xsem_scan_control_show` format string causes incorrect output.
--

commit 3ba94d35140e37f0feb1403bd80442cef2ad8ed1
Author: Rama devi Veggalam <rama.devi.veggalam@amd.com>
Subject: edac: xilinx: Add EDAC support for Versal XilSem

This patch adds support for Xilinx Versal Soft Error Mitigation (XilSEM)
to handle and optionally correct soft errors in Configuration Memory.
It also introduces sysfs interfaces for XilSEM scan operations, error
injection, and configuration reads.

> diff --git a/drivers/edac/versal_edac.c b/drivers/edac/versal_edac.c
> index 5a43b5d43ca28..024ddd90f6996 100644
> --- a/drivers/edac/versal_edac.c
> +++ b/drivers/edac/versal_edac.c
[ ... ]
> @@ -11,12 +11,88 @@
>  #include <linux/of_address.h>
>  #include <linux/of_device.h>
>  #include <linux/platform_device.h>
> +#include <linux/dma-mapping.h>
>  #include <linux/sizes.h>
> +#include <linux/io.h>
>  #include <linux/firmware/xlnx-zynqmp.h>
> +#include <linux/firmware/xlnx-versal-error-events.h>
>  #include <linux/firmware/xlnx-event-manager.h>
>  
>  #include "edac_module.h"
>  
> +/* XilSem CE Error log count */
> +#define XILSEM_MAX_CE_LOG_CNT	7
> +
> +/** Maximum CRAM error register count */
> +#define MAX_CRAMERR_REG_CNT		14
> +/** Maximum NPI slave skip count */
> +#define MAX_NPI_SLV_SKIP_CNT	8
> +/** Maximum NPI Error info count */
> +#define MAX_NPI_ERR_INFO_CNT	2
> +
> +/* Maximum SLR count */
> +#define MAX_SLR_ID	3
> +
> +/** Maximum number of cframe types  */
> +#define CFRAME_MAX_TYPE	7
> +
> +/** Mask for getting Type_0, Type_4 frames */
> +#define CFRAME_TYPE_0_4_MASK	GENMASK(19, 0)
> +
> +/** Low mask, High mask for getting Type_1, Type_5 frames */
> +#define CFRAME_TYPE_1_5_MASK_L	GENMASK(39, 20)
> +#define CFRAME_TYPE_1_5_MASK_H	GENMASK(7, 0)
> +
> +/** Shift for getting Type_1, Type_5 frames */
> +#define CFRAME_TYPE_1_5_SHIFT_R		0x20
> +#define CFRAME_TYPE_1_5_SHIFT_L		0x12
> +
> +/** Mask for getting Type_2, Type_6 frames */
> +#define CFRAME_TYPE_2_6_MASK	GENMASK(27, 8)
> +
> +/** Shift for getting Type_2, Type_6 frames */
> +#define CFRAME_TYPE_2_6_SHIFT_R		0x8
> +
> +/** Low mask, high mask for getting Type_3 frames */
> +#define CFRAME_TYPE_3_MASK_L		GENMASK(31, 28)
> +#define CFRAME_TYPE_3_MASK_H		GENMASK(15, 0)
> +
> +/** Shift for getting Type_3 */
> +#define CFRAME_TYPE_3_SHIFT_R	0x28

[Severity: Medium]
Are these macro definitions incorrectly using a hex prefix for decimal shift
values? Shifting a 32-bit value by 0x20 (32) or 0x28 (40) bits will invoke
undefined behavior.

> +#define CFRAME_TYPE_3_SHIFT_L	0x4
[ ... ]
> +static ssize_t xsem_scan_control_show(struct device *dev,
> +				      struct device_attribute *mattr,
> +				      char *data)
> +{
> +	struct mem_ctl_info *mci = to_mci(dev);
> +	struct edac_priv *priv = mci->pvt_info;
> +
> +	if (!priv->xsem_rtca)
> +		return -EINVAL;
> +
> +	return sprintf(data, "[0x%x][0x%x][0x%x][0x%x][0x%x][0x%x][0x%x]\n\r",
> +			priv->xsem_rtca->scan_ctrl_status[0],
> +			priv->xsem_rtca->scan_ctrl_status[0],

[Severity: Low]
Is there a typo here where scan_ctrl_status[0] is printed twice, causing the
output to miss the last element of the array?

> +			priv->xsem_rtca->scan_ctrl_status[1],
> +			priv->xsem_rtca->scan_ctrl_status[2],
> +			priv->xsem_rtca->scan_ctrl_status[3],
> +			priv->xsem_rtca->scan_ctrl_status[4],
> +			priv->xsem_rtca->scan_ctrl_status[5]);
> +}
> +
> +/**
> + * xsem_scan_control_store - Set scan control operation
[ ... ]
> +static ssize_t xsem_scan_control_store(struct device *dev,
> +				       struct device_attribute *mattr,
> +				       const char *data, size_t count)
> +{
> +	struct mem_ctl_info *mci = to_mci(dev);
> +	struct edac_priv *priv = mci->pvt_info;
> +	char *kern_buff, *inbuf, *tok;
> +	u32 cmd;
> +	u32 slrid;
> +	int ret;
> +
> +	if (!priv->xsem_rtca)
> +		return -EINVAL;
> +
> +	kern_buff = kzalloc(count, GFP_KERNEL);
> +	if (!kern_buff)
> +		return -ENOMEM;
> +	strscpy(kern_buff, data, count);

[Severity: Medium]
Does this silently truncate the last character of the user input if it is not
newline-terminated? When count matches the string length exactly, strscpy()
copies at most count - 1 bytes and inserts a null terminator.

> +
> +	inbuf = kern_buff;
[ ... ]
> +static ssize_t xsem_total_cframes_ssit_show(struct device *dev,
> +					    struct device_attribute *mattr,
> +					    char *data)
> +{
> +	struct mem_ctl_info *mci = to_mci(dev);
> +	struct edac_priv *priv = mci->pvt_info;
> +	u32 temp_buf[CFRAME_MAX_TYPE] = {0};
> +	u32 id;
> +	int offset = 0;
> +
> +	if (!priv->xsem_rtca)
> +		return -EINVAL;
> +
> +	for (id = 0; id < CFRAME_MAX_TYPE; id++)
> +		temp_buf[id] = priv->xsem_rtca->cram_total_frames[id];
> +
> +	priv->xsem_rtca->cram_total_frames[0]  = (temp_buf[0] & CFRAME_TYPE_0_4_MASK);
> +	priv->xsem_rtca->cram_total_frames[1]  = (temp_buf[0] &
> +					CFRAME_TYPE_1_5_MASK_L) >> CFRAME_TYPE_1_5_SHIFT_R;

[Severity: Medium]
Does this modify the shared cram_total_frames array in-place during a read
operation? Subsequent reads might apply the bitwise shifts to already-shifted
data, returning garbled output.

> +	priv->xsem_rtca->cram_total_frames[1] |= (temp_buf[1] &
[ ... ]
> +static ssize_t xsem_total_cframes_ssit_store(struct device *dev,
> +					     struct device_attribute *mattr,
> +					     const char *data, size_t count)
> +{
> +	struct mem_ctl_info *mci = to_mci(dev);
> +	struct edac_priv *priv = mci->pvt_info;
> +	char *kern_buff, *inbuf, *tok, *kbuf1;
> +	dma_addr_t dma_addr = 0;
> +	u32 row, slrid, dma_size;
> +	int ret;
[ ... ]
> +	dma_size = sizeof(priv->xsem_rtca->cram_total_frames);
> +	kbuf1 = dma_alloc_coherent(dev, dma_size, &dma_addr, GFP_KERNEL);

[Severity: High]
Are we passing the virtual EDAC device pointer instead of the platform device
here? Sysfs store callbacks for this attribute receive &mci->dev, which lacks
DMA configuration and could cause allocation failures.

> +	if (!kbuf1) {
> +		ret = -ENOMEM;
> +		goto err;
> +	}
> +
> +	ret = zynqmp_pm_xilsem_cram_ssit_totframes(slrid, row, dma_addr,
> +						   priv->xsem_rtca->cram_get_frames_status);

[Severity: Critical]
Can this truncate the physical address and cause memory corruption? On ARM64,
dma_addr_t is 64-bit, but the firmware API parameters are 32-bit.

> +	if (ret) {
> +		edac_printk(KERN_ERR, EDAC_MC, "ERROR: XilSEM Status PM API failed\n");
> +		dma_free_coherent(dev, dma_size, kbuf1, dma_addr);
> +		goto err;
> +	}
> +
> +	memcpy(priv->xsem_rtca->cram_total_frames, kbuf1, dma_size);

[Severity: Medium]
Can concurrent sysfs store operations corrupt this shared buffer due to a lack
of locking?

> +	dma_free_coherent(dev, dma_size, kbuf1, dma_addr);
[ ... ]
> +static ssize_t xsem_read_ssit_status_store(struct device *dev,
> +					   struct device_attribute *mattr,
> +					   const char *data, size_t count)
> +{
> +	struct mem_ctl_info *mci = to_mci(dev);
> +	struct edac_priv *priv = mci->pvt_info;
> +	char *kern_buff, *inbuf, *tok, *kbuf1;
> +	size_t dma_size;
> +	dma_addr_t dma_addr = 0;
> +	u32 slrid;
> +	int ret;
[ ... ]
> +	dma_size  = sizeof(struct xsem_ssit_status);
> +
> +	kbuf1 = dma_alloc_coherent(dev, dma_size, &dma_addr, GFP_KERNEL);

[Severity: High]
Does passing the virtual EDAC device pointer (which lacks DMA configuration)
instead of the platform device cause DMA allocation failures?

> +	if (!kbuf1) {
> +		ret = -ENOMEM;
> +		goto err;
> +	}
> +
> +	ret = zynqmp_pm_xilsem_read_ssit_status(slrid, dma_addr,
> +						priv->xsem_rtca->xilsem_ssit_status);

[Severity: Critical]
Could passing the 64-bit dma_addr to this 32-bit firmware API parameter result
in physical address truncation and subsequent memory corruption?

> +	if (ret) {
> +		edac_printk(KERN_ERR, EDAC_MC, "ERROR: XilSEM Status PM API failed\n");
> +		dma_free_coherent(dev, dma_size, kbuf1, dma_addr);
> +		goto err;
> +	}
> +
> +	memcpy(priv->xsem_rtca->slr_info, kbuf1, dma_size);

[Severity: Medium]
Can concurrent executions of this sysfs store operation result in data races on
this shared slr_info buffer?

> +
> +	dma_free_coherent(dev, dma_size, kbuf1, dma_addr);
[ ... ]
> +static void xsem_geterror_info(struct mem_ctl_info *mci, struct xsem_error_status *p,
> +			       int mask)
> +{
> +	struct edac_priv *priv = mci->pvt_info;
> +	u32 error_word_0, error_word_1, ce_count;
> +	u8 index;
> +
> +	if (!priv->xsem_rtca || !priv->sem_baseaddr)
> +		return;
> +
> +	if (mask & priv->xsem_rtca->cram_ce_mask) {
> +		p->ce_cnt++;
[ ... ]
> +		/* Read CRAM status */
> +		p->ceinfo.status = readl(priv->sem_baseaddr + CRAM_STS_INFO_OFFSET);
> +	} else if (mask & priv->xsem_rtca->cram_ue_mask) {

[Severity: High]
Does this if-else chain cause uncorrectable errors to be silently ignored if
the payload mask contains both a correctable and uncorrectable error?

> +		p->ue_cnt++;
> +		p->ueinfo.data0 = 0;
> +		p->ueinfo.data1 = 0;
> +		p->ueinfo.status = readl(priv->sem_baseaddr + CRAM_STS_INFO_OFFSET);
> +	} else if (mask & priv->xsem_rtca->npi_ue_mask) {
> +		p->ue_cnt++;
> +		p->ueinfo.data0 = readl(priv->sem_baseaddr + NPI_ERR0_INFO_OFFSET);
> +		p->ueinfo.data1 = readl(priv->sem_baseaddr + NPI_ERR1_INFO_OFFSET);
> +		p->ueinfo.status = readl(priv->sem_baseaddr);
> +	} else {
> +		edac_printk(KERN_ERR, EDAC_MC, "Invalid Event received %d\n", mask);
> +	}
> +}
> +
> +/**
> + * xsem_handle_error - Handle XilSem error types CE and UE
> + * @mci:	Pointer to the memory controller instance
> + * @p:		Pointer to the xilsem error status structure
> + *
> + * Handles the correctable and uncorrectable error.
> + */
> +static void xsem_handle_error(struct mem_ctl_info *mci, struct xsem_error_status *p)
> +{
> +	struct xsem_ecc_error_info *pinf;
> +	char message[XDDR_EDAC_MSG_SIZE];
> +
> +	if (p->ce_cnt) {
> +		pinf = &p->ceinfo;
> +		snprintf(message, XDDR_EDAC_MSG_SIZE,
> +			 "\n\rXILSEM CRAM error type :%s\n\r"
> +			 "\nFrame_Addr: [0x%X]\t Row_num: [0x%X]\t Bit_loc: [0x%X]\t Qword: [0x%X]\n\r",
> +			 "CE", pinf->frame_addr, pinf->row_id,
> +			 pinf->bit_loc, pinf->qword);
> +
> +		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
> +				     p->ce_cnt, 0, 0, 0, 0, 0, -1,
> +				     message, "");

[Severity: Medium]
Are these fabric and interconnect errors being incorrectly attributed to the
first DDR memory module? Passing top_layer=0 and mid_layer=0 explicitly maps
them to csrow 0, channel 0.

> +	}
> +
> +	if (p->ue_cnt) {
> +		pinf = &p->ueinfo;
> +		snprintf(message, XDDR_EDAC_MSG_SIZE,
> +			 "\n\rXILSEM error type :%s\n\r"
> +			 "status: [0x%X]\n\rError_Info0: [0x%X]\n\r"
> +			 "Error_Info1: [0x%X]",
> +			 "UE", pinf->status, pinf->data0, pinf->data1);
> +
> +		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
> +				     p->ue_cnt, 0, 0, 0, 0, 0, -1,
> +				     message, "");

[Severity: Medium]
Does mapping uncorrectable XilSEM errors to top_layer=0 incorrectly attribute
them to the first DDR memory DIMM?

> +	}
> +}
> +
> +/**
> + * xsem_err_callback - Handle Correctable and Uncorrectable errors.
> + * @payload:	payload data.
> + * @data:	controller data.
> + *
> + * Handles ECC correctable and uncorrectable errors.
> + */
> +static void xsem_err_callback(const u32 *payload, void *data)
> +{
> +	struct xsem_error_status stat;
> +	struct edac_priv *priv;
> +	struct mem_ctl_info *mci = (struct mem_ctl_info *)data;
> +	int event;
> +
> +	priv = mci->pvt_info;
> +	if (!priv->xsem_rtca)
> +		return;
> +
> +	memset(&stat, 0, sizeof(stat));
> +	/* Read payload to get the event type */
> +	event = payload[2];
> +	edac_printk(KERN_INFO, EDAC_MC, "Event received %x\n", event);
> +	xsem_geterror_info(mci, &stat, event);
> +
> +	priv->xsem_rtca->cram_ce_cnt += stat.ce_cnt;
> +	priv->xsem_rtca->cram_ue_cnt += stat.ue_cnt;

[Severity: Medium]
Could these read-modify-write operations on shared counters lose increments if
the hardware event callback is executed concurrently on multiple CPUs?

> +	xsem_handle_error(mci, &stat);
> +}
[ ... ]
> @@ -1094,6 +2365,10 @@ static int mc_probe(struct platform_device *pdev)
>  	if (!get_ecc_state(ddrmc_baseaddr))
>  		return -ENXIO;
>  
> +	sem_baseaddr = devm_platform_ioremap_resource_byname(pdev, "semrtca");
> +	if (IS_ERR(sem_baseaddr))
> +		return PTR_ERR(sem_baseaddr);

[Severity: High]
Does this make the "semrtca" resource mandatory, breaking the driver probe for
older device trees that do not define it?

> +
>  	/* Allocate ID number for the EMIF controller */
>  	edac_mc_id = emif_get_id(pdev->dev.of_node);
>  
[ ... ]
> +	rc = zynqmp_pm_get_family_info(&family_code);
> +	if (rc) {
> +		if (rc == -ENODEV)
> +			rc = -EPROBE_DEFER;
> +
> +		goto del_mc;
> +	}
> +	if (family_code == PM_VERSAL_FAMILY_CODE) {
> +		priv->xsem_rtca->sw_event_node_id = VERSAL_EVENT_ERROR_SW_ERR;
> +		priv->xsem_rtca->cram_ce_mask = XPM_VERSAL_EVENT_ERROR_MASK_XSEM_CRAM_CE_5;
> +		priv->xsem_rtca->cram_ue_mask = XPM_VERSAL_EVENT_ERROR_MASK_XSEM_CRAM_UE_6;
> +		priv->xsem_rtca->npi_ue_mask = XPM_VERSAL_EVENT_ERROR_MASK_XSEM_NPI_UE_7;
> +	} else {
> +		edac_printk(KERN_ERR, EDAC_MC, "Invalid Device family code %d\n", family_code);
> +	}
> +
> +	rc = xlnx_register_event(PM_NOTIFY_CB, priv->xsem_rtca->sw_event_node_id,

[Severity: Medium]
Does this execution path fall through when the family code is unrecognized,
causing the driver to register an event using uninitialized parameters (zeros)?

> +				 priv->xsem_rtca->cram_ce_mask | priv->xsem_rtca->cram_ue_mask |
> +				 priv->xsem_rtca->npi_ue_mask,
> +				 false, xsem_err_callback, mci);
> +	if (rc)
> +		goto del_mc;
> +
>  #ifdef CONFIG_EDAC_DEBUG
>  	create_debugfs_attributes(mci);
>  	setup_address_map(priv);
> @@ -1154,6 +2484,8 @@ static int mc_probe(struct platform_device *pdev)
>  	enable_intr(priv);
>  	return rc;
>  
> +remove_sysfs:
> +	xsem_edac_remove_sysfs_attributes(mci);
>  del_mc:
>  	edac_mc_del_mc(&pdev->dev);
>  free_edac_mc:
> @@ -1173,9 +2505,21 @@ static void mc_remove(struct platform_device *pdev)

[Severity: High]
Does this error path fail to unregister the VERSAL_EVENT_ERROR_PMC_ERR1 event
that was registered earlier in the function? If the event triggers after mci is
freed, could it cause a use-after-free in the callback?

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260624212545.2850787-1-rama.devi.veggalam@amd.com?part=4

^ permalink raw reply

* Re: [PATCH v5 08/11] refs/files: lazy-load configuration to fix chicken-and-egg
From: Justin Tobler @ 2026-06-24 21:36 UTC (permalink / raw)
  To: Patrick Steinhardt; +Cc: git, Karthik Nayak, Jeff King
In-Reply-To: <20260622-b4-pks-refs-avoid-chdir-notify-reparent-v5-8-018475013dbc@pks.im>

On 26/06/22 10:28AM, Patrick Steinhardt wrote:
> When initializing the "files" reference backend we read the repository's
> config to parse "core.preferSymlinkRefs" and "core.logAllRefUpdates".
> This results in a chicken-and-egg problem though, because parsing the
> configuration may require us to have access to the reference store
> already when an "onbranch" condition exists.

Ok so both of these configuration options are currently parsed at ref
store initialization time. This is problematic because we need the ref
store to properly handle "onbranch" conditions in the config.

> Luckily, all the configuration that we honor only relates to writing
> references. Consequently, we don't strictly need that configuration to
> be readily available at initialization time, and we can easily defer
> parsing it to a later point in time.

That's nice. So we don't actually need this configuration during
initialization and can instead lazily load it when writing the first
references. Makes sense.

> Implement this fix and add tests that verify that we can indeed properly
> parse these config knobs via an "onbranch" condition.
> 
> Signed-off-by: Patrick Steinhardt <ps@pks.im>
> ---
>  refs/files-backend.c        | 37 ++++++++++++++++++++++++++-----------
>  t/t0600-reffiles-backend.sh | 21 +++++++++++++++++++++
>  2 files changed, 47 insertions(+), 11 deletions(-)
> 
> diff --git a/refs/files-backend.c b/refs/files-backend.c
> index 79fb6735e1..d0f379dcd6 100644
> --- a/refs/files-backend.c
> +++ b/refs/files-backend.c
> @@ -84,12 +84,14 @@ struct files_ref_store {
>  	unsigned int store_flags;
>  
>  	char *gitcommondir;
> -	enum log_refs_config log_all_ref_updates;
> -	int prefer_symlink_refs;
> -
>  	struct ref_cache *loose;
> -
>  	struct ref_store *packed_ref_store;
> +
> +	struct files_ref_store_write_options {
> +		enum log_refs_config log_all_ref_updates;
> +		int prefer_symlink_refs;
> +		bool initialized;
> +	} write_opts_lazy_loaded;

It might be nice to leave some sort of breadcrumb comment to future
readers to explain why we lazy load this configuration.

>  };
>  
>  static void clear_loose_ref_cache(struct files_ref_store *refs)
> @@ -121,17 +123,31 @@ static int files_ref_store_config(const char *var, const char *value,
>  				  const struct config_context *ctx UNUSED,
>  				  void *payload)
>  {
> -	struct files_ref_store *refs = payload;
> +	struct files_ref_store_write_options *opts = payload;
>  
>  	if (!strcmp(var, "core.prefersymlinkrefs")) {
> -		refs->prefer_symlink_refs = git_config_bool(var, value);
> +		opts->prefer_symlink_refs = git_config_bool(var, value);
>  	} else if (!strcmp(var, "core.logallrefupdates")) {
> -		refs->log_all_ref_updates = refs_parse_log_all_ref_updates_config(value);
> +		opts->log_all_ref_updates = refs_parse_log_all_ref_updates_config(value);
>  	}
>  
>  	return 0;
>  }
>  
> +static const struct files_ref_store_write_options *files_ref_store_write_options(struct files_ref_store *refs)
> +{
> +	struct files_ref_store_write_options *opts = &refs->write_opts_lazy_loaded;
> +
> +	if (opts->initialized)
> +		return opts;
> +
> +	opts->log_all_ref_updates = LOG_REFS_UNSET;
> +	repo_config(refs->base.repo, files_ref_store_config, opts);
> +
> +	opts->initialized = true;
> +	return opts;
> +}
> +
>  /*
>   * Create a new submodule ref cache and add it to the internal
>   * set of caches.
> @@ -156,9 +172,7 @@ static struct ref_store *files_ref_store_init(struct repository *repo,
>  	refs->packed_ref_store =
>  		packed_ref_store_init(repo, NULL, refs->gitcommondir, opts);
>  	refs->store_flags = opts->access_flags;
> -	refs->log_all_ref_updates = LOG_REFS_UNSET;
>  
> -	repo_config(repo, files_ref_store_config, refs);

Configs are no longer read eagerly during initialization.

The rest of this patch looks good to me.

-Justin

^ permalink raw reply

* [PATCH] drm/meson: cleanup unnecessary condition in kconfig
From: Gabriel Carvalho @ 2026-06-24 21:35 UTC (permalink / raw)
  To: Neil Armstrong
  Cc: Julian Braha, Jerome Brunet, Martin Blumenstingl, dri-devel,
	linux-amlogic, linux-kernel, Gabriel Carvalho

DRM_MESON_DW_HDMI and DRM_MESON_DW_MIPI_DSI both already depend on
DRM_MESON, so the "if DRM_MESON" on their "default y" lines is
redundant.

This redundant condition was found by kconfirm, a static analysis tool
for Kconfig.

Signed-off-by: Gabriel Carvalho <gabriel-n-carvalho@outlook.com>
Reviewed-by: Julian Braha <julianbraha@gmail.com>
---
 drivers/gpu/drm/meson/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/meson/Kconfig b/drivers/gpu/drm/meson/Kconfig
index 417f79829cf8..c3800825ac15 100644
--- a/drivers/gpu/drm/meson/Kconfig
+++ b/drivers/gpu/drm/meson/Kconfig
@@ -17,13 +17,13 @@ config DRM_MESON
 config DRM_MESON_DW_HDMI
 	tristate "HDMI Synopsys Controller support for Amlogic Meson Display"
 	depends on DRM_MESON
-	default y if DRM_MESON
+	default y
 	select DRM_DW_HDMI
 	imply DRM_DW_HDMI_I2S_AUDIO
 
 config DRM_MESON_DW_MIPI_DSI
 	tristate "MIPI DSI Synopsys Controller support for Amlogic Meson Display"
 	depends on DRM_MESON
-	default y if DRM_MESON
+	default y
 	select DRM_DW_MIPI_DSI
 	select GENERIC_PHY_MIPI_DPHY

base-commit: cdeb2ccd993ed8647adbbda2c3b103aa717fd6f7
-- 
2.53.0


_______________________________________________
linux-amlogic mailing list
linux-amlogic@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-amlogic

^ permalink raw reply related

* [PATCH] Bluetooth: sco: Fix a race condition in sco_sock_timeout()
From: Sungwoo Kim @ 2026-06-24 21:33 UTC (permalink / raw)
  To: Marcel Holtmann, Luiz Augusto von Dentz
  Cc: Sungwoo Kim, Dave Tian, Luiz Augusto von Dentz, linux-bluetooth,
	linux-kernel

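sco_sock_timeout() runs asynchronously and takes lock_sock(sk). If the
socket is being closed while the timeout work is running, sco_sock_close()
holds lock_sock(sk) and waits in disable_delayed_work_sync() for the work
to finish, while the work itself is blocked on the same lock, leading to a
deadlock.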

CPU 0                      CPU 1
====================       ======================
sco_sock_close()
                           sco_sock_timeout()
lock_sock(sk) // <-- LOCK
  __sco_sock_close()
    sco_chan_del()
      sco_conn_put()
	      sco_conn_free()
	        disable_delayed_work_sync()
	                           lock(sk) // <-- SAME LOCK

Fix this by moving disable_delayed_work_sync() outside of lock_sock(sk),
ensuring that no lock_sock(sk) is held before sco_sock_timeout().

Lockdep splat:

WARNING: possible circular locking dependency detected
6.13.0-rc4 #7 Not tainted

syz-executor292/9514 is trying to acquire lock:
ffff8881115d5070 ((work_completion)(&(&conn->timeout_work)->work)){+.+.}-{0:0}, at: rcu_lock_acquire sect/v6.13-rc4/./include/linux/rcupdate.h:337 [inline]
ffff8881115d5070 ((work_completion)(&(&conn->timeout_work)->work)){+.+.}-{0:0}, at: rcu_read_lock sect/v6.13-rc4/./include/linux/rcupdate.h:849 [inline]
ffff8881115d5070 ((work_completion)(&(&conn->timeout_work)->work)){+.+.}-{0:0}, at: start_flush_work sect/v6.13-rc4/kernel/workqueue.c:4137 [inline]
ffff8881115d5070 ((work_completion)(&(&conn->timeout_work)->work)){+.+.}-{0:0}, at: __flush_work+0xd1/0xc40 sect/v6.13-rc4/kernel/workqueue.c:4195

but task is already holding lock:
ffff88807db3a258 (sk_lock-AF_BLUETOOTH-BTPROTO_SCO){+.+.}-{0:0}, at: lock_sock sect/v6.13-rc4/./include/net/sock.h:1623 [inline]
ffff88807db3a258 (sk_lock-AF_BLUETOOTH-BTPROTO_SCO){+.+.}-{0:0}, at: sco_sock_close+0x25/0x100 sect/v6.13-rc4/net/bluetooth/sco.c:524

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #1 (sk_lock-AF_BLUETOOTH-BTPROTO_SCO){+.+.}-{0:0}:
       lock_acquire+0x1c4/0x520 sect/v6.13-rc4/kernel/locking/lockdep.c:5849
       lock_sock_nested+0x48/0x130 sect/v6.13-rc4/net/core/sock.c:3622
       lock_sock sect/v6.13-rc4/./include/net/sock.h:1623 [inline]
       sco_sock_timeout+0xbe/0x270 sect/v6.13-rc4/net/bluetooth/sco.c:158
       process_one_work sect/v6.13-rc4/kernel/workqueue.c:3229 [inline]
       process_scheduled_works+0xa99/0x18f0 sect/v6.13-rc4/kernel/workqueue.c:3310
       worker_thread+0x8a9/0xd80 sect/v6.13-rc4/kernel/workqueue.c:3391
       kthread+0x2c6/0x360 sect/v6.13-rc4/kernel/kthread.c:389
       ret_from_fork+0x4e/0x80 sect/v6.13-rc4/arch/x86/kernel/process.c:147
       ret_from_fork_asm+0x1a/0x30 sect/v6.13-rc4/arch/x86/entry/entry_64.S:244

-> #0 ((work_completion)(&(&conn->timeout_work)->work)){+.+.}-{0:0}:
       check_prev_add sect/v6.13-rc4/kernel/locking/lockdep.c:3161 [inline]
       check_prevs_add sect/v6.13-rc4/kernel/locking/lockdep.c:3280 [inline]
       validate_chain+0x1888/0x5760 sect/v6.13-rc4/kernel/locking/lockdep.c:3904
       __lock_acquire+0x13b4/0x2120 sect/v6.13-rc4/kernel/locking/lockdep.c:5226
       lock_acquire+0x1c4/0x520 sect/v6.13-rc4/kernel/locking/lockdep.c:5849
       touch_work_lockdep_map sect/v6.13-rc4/kernel/workqueue.c:3909 [inline]
       start_flush_work sect/v6.13-rc4/kernel/workqueue.c:4163 [inline]
       __flush_work+0x70f/0xc40 sect/v6.13-rc4/kernel/workqueue.c:4195
       __cancel_work_sync sect/v6.13-rc4/kernel/workqueue.c:4351 [inline]
       disable_delayed_work_sync+0xbb/0xf0 sect/v6.13-rc4/kernel/workqueue.c:4514
       sco_conn_free sect/v6.13-rc4/net/bluetooth/sco.c:95 [inline]
       kref_put sect/v6.13-rc4/./include/linux/kref.h:65 [inline]
       sco_conn_put+0x18f/0x270 sect/v6.13-rc4/net/bluetooth/sco.c:107
       sco_chan_del+0xe2/0x210 sect/v6.13-rc4/net/bluetooth/sco.c:236
       sco_sock_close+0x8f/0x100 sect/v6.13-rc4/net/bluetooth/sco.c:526
       sco_sock_release+0x62/0x2d0 sect/v6.13-rc4/net/bluetooth/sco.c:1300
       __sock_release+0xe1/0x2d0 sect/v6.13-rc4/net/socket.c:640
       sock_close+0x1c/0x30 sect/v6.13-rc4/net/socket.c:1408
       __fput+0x2bd/0xa80 sect/v6.13-rc4/fs/file_table.c:450
       __fput_sync+0x15e/0x1c0 sect/v6.13-rc4/fs/file_table.c:535
       __do_sys_close sect/v6.13-rc4/fs/open.c:1554 [inline]
       __se_sys_close sect/v6.13-rc4/fs/open.c:1539 [inline]
       __x64_sys_close+0x93/0x120 sect/v6.13-rc4/fs/open.c:1539
       do_syscall_x64 sect/v6.13-rc4/arch/x86/entry/common.c:52 [inline]
       do_syscall_64+0xee/0x210 sect/v6.13-rc4/arch/x86/entry/common.c:83
       entry_SYSCALL_64_after_hwframe+0x77/0x7f

Fixes: e6720779ae61 ("Bluetooth: SCO: Use kref to track lifetime of sco_conn")
Acked-by: Dave Tian <daveti@purdue.edu>
Signed-off-by: Sungwoo Kim <iam@sung-woo.kim>
---
 net/bluetooth/sco.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index fcc597be5bbd..c05f79b7aa31 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -570,10 +570,23 @@ static void __sco_sock_close(struct sock *sk)
 /* Must be called on unlocked socket. */
 static void sco_sock_close(struct sock *sk)
 {
+	struct sco_conn *conn;
+
+	lock_sock(sk);
+	conn = sco_pi(sk)->conn;
+	if (conn)
+		sco_conn_hold(conn);
+	release_sock(sk);
+
+	if (conn)
+		disable_delayed_work_sync(&conn->timeout_work);
+
 	lock_sock(sk);
-	sco_sock_clear_timer(sk);
 	__sco_sock_close(sk);
 	release_sock(sk);
+
+	if (conn)
+		sco_conn_put(conn);
 }
 
 static void sco_sock_init(struct sock *sk, struct sock *parent)
-- 
2.47.3


^ permalink raw reply related

* [PATCH] drm/meson: cleanup unnecessary condition in kconfig
From: Gabriel Carvalho @ 2026-06-24 21:35 UTC (permalink / raw)
  To: Neil Armstrong
  Cc: Julian Braha, Jerome Brunet, Martin Blumenstingl, dri-devel,
	linux-amlogic, linux-kernel, Gabriel Carvalho

DRM_MESON_DW_HDMI and DRM_MESON_DW_MIPI_DSI both already depend on
DRM_MESON, so the "if DRM_MESON" on their "default y" lines is
redundant.

This redundant condition was found by kconfirm, a static analysis tool
for Kconfig.

Signed-off-by: Gabriel Carvalho <gabriel-n-carvalho@outlook.com>
Reviewed-by: Julian Braha <julianbraha@gmail.com>
---
 drivers/gpu/drm/meson/Kconfig | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/gpu/drm/meson/Kconfig b/drivers/gpu/drm/meson/Kconfig
index 417f79829cf8..c3800825ac15 100644
--- a/drivers/gpu/drm/meson/Kconfig
+++ b/drivers/gpu/drm/meson/Kconfig
@@ -17,13 +17,13 @@ config DRM_MESON
 config DRM_MESON_DW_HDMI
 	tristate "HDMI Synopsys Controller support for Amlogic Meson Display"
 	depends on DRM_MESON
-	default y if DRM_MESON
+	default y
 	select DRM_DW_HDMI
 	imply DRM_DW_HDMI_I2S_AUDIO
 
 config DRM_MESON_DW_MIPI_DSI
 	tristate "MIPI DSI Synopsys Controller support for Amlogic Meson Display"
 	depends on DRM_MESON
-	default y if DRM_MESON
+	default y
 	select DRM_DW_MIPI_DSI
 	select GENERIC_PHY_MIPI_DPHY

base-commit: cdeb2ccd993ed8647adbbda2c3b103aa717fd6f7
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH v3 1/4] dt-bindings: edac: Add bindings for Xilinx Versal XilSEM
From: sashiko-bot @ 2026-06-24 21:33 UTC (permalink / raw)
  To: Rama devi Veggalam; +Cc: conor+dt, robh, devicetree
In-Reply-To: <20260624212545.2850787-2-rama.devi.veggalam@amd.com>

Thank you for your contribution! Sashiko AI review found 2 potential issue(s) to consider:
- [Critical] Adding a third mandatory item to `reg` and `reg-names` breaks backward compatibility for existing device trees, causing schema validation and driver probe failures.
- [Medium] Unrelated XilSEM hardware registers are incorrectly merged into the DDR Memory Controller (`xlnx,versal-ddrmc`) device tree node, violating DT design principles.
--

commit 4759b2e3e1d7757d4a16075824d7a25fd86d0c73
Author: Rama devi Veggalam <rama.devi.veggalam@amd.com>

dt-bindings: edac: Add bindings for Xilinx Versal XilSEM

Update versal edac device tree bindings for Versal Soft Error Mitigation
(XilSEM).

> diff --git a/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml
> index 12f8e9f350bcf..568d2af7de81a 100644
> --- a/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml
> +++ b/Documentation/devicetree/bindings/memory-controllers/xlnx,versal-ddrmc-edac.yaml
[ ... ]
> @@ -23,11 +37,13 @@ properties:
>      items:
>        - description: DDR Memory Controller registers
>        - description: NOC registers corresponding to DDR Memory Controller
> +      - description: SEM RTCA Controller registers

[Severity: Medium]
Is it correct to merge unrelated XilSEM hardware registers into the DDR Memory
Controller device tree node?

The device tree describes an inaccurate hardware topology by grouping physically
separate register bases under a single compatible string simply because a single
EDAC driver handles both. Should XilSEM be represented by its own distinct
device tree node?

[Severity: Critical]
Does adding this new required register break backward compatibility with older
device trees?

Because this adds a third mandatory item to the list without specifying
minItems: 2, older device trees lacking the "semrtca" register will fail
schema validation.

Additionally, does the driver unconditionally require this new register in
mc_probe(), causing probes to fail on existing systems? If the driver code
looks for the new register:

drivers/edac/versal_edac.c:mc_probe() {
    ...
    sem_baseaddr = devm_platform_ioremap_resource_byname(pdev, "semrtca");
    if (IS_ERR(sem_baseaddr))
        return PTR_ERR(sem_baseaddr);
    ...
}

This will return an error when the DT node lacks the semrtca register, leading
to immediate probe failure on existing setups.

>  
>    reg-names:
>      items:
>        - const: base
>        - const: noc
> +      - const: semrtca

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260624212545.2850787-1-rama.devi.veggalam@amd.com?part=1

^ permalink raw reply

* [PATCH v4] dts: report dut/NIC info during DTS run
From: Koushik Bhargav Nimoji @ 2026-06-24 21:33 UTC (permalink / raw)
  To: luca.vizzarro, patrickrobb1997
  Cc: dev, abailey, ahassick, lylavoie, Koushik Bhargav Nimoji
In-Reply-To: <20260602163647.101815-1-knimoji@iol.unh.edu>

This patch gathers NIC info during a DTS run and writes it to an output
json file. This allows the json file to be used when reporting results
on the DTS results dashboard.

Signed-off-by: Koushik Bhargav Nimoji <knimoji@iol.unh.edu>
---
v2:
    *Resolved merge conflicts
v3:
    *Fixed an issue with retrieving
     the NIC's hardware version   
v4:
    *Moved NIC info gathering step before the NICs get
     bound to their respective drivers
    *Condensed some areas of code in order to make them
     more readable
    *Removed redundant None checks and added some where
     required
    *Fixed LshwOutput class to better reflect the lshw
     command output
---
 dts/framework/test_run.py                    |  8 +++
 dts/framework/testbed_model/linux_session.py | 68 ++++++++++++++++++++
 dts/framework/testbed_model/os_session.py    | 11 ++++
 3 files changed, 87 insertions(+)

diff --git a/dts/framework/test_run.py b/dts/framework/test_run.py
index 94dc6023a7..c92fe90f2e 100644
--- a/dts/framework/test_run.py
+++ b/dts/framework/test_run.py
@@ -98,6 +98,7 @@
         "InternalError" -> "exit":ew
 """
 
+import json
 import random
 from collections import deque
 from collections.abc import Iterable
@@ -347,6 +348,12 @@ def next(self) -> State | None:
         test_run.ctx.dpdk.setup()
         test_run.ctx.topology.setup()
 
+        used_nic_info: list[dict[str, str]] = self.test_run.ctx.sut_node.main_session.get_nic_info()
+        with open(f"{SETTINGS.output_dir}/dut_info.json", "w") as file:
+            json.dump(used_nic_info, file, indent=3)
+
+        self.logger.info(f"DUT NIC info written to: {SETTINGS.output_dir}/dut_info.json")
+
         if test_run.config.use_virtual_functions:
             test_run.ctx.topology.instantiate_vf_ports()
         if test_run.ctx.sut_node.cryptodevs and test_run.config.crypto:
@@ -370,6 +377,7 @@ def next(self) -> State | None:
         test_run.supported_capabilities = get_supported_capabilities(
             test_run.ctx.sut_node, test_run.ctx.topology, test_run.required_capabilities
         )
+
         return TestRunExecution(test_run, self.result)
 
     def on_error(self, ex: BaseException) -> State | None:
diff --git a/dts/framework/testbed_model/linux_session.py b/dts/framework/testbed_model/linux_session.py
index 3a6e97974b..9e9146c372 100644
--- a/dts/framework/testbed_model/linux_session.py
+++ b/dts/framework/testbed_model/linux_session.py
@@ -38,6 +38,8 @@ class LshwConfigurationOutput(TypedDict):
     driver: str
     #:
     link: str
+    #:
+    firmware: str
 
 
 class LshwOutput(TypedDict):
@@ -61,6 +63,12 @@ class LshwOutput(TypedDict):
             ...
     """
 
+    #:
+    vendor: NotRequired[str]
+    #:
+    product: NotRequired[str]
+    #:
+    version: NotRequired[str]
     #:
     businfo: str
     #:
@@ -197,6 +205,66 @@ def unbind_ports(self, ports: list[Port]):
         if self._lshw_net_info:
             del self._lshw_net_info
 
+    def get_nic_info(self) -> list[dict[str, str]]:
+        """Overrides :meth:`~.os_session.OSSession.get_nic_info`.
+
+        Raises:
+            ConfigurationError: If the NIC info could not be found.
+        """
+        port_data = {
+            port.get("businfo"): port for port in self._lshw_net_info if port.get("businfo")
+        }
+
+        all_nic_info: list[dict[str, str]] = []
+        for port in self._config.ports:
+            pci_addr = port.pci
+
+            command_result = self.send_command(
+                f"sudo lshw -c network -businfo | grep '{pci_addr}' | cut -d'@' -f1"
+            )
+            if command_result.return_code != 0 and command_result.stdout == "":
+                raise ConfigurationError(f"Unable to get bus type for port {pci_addr}.")
+            bus_type = command_result.stdout
+
+            bus_info = f"{bus_type}@{pci_addr}"
+            nic_port: LshwOutput | None = port_data[bus_info]
+            if nic_port is None:
+                raise ConfigurationError(f"Port {pci_addr} could not be found on the node.")
+
+            config: LshwConfigurationOutput | None = nic_port["configuration"]
+            if config is None:
+                raise ConfigurationError(
+                    f"Configuration info for port {pci_addr} could not be found on the node."
+                )
+
+            if "logicalname" not in nic_port:
+                raise ConfigurationError(
+                    f"Logical name for port {pci_addr} could not be found on the node."
+                )
+
+            command_result = self.send_command(
+                f"ethtool {nic_port['logicalname']} | grep 'Speed:' | awk '{{print $2}}'"
+            )
+            if command_result.return_code == 0 and command_result.stdout:
+                nic_speed = command_result.stdout
+            else:
+                self._logger.error(f"Unable to get speed for NIC: {pci_addr}")
+                nic_speed = None
+
+            dut_json = {
+                "make": nic_port["vendor"] if "vendor" in nic_port else "Unknown",
+                "model": nic_port["product"] if "product" in nic_port else "Unknown",
+                "hardware version": nic_port["version"] if "version" in nic_port else "Unknown",
+                "firmware version": config["firmware"] if "firmware" in config else "Unknown",
+                "deviceBusType": bus_type,
+                "deviceId": nic_port["serial"] if "serial" in nic_port else "Unknown",
+                "pmd": config["driver"] if "driver" in config else "Unknown",
+                "speed": nic_speed or "Unknown",
+            }
+            all_nic_info.append(dut_json)
+
+        return all_nic_info
+
     def bind_ports_to_driver(self, ports: list[Port], driver_name: str) -> None:
         """Overrides :meth:`~.os_session.OSSession.bind_ports_to_driver`.
 
diff --git a/dts/framework/testbed_model/os_session.py b/dts/framework/testbed_model/os_session.py
index f2dc9b20a9..f88427a53d 100644
--- a/dts/framework/testbed_model/os_session.py
+++ b/dts/framework/testbed_model/os_session.py
@@ -581,6 +581,17 @@ def unbind_ports(self, ports: list[Port]) -> None:
             ports: The list of ports to unbind.
         """
 
+    @abstractmethod
+    def get_nic_info(self) -> list[dict[str, str]]:
+        """Get NIC information.
+
+        Returns:
+            NIC info as a list of dictionaries.
+
+        Raises:
+            ConfigurationError: If the NIC info could not be found.
+        """
+
     @abstractmethod
     def bind_ports_to_driver(self, ports: list[Port], driver_name: str) -> None:
         """Bind `ports` to the given `driver_name`.
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH bpf 1/2] bpf, sockmap: Don't leak UDP socks on lookup-bind-release
From: Kuniyuki Iwashima @ 2026-06-24 21:33 UTC (permalink / raw)
  To: Michal Luczaj
  Cc: Willem de Bruijn, Jakub Sitnicki, John Fastabend, Jiayuan Chen,
	David S. Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Alexei Starovoitov, Cong Wang, Daniel Borkmann,
	Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi,
	Martin KaFai Lau, Song Liu, Yonghong Song, Jiri Olsa,
	Emil Tsalapatis, Shuah Khan, netdev, bpf, linux-kernel,
	linux-kselftest
In-Reply-To: <dd065bfb-52ce-48fd-b1ef-9c6166f714ed@rbox.co>

On Wed, Jun 24, 2026 at 2:26 PM Michal Luczaj <mhal@rbox.co> wrote:
>
> On 6/24/26 22:01, Willem de Bruijn wrote:
> > Jakub Sitnicki wrote:
> >> On Tue, Jun 23, 2026 at 08:03 PM +02, Michal Luczaj wrote:
> >>> UDP sockets get SOCK_RCU_FREE set when (auto-)bound. This means
> >>> sk_is_refcounted(unbound) = true, while sk_is_refcounted(bound) = false.
> >>>
> >>> Because sockmap accepts unbound UDP sockets, a BPF program can increment a
> >>> socket's refcount via lookup. If the socket is subsequently bound, the
> >>> transition from unbound to bound causes bpf_sk_release() to skip the
> >>> decrement of the refcount, causing a memory leak.
> >>>
> >>> unreferenced object 0xffff88810bc2eb40 (size 1984):
> >>>   comm "test_progs", pid 2451, jiffies 4295320596
> >>>   hex dump (first 32 bytes):
> >>>     7f 00 00 01 7f 00 00 01 d2 04 1b b7 04 d2 00 00  ................
> >>>     02 00 01 40 00 00 00 00 00 00 00 00 00 00 00 00  ...@............
> >>>   backtrace (crc bdee079d):
> >>>     kmem_cache_alloc_noprof+0x557/0x660
> >>>     sk_prot_alloc+0x69/0x240
> >>>     sk_alloc+0x30/0x460
> >>>     inet_create+0x2ce/0xf80
> >>>     __sock_create+0x25b/0x5c0
> >>>     __sys_socket+0x119/0x1d0
> >>>     __x64_sys_socket+0x72/0xd0
> >>>     do_syscall_64+0xa1/0x5f0
> >>>     entry_SYSCALL_64_after_hwframe+0x76/0x7e
> >>>
> >>> Maintain balanced refcounts across sk lookup/release: (re-)set
> >>> SOCK_RCU_FREE on proto update to treat the socket (whether bound or
> >>> unbound) as not requiring a refcount increment on (a RCU protected) lookup.
> >>>
> >>> Fixes: 0c48eefae712 ("sock_map: Lift socket state restriction for datagram sockets")
> >>> Signed-off-by: Michal Luczaj <mhal@rbox.co>
> >>> ---
> >>> Note: this issue is related to commit 67312adc96b5 ("bpf: reject unhashed
> >>> sockets in bpf_sk_assign").
> >>> ---
> >>>  net/ipv4/udp_bpf.c | 3 +++
> >>>  1 file changed, 3 insertions(+)
> >>>
> >>> diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
> >>> index ad57c4c9eaab..970327b59582 100644
> >>> --- a/net/ipv4/udp_bpf.c
> >>> +++ b/net/ipv4/udp_bpf.c
> >>> @@ -173,6 +173,9 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
> >>>     if (sk->sk_family == AF_INET6)
> >>>             udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
> >>>
> >>> +   /* Treat all sockets as non-refcounted, regardless of binding state. */
> >>> +   sock_set_flag(sk, SOCK_RCU_FREE);
> >>> +
> >>>     sock_replace_proto(sk, &udp_bpf_prots[family]);
> >>>     return 0;
> >>>  }
> >>
> >> There is a side effect that an unhashed (unbound) UDP socket can now be
> >> selected in sk_lookup with bpf_sk_assign.
> >
> > The commit does mention a related fix, beneath the ---, commit
> > 67312adc96b5 ("bpf: reject unhashed sockets in bpf_sk_assign").
> > That fixes a similar issue by exactly disallowing this:
> >
> >     Fix the problem by rejecting unhashed sockets in bpf_sk_assign().
> >     This matches the behaviour of __inet_lookup_skb which is ultimately
> >     the goal of bpf_sk_assign().
> >
> > So ..
> >
> >> Though perhaps that's for the
> >> better because TC bpf_sk_assign doesn't reject non-refcounted UDP
> >> sockets either, so we would have both socket dispatch sites behave the
> >> same way.
> >
> > .. there are two conflicting types of consistency here? Consistent with
> > __inet_lookup_skb or the TC bpf hook. Of those the first is the more
> > canonical.
> >
> >> Also, with this patch, if we insert & remove an unhashed UDP socket
> >> into/from a sockmap, we end up with an unhashed non-refcounted UDP
> >> socket. Not entirely sure if that is actually a problem or not.
> >>
> >> Willem, what is your take on having unhashed non-refcounted UDP sockets?
> >
> > I don't immediately see a problem, but I'm not an expert on SOCK_RCU_FREE.
>
> Perhaps it's worth mentioning that unhashed non-refcounted UDP socket is
> already possible: first auto-bind via connect(AF_INET) (which also sets
> SOCK_RCU_FREE), then unhash via connect(AF_UNSPEC).

Setting SOCK_RCU_FREE itself should not cause a problem, but I think
we should take a step back.

AFAIU, 0c48eefae712 was to allow putting AF_UNIX SOCK_DGRAM sockets
into sockmap, not to allow using unconnected UDP sockets in sk_lookup etc.

Actually, v4 of the patch was implemented as such but did not get any feedback,
https://lore.kernel.org/bpf/20210508220835.53801-9-xiyou.wangcong@gmail.com/#t

... and v5 (the final commit) somehow removed the restriction for unconnected
UDP socket as well.
https://lore.kernel.org/bpf/20210704190252.11866-3-xiyou.wangcong@gmail.com/

Given that the initial use case, sockmap redirect, is still blocked by the
TCP_ESTABLISHED check in sock_map_redirect_allowed(), I feel there is no
point in supporting unconnected UDP sockets in sockmap.  Such a socket
cannot get any skb from anywhere (without buggy sk_lookup).

^ permalink raw reply

* [PATCH v5] pcapng: add user-supplied timestamp support
From: Dawid Wesierski @ 2026-06-24 21:57 UTC (permalink / raw)
  To: dev; +Cc: dawid.wesierski, marek.kasiewicz, thomas, stephen, mb
In-Reply-To: <20260623141302.486601-1-dawid.wesierski@intel.com>

Introduce rte_pcapng_copy_ts() alongside the existing rte_pcapng_copy()
so that callers with a hardware PTP or pre-captured timestamp can inject
an exact epoch-ns value directly into the packet record.

Timestamp handling in rte_pcapng_copy_ts():
 - ts != 0: caller-supplied nanoseconds since the Unix epoch, stored as-is.
 - ts == 0: TSC captured at copy time with bit 63 set as a sentinel.
   rte_pcapng_write_packets() detects the sentinel and converts the TSC to
   epoch ns using the file's calibrated clock.  The TSC will not reach
   bit 63 for centuries, and epoch-ns values stay below bit 63 until 2554,
   so the bit is safe to use as a disambiguation flag.

rte_pcapng_copy() is retained as a real exported function (not an inline
wrapper) so the stable ABI symbol is preserved.  It simply calls
rte_pcapng_copy_ts(..., 0) to capture the current TSC.

rte_pcapng_tsc_to_ns() is added as a new experimental helper (addressing
review requests from Stephen Hemminger and Morten Brørup).  It exposes the
same calibrated, drift-compensated, divide-free TSC-to-epoch-ns conversion
used internally by rte_pcapng_write_packets(), allowing callers to convert
a TSC captured at packet arrival time before passing it to
rte_pcapng_copy_ts().

Signed-off-by: Marek Kasiewicz <marek.kasiewicz@intel.com>
Signed-off-by: Dawid Wesierski <dawid.wesierski@intel.com>
---

Hi Stephen, Morten,
Thank you for the feedback on the previous versions.
In this version I added a unit test case in app/test/test_pcapng.c to
verify the TSC-to-NS conversion and the custom timestamp injection.


Regards,
Dawid Węsierski

 .mailmap                |   2 +
 app/test/test_pcapng.c  | 108 ++++++++++++++++++++++++++++++++++++++++
 lib/pcapng/rte_pcapng.c |  42 +++++++++++++---
 lib/pcapng/rte_pcapng.h |  53 ++++++++++++++++++++
 4 files changed, 199 insertions(+), 6 deletions(-)

diff --git a/.mailmap b/.mailmap
index 4001e5fb0e..a7d97a631e 100644
--- a/.mailmap
+++ b/.mailmap
@@ -366,6 +366,7 @@ David Zeng <zengxhsh@cn.ibm.com>
 Davide Caratti <dcaratti@redhat.com>
 Dawid Gorecki <dgr@semihalf.com>
 Dawid Jurczak <dawid_jurek@vp.pl>
+Dawid Wesierski <dawid.wesierski@intel.com> Wesierski, Dawid <dawid.wesierski@intel.com>
 Dawid Zielinski <dawid.zielinski@intel.com>
 Dawid Łukwiński <dawid.lukwinski@intel.com>
 Daxue Gao <daxuex.gao@intel.com>
@@ -1014,6 +1015,7 @@ Marcin Wilk <marcin.wilk@caviumnetworks.com>
 Marcin Wojtas <mw@semihalf.com>
 Marcin Zapolski <marcinx.a.zapolski@intel.com>
 Marco Varlese <mvarlese@suse.de>
+Marek Kasiewicz <marek.kasiewicz@intel.com>
 Marek Mical <marekx.mical@intel.com>
 Marek Zalfresso-jundzillo <marekx.zalfresso-jundzillo@intel.com>
 Maria Lingemark <maria.lingemark@ericsson.com>
diff --git a/app/test/test_pcapng.c b/app/test/test_pcapng.c
index 298bcbd31f..0554c33369 100644
--- a/app/test/test_pcapng.c
+++ b/app/test/test_pcapng.c
@@ -672,6 +672,113 @@ test_write_before_open(void)
 	return -1;
 }
 
+static int
+test_pcapng_timestamp(void)
+{
+	char file_name[PATH_MAX] = "/tmp/pcapng_test_XXXXXX.pcapng";
+	rte_pcapng_t *pcapng = NULL;
+	int ret, tmp_fd;
+	struct dummy_mbuf mbfs;
+	struct rte_mbuf *orig, *mc;
+	uint64_t now_ns, tsc, ns_from_tsc, pcap_ts;
+
+	tmp_fd = mkstemps(file_name, strlen(".pcapng"));
+	if (tmp_fd == -1) {
+		perror("mkstemps() failure");
+		goto fail;
+	}
+
+	pcapng = rte_pcapng_fdopen(tmp_fd, NULL, NULL, "pcapng_ts_test", NULL);
+	if (pcapng == NULL) {
+		printf("rte_pcapng_fdopen failed\n");
+		close(tmp_fd);
+		goto fail;
+	}
+
+	ret = rte_pcapng_add_interface(pcapng, port_id, DLT_EN10MB, NULL, NULL, NULL);
+	if (ret < 0) {
+		printf("can not add port %u\n", port_id);
+		goto fail;
+	}
+
+	/* Test 1: rte_pcapng_tsc_to_ns */
+	tsc = rte_get_tsc_cycles();
+	now_ns = current_timestamp();
+	ns_from_tsc = rte_pcapng_tsc_to_ns(pcapng, tsc);
+
+	/* Check if TSC-derived NS is reasonably close to wall clock NS (within 100ms) */
+	if (ns_from_tsc > now_ns + 100000000 || ns_from_tsc < now_ns - 100000000) {
+		printf("TSC to NS conversion failed: tsc=%"PRIu64
+		       " ns_from_tsc=%"PRIu64" now_ns=%"PRIu64"\n",
+		       tsc, ns_from_tsc, now_ns);
+		goto fail;
+	}
+
+	/* Test 2: rte_pcapng_copy_ts with explicit timestamp */
+	mbuf1_prepare(&mbfs);
+	orig = &mbfs.mb[0];
+	pcap_ts = now_ns + 1000000000; /* 1 second in future to be distinct */
+
+	mc = rte_pcapng_copy_ts(port_id, 0, orig, mp, rte_pktmbuf_pkt_len(orig),
+				RTE_PCAPNG_DIRECTION_IN, "custom_ts", pcap_ts);
+	if (mc == NULL) {
+		printf("rte_pcapng_copy_ts failed\n");
+		goto fail;
+	}
+
+	/* Write it */
+	ret = rte_pcapng_write_packets(pcapng, &mc, 1);
+	rte_pktmbuf_free(mc);
+	if (ret <= 0) {
+		printf("Write of custom timestamp packet failed\n");
+		goto fail;
+	}
+
+	rte_pcapng_close(pcapng);
+
+	/* Validate the file using libpcap */
+	/* We expect 1 packet with timestamp exactly pcap_ts */
+	{
+		char errbuf[PCAP_ERRBUF_SIZE];
+		pcap_t *pcap;
+		struct pcap_pkthdr h;
+		const u_char *bytes;
+		uint64_t ns;
+
+		pcap = pcap_open_offline_with_tstamp_precision(file_name,
+							       PCAP_TSTAMP_PRECISION_NANO,
+							       errbuf);
+		if (pcap == NULL) {
+			printf("pcap_open_offline failed: %s\n", errbuf);
+			goto fail;
+		}
+
+		bytes = pcap_next(pcap, &h);
+		if (bytes == NULL) {
+			printf("No packets in file\n");
+			pcap_close(pcap);
+			goto fail;
+		}
+
+		ns = (uint64_t)h.ts.tv_sec * NS_PER_S + h.ts.tv_usec;
+		if (ns != pcap_ts) {
+			printf("Timestamp mismatch: expected %"PRIu64" got %"PRIu64"\n",
+			       pcap_ts, ns);
+			pcap_close(pcap);
+			goto fail;
+		}
+		pcap_close(pcap);
+	}
+
+	remove(file_name);
+	return 0;
+
+fail:
+	if (pcapng)
+		rte_pcapng_close(pcapng);
+	return -1;
+}
+
 static void
 test_cleanup(void)
 {
@@ -688,6 +795,7 @@ unit_test_suite test_pcapng_suite  = {
 		TEST_CASE(test_add_interface),
 		TEST_CASE(test_write_packets),
 		TEST_CASE(test_write_before_open),
+		TEST_CASE(test_pcapng_timestamp),
 		TEST_CASES_END()
 	}
 };
diff --git a/lib/pcapng/rte_pcapng.c b/lib/pcapng/rte_pcapng.c
index b5d1026891..84c427fb2d 100644
--- a/lib/pcapng/rte_pcapng.c
+++ b/lib/pcapng/rte_pcapng.c
@@ -37,6 +37,9 @@
 /* upper bound for strings in pcapng option data */
 #define PCAPNG_STR_MAX	UINT16_MAX
 
+/* Flag to indicate timestamp is in TSC cycles (bit 63) */
+#define PCAPNG_TSC_FLAG	(1ULL << 63)
+
 /*
  * Converter from TSC values to nanoseconds since Unix epoch.
  * Uses reciprocal multiply to avoid runtime division.
@@ -480,6 +483,13 @@ rte_pcapng_mbuf_size(uint32_t length)
 		+ sizeof(uint32_t);		  /*  length */
 }
 
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_pcapng_tsc_to_ns, 26.07)
+uint64_t
+rte_pcapng_tsc_to_ns(const rte_pcapng_t *self, uint64_t tsc)
+{
+	return tsc_to_ns_epoch(&self->clock, tsc);
+}
+
 /* More generalized version rte_vlan_insert() */
 static int
 pcapng_vlan_insert(struct rte_mbuf *m, uint16_t ether_type, uint16_t tci)
@@ -554,11 +564,24 @@ rte_pcapng_copy(uint16_t port_id, uint32_t queue,
 		uint32_t length,
 		enum rte_pcapng_direction direction,
 		const char *comment)
+{
+	return rte_pcapng_copy_ts(port_id, queue, md, mp, length, direction,
+				  comment, 0);
+}
+
+RTE_EXPORT_EXPERIMENTAL_SYMBOL(rte_pcapng_copy_ts, 26.07)
+struct rte_mbuf *
+rte_pcapng_copy_ts(uint16_t port_id, uint32_t queue,
+		const struct rte_mbuf *md,
+		struct rte_mempool *mp,
+		uint32_t length,
+		enum rte_pcapng_direction direction,
+		const char *comment,
+		uint64_t timestamp)
 {
 	struct pcapng_enhance_packet_block *epb;
 	uint32_t orig_len, pkt_len, padding, flags;
 	struct pcapng_option *opt;
-	uint64_t timestamp;
 	uint16_t optlen;
 	struct rte_mbuf *mc;
 	bool rss_hash;
@@ -690,8 +713,13 @@ rte_pcapng_copy(uint16_t port_id, uint32_t queue,
 	/* Interface index is filled in later during write */
 	mc->port = port_id;
 
-	/* Put timestamp in cycles here - adjust in packet write */
-	timestamp = rte_get_tsc_cycles();
+	/*
+	 * Use caller provided timestamp.
+	 * If none provided, use current TSC and set flag.
+	 */
+	if (timestamp == 0)
+		timestamp = rte_get_tsc_cycles() | PCAPNG_TSC_FLAG;
+
 	epb->timestamp_hi = timestamp >> 32;
 	epb->timestamp_lo = (uint32_t)timestamp;
 	epb->capture_length = pkt_len;
@@ -743,9 +771,11 @@ rte_pcapng_write_packets(rte_pcapng_t *self,
 		 */
 		cycles = (uint64_t)epb->timestamp_hi << 32;
 		cycles += epb->timestamp_lo;
-		timestamp = tsc_to_ns_epoch(&self->clock, cycles);
-		epb->timestamp_hi = timestamp >> 32;
-		epb->timestamp_lo = (uint32_t)timestamp;
+		if (cycles & PCAPNG_TSC_FLAG) {
+			timestamp = tsc_to_ns_epoch(&self->clock, cycles & ~PCAPNG_TSC_FLAG);
+			epb->timestamp_hi = timestamp >> 32;
+			epb->timestamp_lo = (uint32_t)timestamp;
+		}
 
 		/*
 		 * Handle case of highly fragmented and large burst size
diff --git a/lib/pcapng/rte_pcapng.h b/lib/pcapng/rte_pcapng.h
index d8d328f710..42c42ca60c 100644
--- a/lib/pcapng/rte_pcapng.h
+++ b/lib/pcapng/rte_pcapng.h
@@ -140,6 +140,59 @@ rte_pcapng_copy(uint16_t port_id, uint32_t queue,
 		uint32_t length,
 		enum rte_pcapng_direction direction, const char *comment);
 
+/**
+ * Format an mbuf for writing to file with a custom timestamp.
+ *
+ * @param port_id
+ *   The Ethernet port on which packet was received
+ *   or is going to be transmitted.
+ * @param queue
+ *   The queue on the Ethernet port where packet was received
+ *   or is going to be transmitted.
+ * @param mp
+ *   The mempool from which the "clone" mbufs are allocated.
+ * @param m
+ *   The mbuf to copy
+ * @param length
+ *   The upper limit on bytes to copy.  Passing UINT32_MAX
+ *   means all data (after offset).
+ * @param direction
+ *   The direction of the packet: receive, transmit or unknown.
+ * @param comment
+ *   Optional per packet comment.
+ *   Truncated to UINT16_MAX characters.
+ * @param timestamp
+ *   Nanoseconds since the Unix epoch. If zero, TSC is captured and
+ *   converted at write time.
+ *
+ * @return
+ *   - The pointer to the new mbuf formatted for pcapng_write
+ *   - NULL on error such as invalid port or out of memory.
+ */
+__rte_experimental
+struct rte_mbuf *
+rte_pcapng_copy_ts(uint16_t port_id, uint32_t queue,
+		const struct rte_mbuf *m, struct rte_mempool *mp,
+		uint32_t length,
+		enum rte_pcapng_direction direction,
+		const char *comment, uint64_t timestamp);
+
+/**
+ * Convert a TSC value to nanoseconds since the Unix epoch.
+ *
+ * Uses the calibrated clock of the capture file.
+ *
+ * @param self
+ *  The handle to the packet capture file
+ * @param tsc
+ *  The TSC value to convert
+ * @return
+ *  Nanoseconds since Unix epoch
+ */
+__rte_experimental
+uint64_t
+rte_pcapng_tsc_to_ns(const rte_pcapng_t *self, uint64_t tsc);
+
 
 /**
  * Determine optimum mbuf data size.
-- 
2.47.3

---------------------------------------------------------------------
Intel Technology Poland sp. z o.o.
ul. Slowackiego 173 | 80-298 Gdansk | Sad Rejonowy Gdansk Polnoc | VII Wydzial Gospodarczy Krajowego Rejestru Sadowego - KRS 101882 | NIP 957-07-52-316 | Kapital zakladowy 200.000 PLN.
Spolka oswiadcza, ze posiada status duzego przedsiebiorcy w rozumieniu ustawy z dnia 8 marca 2013 r. o przeciwdzialaniu nadmiernym opoznieniom w transakcjach handlowych.

Ta wiadomosc wraz z zalacznikami jest przeznaczona dla okreslonego adresata i moze zawierac informacje poufne. W razie przypadkowego otrzymania tej wiadomosci, prosimy o powiadomienie nadawcy oraz trwale jej usuniecie; jakiekolwiek przegladanie lub rozpowszechnianie jest zabronione.
This e-mail and any attachments may contain confidential material for the sole use of the intended recipient(s). If you are not the intended recipient, please contact the sender and delete all copies; any review or distribution by others is strictly prohibited.

^ permalink raw reply related

* Re: [PATCH v3 2/4] Documentation: ABI: Add ABI doc for versal edac sysfs
From: sashiko-bot @ 2026-06-24 21:32 UTC (permalink / raw)
  To: Rama devi Veggalam; +Cc: robh, conor+dt, devicetree
In-Reply-To: <20260624212545.2850787-3-rama.devi.veggalam@amd.com>

Thank you for your contribution! Sashiko AI review found 4 potential issue(s) to consider:
- [Medium] The sysfs entries implement a stateful RPC command-response interface, causing data races.
- [Medium] Multiple sysfs attributes output complex, multiline blocks instead of single values.
- [Low] Malformed ABI Documentation Syntax.
- [Low] Duplicate and conflicting entries in ABI Documentation.
--

commit 9760cee2db24f4eb5850706de75cd24db0ae078c
Author: Rama devi Veggalam <rama.devi.veggalam@amd.com>

Documentation: ABI: Add ABI doc for versal edac sysfs

Add documentation for the sysfs entries created for
versal edac (XilSEM).

> diff --git a/Documentation/ABI/testing/sysfs-driver-versal-edac b/Documentation/ABI/testing/sysfs-driver-versal-edac
> new file mode 100644
> index 0000000000000..c35d864fcbe2c
> --- /dev/null
> +++ b/Documentation/ABI/testing/sysfs-driver-versal-edac
> @@ -0,0 +1,303 @@
> +What:		/sys/devices/system/edac/versal_xilsem/xsem_scan_control
> +Date:		What:		/sys/devices/system/edac/mc/mc0/xsem_scan_control
> +Date:		June 2026

[Severity: Low]
Is there malformed syntax in this documentation block? The second line has a 
duplicate "What:" definition inside the "Date:" field.

> +Contact:	rama.devi.veggalam@amd.com
> +Description:
> +		It is a read/write file.
> +		Writing to this file causes the software to initiate a
> +		request to the firmware for doing requested scan operation in
> +		Configuration RAM (CRAM) or NPI of Versal mono/SSIT devices.
> +		User needs to provide scan operation id (init, start, stop) and
> +		SLR id (Versal SSIT device) details. For Versal mono devices,
> +		the SLR id should be 0.
> +		The scan operation id values are as given below:
> +		1 - Initialize the scan
> +		2 - Start CRAM scan
> +		3 - Stop CRAM scan
> +		5 - Start NPI scan
> +		6 - Stop NPI scan
> +		7 - Inject NPI error in first descriptor
> +
> +		When read, it shows the current scan status with error code.

[Severity: Medium]
Could this stateful command-response interface introduce data races?

In drivers/edac/versal_edac.c, xsem_scan_control_store() writes the command
result directly to the shared, lockless array:

    ret = zynqmp_pm_xilsem_cntrl_ops(cmd, slrid, priv->xsem_rtca->scan_ctrl_status);

Then xsem_scan_control_show() reads from it:

    return sprintf(data, "[0x%x][0x%x][0x%x][0x%x][0x%x][0x%x][0x%x]\n\r",
                   priv->xsem_rtca->scan_ctrl_status[0],

Will concurrent writes to xsem_scan_control clobber the shared buffers
in priv->xsem_rtca->scan_ctrl_status before either process reads the
attribute, leading to incorrect results?

[ ... ]

> +What:		/sys/devices/system/edac/mc/mc0/xsem_read_ssit_status
> +Date:		June 2026
> +Contact:	rama.devi.veggalam@amd.com
> +Description:
> +		It is a read/write file.
> +		Writing to this file causes the software to initiate a
> +		request to read CRAM and NPI scan status information in a given SLR.
> +		User needs to provide SLR id in Versal SSIT device for which
> +		XilSEM status to be read. When read, it shows
> +		the XilSEM status of the requested SLR.
> +		The format is: <Header> <SLR id> <NPI status info> <CRAM status info>
> +		Example: Read SLR Status Cmd:[0x1030d]
> +		SLR ID:[0x3]
> +		NPI status:[0xa01]

[Severity: Medium]
Will this sysfs attribute violate the single-value-per-file guideline?

Looking at xsem_read_ssit_status_show() in drivers/edac/versal_edac.c, it
generates a multiline block instead of a single value:

    offset += sprintf(data + offset, "Read SLR Status Cmd:[0x%x]\n\r",
                      priv->xsem_rtca->xilsem_ssit_status[0]);
    offset += sprintf(data + offset, "SLR ID:[0x%x]\n\r",
                      priv->xsem_rtca->xilsem_ssit_status[1]);
    offset += sprintf(data + offset, "NPI status:[0x%x]\n\r",

Does this complex multiline output break standard userspace tools that
parse sysfs?

[ ... ]

> +What:		/sys/devices/system/edac/versal_xilsem/xsem_cram_injecterr
> +Date:		July 2025
> +Contact:	rama.devi.veggalam@amd.com

[Severity: Low]
Are there duplicate and conflicting entries in the documentation?

The path /sys/devices/system/edac/versal_xilsem/xsem_cram_injecterr 
listed here appears to conflict with the /sys/devices/system/edac/mc/mc0/ 
path documented earlier for the same attribute.

> +Description:
> +		It is a read/write file.

-- 
Sashiko AI review · https://sashiko.dev/#/patchset/20260624212545.2850787-1-rama.devi.veggalam@amd.com?part=2

^ permalink raw reply

* [PATCH 19/19] KVM: x86/mmu: use kvm_page_format to test SPTEs
From: Paolo Bonzini @ 2026-06-24 21:31 UTC (permalink / raw)
  To: linux-kernel, kvm
In-Reply-To: <20260624213102.71082-1-pbonzini@redhat.com>

is_access_allowed(), and is_executable_pte() within it, are effectively
a special version of permission_fault() that only supports a subset
of roles.  In particular it does not allow SMEP, SMAP and PKE.

Replace its implementation with a modified version of permission_fault();
the new version will support SMEP (and hence AMD GMET) for free as soon
as update_spte_permission_bitmask() stops hardcoding cr4_smep == false.

This prepares for a possible future where TDP entries could have XS!=XU,
for example as part of implementing Hyper-V VSM natively inside KVM.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c     | 18 +++++++++++---
 arch/x86/kvm/mmu/spte.h    | 51 ++++++++++++++++++++++----------------
 arch/x86/kvm/mmu/tdp_mmu.c |  3 ++-
 3 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 11d86e34acf4..6c13da942bfc 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -3694,6 +3694,7 @@ static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte)
  */
 static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 {
+	struct kvm_mmu *mmu;
 	struct kvm_mmu_page *sp;
 	int ret = RET_PF_INVALID;
 	u64 spte;
@@ -3703,6 +3704,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 	if (!page_fault_can_be_fast(vcpu->kvm, fault))
 		return ret;
 
+	mmu = vcpu->arch.mmu;
 	walk_shadow_page_lockless_begin(vcpu);
 
 	do {
@@ -3738,7 +3740,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		 * Need not check the access of upper level table entries since
 		 * they are always ACC_ALL.
 		 */
-		if (is_access_allowed(fault, spte)) {
+		if (!spte_permission_fault(mmu, spte, fault)) {
 			ret = RET_PF_SPURIOUS;
 			break;
 		}
@@ -3761,7 +3763,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 		 * that were write-protected for dirty-logging or access
 		 * tracking are handled here.  Don't bother checking if the
 		 * SPTE is writable to prioritize running with A/D bits enabled.
-		 * The is_access_allowed() check above handles the common case
+		 * The spte_permission_fault() check above handles the common case
 		 * of the fault being spurious, and the SPTE is known to be
 		 * shadow-present, i.e. except for access tracking restoration
 		 * making the new SPTE writable, the check is wasteful.
@@ -3786,7 +3788,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault)
 
 		/* Verify that the fault can be handled in the fast path */
 		if (new_spte == spte ||
-		    !is_access_allowed(fault, new_spte))
+		    spte_permission_fault(mmu, new_spte, fault))
 			break;
 
 		/*
@@ -5764,6 +5766,12 @@ static void update_permission_bitmask(struct kvm_pagewalk *w, bool tdp, bool ept
 				    is_cr0_wp(w), is_efer_nx(w));
 }
 
+static void update_spte_permission_bitmask(struct kvm_mmu *mmu, bool tdp, bool ept)
+{
+	__update_permission_bitmask(&mmu->fmt, tdp, ept,
+				    mmu->root_role.cr4_smep, false, true, true);
+}
+
 /*
 * PKU is an additional mechanism by which the paging controls access to
 * user-mode addresses based on the value in the PKRU register.  Protection
@@ -5973,6 +5981,7 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 	context->page_fault = kvm_tdp_page_fault;
 	context->sync_spte = NULL;
 
+	update_spte_permission_bitmask(context, true, shadow_xs_mask);
 	reset_tdp_shadow_zero_bits_mask(context);
 }
 
@@ -5991,6 +6000,7 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 	else
 		paging32_init_context(context);
 
+	update_spte_permission_bitmask(context, context == &vcpu->arch.guest_mmu, false);
 	reset_shadow_zero_bits_mask(vcpu, context);
 }
 
@@ -6119,6 +6129,8 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		update_permission_bitmask(ngpa_walk, true, true);
 		ngpa_walk->fmt.pkru_mask = 0;
 		reset_rsvds_bits_mask_ept(vcpu, execonly, huge_page_level);
+
+		update_spte_permission_bitmask(context, true, true);
 		reset_ept_shadow_zero_bits_mask(context, execonly);
 	}
 
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 918533e61b98..e730717824b3 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -357,17 +357,6 @@ static inline bool is_last_spte(u64 pte, int level)
 	return (level == PG_LEVEL_4K) || is_large_pte(pte);
 }
 
-static inline bool is_executable_pte(u64 spte)
-{
-	/*
-	 * For now, return true if either the XS or XU bit is set
-	 * This function is only used for fast_page_fault,
-	 * which never processes shadow EPT, and regular page
-	 * tables always have XS==XU.
-	 */
-	return (spte & (shadow_xs_mask | shadow_xu_mask | shadow_nx_mask)) != shadow_nx_mask;
-}
-
 static inline kvm_pfn_t spte_to_pfn(u64 pte)
 {
 	return (pte & SPTE_BASE_ADDR_MASK) >> PAGE_SHIFT;
@@ -496,20 +485,40 @@ static inline bool is_mmu_writable_spte(u64 spte)
 }
 
 /*
- * Returns true if the access indicated by @fault is allowed by the existing
- * SPTE protections.  Note, the caller is responsible for checking that the
- * SPTE is a shadow-present, leaf SPTE (either before or after).
+ * Returns true if the access indicated by @fault is forbidden by the existing
+ * SPTE protections.
  */
-static inline bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
+static inline bool spte_permission_fault(struct kvm_mmu *mmu, u64 spte,
+					 struct kvm_page_fault *fault)
 {
-	if (fault->exec)
-		return is_executable_pte(spte);
+	unsigned pfec, pte_access;
 
-	if (fault->write)
-		return is_writable_pte(spte);
+	if (!is_shadow_present_pte(spte))
+		return true;
 
-	/* Fault was on Read access */
-	return spte & PT_PRESENT_MASK;
+	BUILD_BUG_ON(PT_PRESENT_MASK != ACC_READ_MASK);
+	BUILD_BUG_ON(PT_WRITABLE_MASK != ACC_WRITE_MASK);
+	BUILD_BUG_ON(VMX_EPT_READABLE_MASK != ACC_READ_MASK);
+	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != ACC_WRITE_MASK);
+
+	/* strip nested paging fault error codes */
+	pte_access = spte & (PT_PRESENT_MASK | PT_WRITABLE_MASK);
+	if (shadow_nx_mask) {
+		pte_access |= spte & shadow_user_mask ? ACC_USER_MASK : 0;
+		pte_access |= spte & shadow_nx_mask ? 0 : ACC_EXEC_MASK;
+	} else {
+		pte_access |= spte & shadow_xs_mask ? ACC_EXEC_MASK : 0;
+		pte_access |= spte & shadow_xu_mask ? ACC_USER_EXEC_MASK : 0;
+	}
+
+	/*
+	 * RSVD is handled elsewhere, and is used for SMAP in the context
+	 * of accessing fmt.permissions[].  SPTEs never use PK or SS, as
+	 * they are not supported for shadow paging and irrelevant for TDP.
+	 */
+	pfec = fault->error_code & (
+		PFERR_WRITE_MASK | PFERR_USER_MASK | PFERR_FETCH_MASK);
+	return (mmu->fmt.permissions[pfec >> 1] >> pte_access) & 1;
 }
 
 /*
diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c
index c1cbae65d239..ce3f2efadb05 100644
--- a/arch/x86/kvm/mmu/tdp_mmu.c
+++ b/arch/x86/kvm/mmu/tdp_mmu.c
@@ -1122,6 +1122,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 					  struct kvm_page_fault *fault,
 					  struct tdp_iter *iter)
 {
+	struct kvm_mmu *mmu = vcpu->arch.mmu;
 	struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep));
 	u64 new_spte;
 	int ret = RET_PF_FIXED;
@@ -1131,7 +1132,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu,
 		return RET_PF_RETRY;
 
 	if (is_shadow_present_pte(iter->old_spte) &&
-	    (fault->prefetch || is_access_allowed(fault, iter->old_spte)) &&
+	    (fault->prefetch || !spte_permission_fault(mmu, iter->old_spte, fault)) &&
 	    is_last_spte(iter->old_spte, iter->level)) {
 		WARN_ON_ONCE(fault->pfn != spte_to_pfn(iter->old_spte));
 		return RET_PF_SPURIOUS;
-- 
2.52.0


^ permalink raw reply related

* [PATCH 14/19] KVM: x86/mmu: unify root_gva_walk and ngva_walk
From: Paolo Bonzini @ 2026-06-24 21:30 UTC (permalink / raw)
  To: linux-kernel, kvm
In-Reply-To: <20260624213102.71082-1-pbonzini@redhat.com>

At this point, vcpu->arch.gva_walk and vcpu->arch.root_mmu.w contain
the same information (at least when KVM is not running a nested guest,
i.e. when root_mmu is actually in use); compare init_kvm_page_walk()
on one side with init_kvm_softmmu() + shadow_mmu_init_context() on
the other.  root_mmu.w is still used by shadow paging, via
FNAME(walk_addr) and its callers.

Always use the same instance of kvm_pagewalk to do GVA->GPA translations,
instead of flipping the gva_walk pointer back and forth.  After all, page
walking behaves the same whether or not you are in guest mode; the
difference lies in the behavior of kvm_translate_gpa and thus in
vcpu->arch.mmu, not in the page walker itself.

vcpu->arch.guest_mmu.w instead is used for both guest emulation
(kvm_translate_gpa) and shadow paging.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  13 +---
 arch/x86/kvm/hyperv.c           |   2 +-
 arch/x86/kvm/mmu.h              |   8 +--
 arch/x86/kvm/mmu/mmu.c          | 120 +++++++++++---------------------
 arch/x86/kvm/mmu/paging_tmpl.h  |   4 +-
 arch/x86/kvm/regs.c             |   2 +-
 arch/x86/kvm/svm/nested.c       |   2 -
 arch/x86/kvm/vmx/nested.c       |   3 -
 arch/x86/kvm/x86.c              |  18 ++---
 9 files changed, 60 insertions(+), 112 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8b9cf364c9f6..8a2126ca49c4 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -905,26 +905,15 @@ struct kvm_vcpu_arch {
 
 	/* Non-nested MMU for L1 */
 	struct kvm_mmu root_mmu;
-	struct kvm_pagewalk root_gva_walk;
 
 	/* L1 TDP when running nested */
 	struct kvm_mmu guest_mmu;
 	struct kvm_pagewalk ngpa_walk;
 
-	/*
-	 * Paging state of an L2 guest (used for nested npt)
-	 *
-	 * This context will save all necessary information to walk page tables
-	 * of an L2 guest. This context is only initialized for page table
-	 * walking and not for faulting since we never handle l2 page faults on
-	 * the host.
-	 */
-	struct kvm_pagewalk ngva_walk;
-
 	/*
 	 * Pagewalk context used for gva_to_gpa translations.
 	 */
-	struct kvm_pagewalk *gva_walk;
+	struct kvm_pagewalk gva_walk;
 
 	u64 pdptrs[4]; /* pae */
 
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index 51d812babe73..1ee0d23f8949 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -2046,7 +2046,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 	 * read with kvm_read_guest().
 	 */
 	if (!hc->fast) {
-		hc->ingpa = kvm_translate_gpa(vcpu, vcpu->arch.gva_walk, hc->ingpa,
+		hc->ingpa = kvm_translate_gpa(vcpu, &vcpu->arch.gva_walk, hc->ingpa,
 					      PFERR_GUEST_FINAL_MASK, NULL, 0);
 		if (unlikely(hc->ingpa == INVALID_GPA))
 			return HV_STATUS_INVALID_HYPERCALL_INPUT;
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 1631fd43c9a1..9d00d0eb230b 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -253,9 +253,9 @@ static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
 	 * @w's snapshot of CR0.WP and thus all related paging metadata may
 	 * be stale.  Refresh CR0.WP and the metadata on-demand when checking
 	 * for permission faults.  Exempt nested MMUs, i.e. MMUs for shadowing
-	 * nEPT and nNPT, as CR0.WP is ignored in both cases.  Note, KVM does
-	 * need to refresh ngva_walk, a.k.a. the walker used to translate L2
-	 * GVAs to GPAs, so as to honor L2's CR0.WP.
+	 * nEPT and nNPT, as CR0.WP is ignored in both cases.  Note, KVM will
+	 * still refresh gva_walk, so as to honor L2's CR0.WP when translating
+	 * L2 GVAs to GPAs.
 	 */
 	if (!tdp_enabled || w == &vcpu->arch.ngpa_walk)
 		return;
@@ -382,7 +382,7 @@ static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
 				      struct x86_exception *exception,
 				      u64 pte_access)
 {
-	if (w != &vcpu->arch.ngva_walk)
+	if (!mmu_is_nested(vcpu) || w == &vcpu->arch.ngpa_walk)
 		return gpa;
 	return kvm_x86_ops.nested_ops->translate_nested_gpa(vcpu, gpa, access,
 							    exception,
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 3ffaa48b566e..a464e3ec26ee 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5212,7 +5212,6 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn);
 static void nonpaging_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = nonpaging_page_fault;
-	context->w->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->sync_spte = NULL;
 }
 
@@ -5843,14 +5842,12 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
 static void paging64_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = paging64_page_fault;
-	context->w->gva_to_gpa = paging64_gva_to_gpa;
 	context->sync_spte = paging64_sync_spte;
 }
 
 static void paging32_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = paging32_page_fault;
-	context->w->gva_to_gpa = paging32_gva_to_gpa;
 	context->sync_spte = paging32_sync_spte;
 }
 
@@ -5965,39 +5962,22 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
 
-	if (cpu_role.as_u64 == context->w->cpu_role.as_u64 &&
-	    root_role.word == context->root_role.word)
+	if (root_role.word == context->root_role.word)
 		return;
 
-	context->w->cpu_role.as_u64 = cpu_role.as_u64;
 	context->root_role.word = root_role.word;
 	context->page_fault = kvm_tdp_page_fault;
 	context->sync_spte = NULL;
 
-	context->w->inject_page_fault = kvm_inject_page_fault;
-	context->w->get_pdptr = kvm_pdptr_read;
-	context->w->get_guest_pgd = get_guest_cr3;
-
-	if (!is_cr0_pg(context->w))
-		context->w->gva_to_gpa = nonpaging_gva_to_gpa;
-	else if (is_cr4_pae(context->w))
-		context->w->gva_to_gpa = paging64_gva_to_gpa;
-	else
-		context->w->gva_to_gpa = paging32_gva_to_gpa;
-
-	reset_guest_paging_metadata(vcpu, context->w);
 	reset_tdp_shadow_zero_bits_mask(context);
 }
 
 static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context,
-				    union kvm_cpu_role cpu_role,
 				    union kvm_mmu_page_role root_role)
 {
-	if (cpu_role.as_u64 == context->w->cpu_role.as_u64 &&
-	    root_role.word == context->root_role.word)
+	if (root_role.word == context->root_role.word)
 		return;
 
-	context->w->cpu_role.as_u64 = cpu_role.as_u64;
 	context->root_role.word = root_role.word;
 
 	if (!is_cr0_pg(context->w))
@@ -6007,7 +5987,6 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 	else
 		paging32_init_context(context);
 
-	reset_guest_paging_metadata(vcpu, context->w);
 	reset_shadow_zero_bits_mask(vcpu, context);
 }
 
@@ -6033,7 +6012,28 @@ static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
 	 */
 	root_role.efer_nx = true;
 
-	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
+	shadow_mmu_init_context(vcpu, context, root_role);
+}
+
+static void init_kvm_page_walk(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
+			       union kvm_cpu_role cpu_role)
+{
+	if (cpu_role.as_u64 == w->cpu_role.as_u64)
+		return;
+
+	w->cpu_role.as_u64   = cpu_role.as_u64;
+	w->inject_page_fault = kvm_inject_page_fault;
+	w->get_pdptr         = kvm_pdptr_read;
+	w->get_guest_pgd     = get_guest_cr3;
+
+	if (!is_cr0_pg(w))
+		w->gva_to_gpa = nonpaging_gva_to_gpa;
+	else if (is_cr4_pae(w))
+		w->gva_to_gpa = paging64_gva_to_gpa;
+	else
+		w->gva_to_gpa = paging32_gva_to_gpa;
+
+	reset_guest_paging_metadata(vcpu, w);
 }
 
 void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4,
@@ -6052,13 +6052,15 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr4,
 	WARN_ON_ONCE(cpu_role.base.direct || !cpu_role.base.guest_mode);
 	cpu_role.base.cr4_smep = (misc_ctl & SVM_MISC_ENABLE_GMET) != 0;
 
+	init_kvm_page_walk(vcpu, &vcpu->arch.ngpa_walk, cpu_role);
+
 	root_role = cpu_role.base;
 	root_role.level = kvm_mmu_get_tdp_level(vcpu);
 	if (root_role.level == PT64_ROOT_5LEVEL &&
 	    cpu_role.base.level == PT64_ROOT_4LEVEL)
 		root_role.passthrough = 1;
 
-	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
+	shadow_mmu_init_context(vcpu, context, root_role);
 	kvm_mmu_new_pgd(vcpu, nested_cr3);
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_npt_mmu);
@@ -6123,46 +6125,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_ept_mmu);
 static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
 			     union kvm_cpu_role cpu_role)
 {
-	struct kvm_mmu *context = &vcpu->arch.root_mmu;
-
 	kvm_init_shadow_mmu(vcpu, cpu_role);
-
-	context->w->inject_page_fault = kvm_inject_page_fault;
-	context->w->get_pdptr         = kvm_pdptr_read;
-	context->w->get_guest_pgd     = get_guest_cr3;
-}
-
-static void init_kvm_ngva_walk(struct kvm_vcpu *vcpu,
-				union kvm_cpu_role new_mode)
-{
-	struct kvm_pagewalk *g_context = &vcpu->arch.ngva_walk;
-
-	if (new_mode.as_u64 == g_context->cpu_role.as_u64)
-		return;
-
-	g_context->cpu_role.as_u64   = new_mode.as_u64;
-	g_context->inject_page_fault = kvm_inject_page_fault;
-	g_context->get_pdptr         = kvm_pdptr_read;
-	g_context->get_guest_pgd     = get_guest_cr3;
-
-	/*
-	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
-	 * L1's nested page tables (e.g. EPT12). The nested translation
-	 * of l2_gva to l1_gpa is done by arch.ngva_walk.gva_to_gpa using
-	 * L2's page tables as the first level of translation and L1's
-	 * nested page tables as the second level of translation. Basically
-	 * the gva_to_gpa functions between mmu and ngva_walk are swapped.
-	 */
-	if (!is_paging(vcpu))
-		g_context->gva_to_gpa = nonpaging_gva_to_gpa;
-	else if (is_long_mode(vcpu))
-		g_context->gva_to_gpa = paging64_gva_to_gpa;
-	else if (is_pae(vcpu))
-		g_context->gva_to_gpa = paging64_gva_to_gpa;
-	else
-		g_context->gva_to_gpa = paging32_gva_to_gpa;
-
-	reset_guest_paging_metadata(vcpu, g_context);
 }
 
 void kvm_init_mmu(struct kvm_vcpu *vcpu)
@@ -6170,12 +6133,14 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu)
 	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
 	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
 
-	if (mmu_is_nested(vcpu))
-		init_kvm_ngva_walk(vcpu, cpu_role);
-	else if (tdp_enabled)
-		init_kvm_tdp_mmu(vcpu, cpu_role);
-	else
-		init_kvm_softmmu(vcpu, cpu_role);
+	init_kvm_page_walk(vcpu, &vcpu->arch.gva_walk, cpu_role);
+
+	if (!mmu_is_nested(vcpu)) {
+		if (tdp_enabled)
+			init_kvm_tdp_mmu(vcpu, cpu_role);
+		else
+			init_kvm_softmmu(vcpu, cpu_role);
+	}
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_mmu);
 
@@ -6195,9 +6160,8 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	 */
 	vcpu->arch.root_mmu.root_role.invalid = 1;
 	vcpu->arch.guest_mmu.root_role.invalid = 1;
-	vcpu->arch.root_gva_walk.cpu_role.ext.valid = 0;
 	vcpu->arch.ngpa_walk.cpu_role.ext.valid = 0;
-	vcpu->arch.ngva_walk.cpu_role.ext.valid = 0;
+	vcpu->arch.gva_walk.cpu_role.ext.valid = 0;
 	kvm_mmu_reset_context(vcpu);
 
 	KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm);
@@ -6693,13 +6657,14 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
 	WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
 
 	/* It's actually a GPA for vcpu->arch.guest_mmu.  */
-	if (w == vcpu->arch.gva_walk) {
+	if (w == &vcpu->arch.gva_walk) {
 		/* INVLPG on a non-canonical address is a NOP according to the SDM.  */
 		if (is_noncanonical_invlpg_address(addr, vcpu))
 			return;
 
 		kvm_x86_call(flush_tlb_gva)(vcpu, addr);
-		if (w == &vcpu->arch.ngva_walk)
+
+		if (tdp_enabled)
 			return;
 
 		mmu = &vcpu->arch.root_mmu;
@@ -6733,7 +6698,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 	 * be synced when switching to that new cr3, so nothing needs to be
 	 * done here for them.
 	 */
-	kvm_mmu_invalidate_addr(vcpu, vcpu->arch.gva_walk, gva, KVM_MMU_ROOTS_ALL);
+	kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.gva_walk, gva, KVM_MMU_ROOTS_ALL);
 	++vcpu->stat.invlpg;
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invlpg);
@@ -6755,7 +6720,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 	}
 
 	if (roots)
-		kvm_mmu_invalidate_addr(vcpu, mmu->w, gva, roots);
+		kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.gva_walk, gva, roots);
 	++vcpu->stat.invlpg;
 
 	/*
@@ -6871,13 +6836,12 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 		vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
 
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
-	vcpu->arch.gva_walk = &vcpu->arch.root_gva_walk;
 
 	ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu, &vcpu->arch.ngpa_walk);
 	if (ret)
 		return ret;
 
-	ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu, &vcpu->arch.root_gva_walk);
+	ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu, &vcpu->arch.gva_walk);
 	if (ret)
 		goto fail_allocate_root;
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 115f0fd2d4ba..a46384b7080f 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -548,7 +548,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	}
 #endif
 	walker->fault.address = addr;
-	walker->fault.nested_page_fault = w != vcpu->arch.gva_walk;
+	walker->fault.nested_page_fault = w != &vcpu->arch.gva_walk;
 	walker->fault.async_page_fault = false;
 
 #if PTTYPE != PTTYPE_EPT
@@ -906,7 +906,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
 
 #ifndef CONFIG_X86_64
 	/* A 64-bit GVA should be impossible on 32-bit KVM. */
-	WARN_ON_ONCE((addr >> 32) && w == vcpu->arch.gva_walk);
+	WARN_ON_ONCE((addr >> 32) && w == &vcpu->arch.gva_walk);
 #endif
 
 	r = FNAME(walk_addr_generic)(&walker, vcpu, w, addr, access);
diff --git a/arch/x86/kvm/regs.c b/arch/x86/kvm/regs.c
index 02adaa4ef64e..bd8147798cc3 100644
--- a/arch/x86/kvm/regs.c
+++ b/arch/x86/kvm/regs.c
@@ -154,7 +154,7 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
  */
 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
-	struct kvm_pagewalk *w = vcpu->arch.gva_walk;
+	struct kvm_pagewalk *w = &vcpu->arch.gva_walk;
 	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 	gpa_t real_gpa;
 	int i;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 97d3fabb8c0d..ba985a02208a 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -117,13 +117,11 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 	vcpu->arch.ngpa_walk.get_guest_pgd     = nested_svm_get_tdp_cr3;
 	vcpu->arch.ngpa_walk.get_pdptr         = nested_svm_get_tdp_pdptr;
 	vcpu->arch.ngpa_walk.inject_page_fault = nested_svm_inject_npf_exit;
-	vcpu->arch.gva_walk              = &vcpu->arch.ngva_walk;
 }
 
 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
-	vcpu->arch.gva_walk = vcpu->arch.root_mmu.w;
 }
 
 static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 3596d15ae405..0635e92471c8 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -516,14 +516,11 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 	vcpu->arch.ngpa_walk.get_pdptr       = kvm_pdptr_read;
 
 	vcpu->arch.ngpa_walk.inject_page_fault = nested_ept_inject_page_fault;
-
-	vcpu->arch.gva_walk              = &vcpu->arch.ngva_walk;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
-	vcpu->arch.gva_walk = vcpu->arch.root_mmu.w;
 }
 
 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d6ab17f17d69..0626e835e9eb 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -587,7 +587,7 @@ void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 	WARN_ON_ONCE(fault->vector != PF_VECTOR);
 
 	fault_walk = fault->nested_page_fault ? &vcpu->arch.ngpa_walk :
-						vcpu->arch.gva_walk;
+						&vcpu->arch.gva_walk;
 
 	/*
 	 * Invalidate the TLB entry for the faulting address, if it exists,
@@ -4769,7 +4769,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
 			      struct x86_exception *exception)
 {
-	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
+	struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
 
 	u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	return gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, exception);
@@ -4779,7 +4779,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_read);
 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
 			       struct x86_exception *exception)
 {
-	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
+	struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
 
 	u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_WRITE_MASK;
@@ -4791,7 +4791,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write);
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
 				struct x86_exception *exception)
 {
-	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
+	struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
 
 	return gva_walk->gva_to_gpa(vcpu, gva_walk, gva, 0, exception);
 }
@@ -4800,7 +4800,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 				      struct kvm_vcpu *vcpu, u64 access,
 				      struct x86_exception *exception)
 {
-	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
+	struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
 
@@ -4833,7 +4833,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
 				struct x86_exception *exception)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
+	struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
 	u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	unsigned offset;
 	int ret;
@@ -4892,7 +4892,7 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes
 				      struct kvm_vcpu *vcpu, u64 access,
 				      struct x86_exception *exception)
 {
-	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
+	struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
 
@@ -4998,7 +4998,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 				gpa_t *gpa, struct x86_exception *exception,
 				bool write)
 {
-	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
+	struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
 	u64 access = ((kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
 		     | (write ? PFERR_WRITE_MASK : 0);
 
@@ -10601,7 +10601,7 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
 
 void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
 {
-	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
+	struct kvm_pagewalk *gva_walk = &vcpu->arch.gva_walk;
 	struct x86_exception fault;
 	u64 access = error_code &
 		(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
-- 
2.52.0



^ permalink raw reply related

* [PATCH 17/19] KVM: x86/mmu: merge struct rsvd_bits_validate into struct kvm_page_format
From: Paolo Bonzini @ 2026-06-24 21:31 UTC (permalink / raw)
  To: linux-kernel, kvm
In-Reply-To: <20260624213102.71082-1-pbonzini@redhat.com>

Remove one level of indirection, and prepare for using the permission bitmask
machinery for shadow pages as well.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  38 +++++------
 arch/x86/kvm/mmu/mmu.c          | 116 ++++++++++++++++----------------
 arch/x86/kvm/mmu/paging_tmpl.h  |   8 +--
 arch/x86/kvm/mmu/spte.c         |   4 +-
 arch/x86/kvm/mmu/spte.h         |  18 ++---
 arch/x86/kvm/vmx/vmx.c          |   2 +-
 6 files changed, 91 insertions(+), 95 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 08aa1e2da582..b517257a6315 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -487,9 +487,24 @@ struct kvm_pio_request {
 
 #define PT64_ROOT_MAX_LEVEL 5
 
-struct rsvd_bits_validate {
+struct kvm_page_format {
 	u64 rsvd_bits_mask[2][PT64_ROOT_MAX_LEVEL];
 	u64 bad_mt_xwr;
+
+	/*
+	* The pkru_mask indicates if protection key checks are needed.  It
+	* consists of 16 domains indexed by page fault error code bits [4:1],
+	* with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
+	* Each domain has 2 bits which are ANDed with AD and WD from PKRU.
+	*/
+	u32 pkru_mask;
+
+	/*
+	 * Bitmap; bit set = permission fault
+	 * Array index: page fault error code [4:1]
+	 * Bit index: pte permissions in ACC_* format
+	 */
+	u16 permissions[16];
 };
 
 struct kvm_mmu_root_info {
@@ -516,25 +531,6 @@ struct kvm_page_fault;
  * and 2-level 32-bit).  The kvm_pagewalk structure abstracts the details of the
  * current mmu mode.
  */
-struct kvm_page_format {
-	struct rsvd_bits_validate guest_rsvd_check;
-
-	/*
-	* The pkru_mask indicates if protection key checks are needed.  It
-	* consists of 16 domains indexed by page fault error code bits [4:1],
-	* with PFEC.RSVD replaced by ACC_USER_MASK from the page tables.
-	* Each domain has 2 bits which are ANDed with AD and WD from PKRU.
-	*/
-	u32 pkru_mask;
-
-	/*
-	 * Bitmap; bit set = permission fault
-	 * Array index: page fault error code [4:1]
-	 * Bit index: pte permissions in ACC_* format
-	 */
-	u16 permissions[16];
-};
-
 struct kvm_pagewalk {
 	unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
 	u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
@@ -570,7 +566,7 @@ struct kvm_mmu {
 	 * bits include not only hardware reserved bits but also
 	 * the bits spte never used.
 	 */
-	struct rsvd_bits_validate shadow_zero_check;
+	struct kvm_page_format fmt;
 };
 
 enum pmc_type {
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index f70892b15680..03e0bd3c8490 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -4446,7 +4446,7 @@ static int get_sptes_lockless(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes,
 static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 {
 	u64 sptes[PT64_ROOT_MAX_LEVEL + 1];
-	struct rsvd_bits_validate *rsvd_check;
+	struct kvm_page_format *rsvd_check;
 	int root, leaf, level;
 	bool reserved = false;
 
@@ -4467,7 +4467,7 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep)
 	if (!is_shadow_present_pte(sptes[leaf]))
 		leaf++;
 
-	rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
+	rsvd_check = &vcpu->arch.mmu->fmt;
 
 	for (level = root; level >= leaf; level--)
 		reserved |= is_rsvd_spte(rsvd_check, sptes[level], level);
@@ -5387,7 +5387,7 @@ static bool sync_mmio_spte(struct kvm_vcpu *vcpu, u64 *sptep, gfn_t gfn,
 #include "paging_tmpl.h"
 #undef PTTYPE
 
-static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
+static void __reset_rsvds_bits_mask(struct kvm_page_format *fmt,
 				    u64 pa_bits_rsvd, int level, bool nx,
 				    bool gbpages, bool pse, bool amd)
 {
@@ -5395,7 +5395,7 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
 	u64 nonleaf_bit8_rsvd = 0;
 	u64 high_bits_rsvd;
 
-	rsvd_check->bad_mt_xwr = 0;
+	fmt->bad_mt_xwr = 0;
 
 	if (!gbpages)
 		gbpages_bit_rsvd = rsvd_bits(7, 7);
@@ -5419,59 +5419,59 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
 	switch (level) {
 	case PT32_ROOT_LEVEL:
 		/* no rsvd bits for 2 level 4K page table entries */
-		rsvd_check->rsvd_bits_mask[0][1] = 0;
-		rsvd_check->rsvd_bits_mask[0][0] = 0;
-		rsvd_check->rsvd_bits_mask[1][0] =
-			rsvd_check->rsvd_bits_mask[0][0];
+		fmt->rsvd_bits_mask[0][1] = 0;
+		fmt->rsvd_bits_mask[0][0] = 0;
+		fmt->rsvd_bits_mask[1][0] =
+			fmt->rsvd_bits_mask[0][0];
 
 		if (!pse) {
-			rsvd_check->rsvd_bits_mask[1][1] = 0;
+			fmt->rsvd_bits_mask[1][1] = 0;
 			break;
 		}
 
 		if (is_cpuid_PSE36())
 			/* 36bits PSE 4MB page */
-			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
+			fmt->rsvd_bits_mask[1][1] = rsvd_bits(17, 21);
 		else
 			/* 32 bits PSE 4MB page */
-			rsvd_check->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
+			fmt->rsvd_bits_mask[1][1] = rsvd_bits(13, 21);
 		break;
 	case PT32E_ROOT_LEVEL:
-		rsvd_check->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
+		fmt->rsvd_bits_mask[0][2] = rsvd_bits(63, 63) |
 						   high_bits_rsvd |
 						   rsvd_bits(5, 8) |
 						   rsvd_bits(1, 2);	/* PDPTE */
-		rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;	/* PDE */
-		rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;	/* PTE */
-		rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+		fmt->rsvd_bits_mask[0][1] = high_bits_rsvd;	/* PDE */
+		fmt->rsvd_bits_mask[0][0] = high_bits_rsvd;	/* PTE */
+		fmt->rsvd_bits_mask[1][1] = high_bits_rsvd |
 						   rsvd_bits(13, 20);	/* large page */
-		rsvd_check->rsvd_bits_mask[1][0] =
-			rsvd_check->rsvd_bits_mask[0][0];
+		fmt->rsvd_bits_mask[1][0] =
+			fmt->rsvd_bits_mask[0][0];
 		break;
 	case PT64_ROOT_5LEVEL:
-		rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd |
+		fmt->rsvd_bits_mask[0][4] = high_bits_rsvd |
 						   nonleaf_bit8_rsvd |
 						   rsvd_bits(7, 7);
-		rsvd_check->rsvd_bits_mask[1][4] =
-			rsvd_check->rsvd_bits_mask[0][4];
+		fmt->rsvd_bits_mask[1][4] =
+			fmt->rsvd_bits_mask[0][4];
 		fallthrough;
 	case PT64_ROOT_4LEVEL:
-		rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd |
+		fmt->rsvd_bits_mask[0][3] = high_bits_rsvd |
 						   nonleaf_bit8_rsvd |
 						   rsvd_bits(7, 7);
-		rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd |
+		fmt->rsvd_bits_mask[0][2] = high_bits_rsvd |
 						   gbpages_bit_rsvd;
-		rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd;
-		rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
-		rsvd_check->rsvd_bits_mask[1][3] =
-			rsvd_check->rsvd_bits_mask[0][3];
-		rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd |
+		fmt->rsvd_bits_mask[0][1] = high_bits_rsvd;
+		fmt->rsvd_bits_mask[0][0] = high_bits_rsvd;
+		fmt->rsvd_bits_mask[1][3] =
+			fmt->rsvd_bits_mask[0][3];
+		fmt->rsvd_bits_mask[1][2] = high_bits_rsvd |
 						   gbpages_bit_rsvd |
 						   rsvd_bits(13, 29);
-		rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd |
+		fmt->rsvd_bits_mask[1][1] = high_bits_rsvd |
 						   rsvd_bits(13, 20); /* large page */
-		rsvd_check->rsvd_bits_mask[1][0] =
-			rsvd_check->rsvd_bits_mask[0][0];
+		fmt->rsvd_bits_mask[1][0] =
+			fmt->rsvd_bits_mask[0][0];
 		break;
 	}
 }
@@ -5479,7 +5479,7 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
 static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 					struct kvm_pagewalk *w)
 {
-	__reset_rsvds_bits_mask(&w->fmt.guest_rsvd_check,
+	__reset_rsvds_bits_mask(&w->fmt,
 				vcpu->arch.reserved_gpa_bits,
 				w->cpu_role.base.level, is_efer_nx(w),
 				guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
@@ -5487,7 +5487,7 @@ static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 				guest_cpuid_is_amd_compatible(vcpu));
 }
 
-static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
+static void __reset_rsvds_bits_mask_ept(struct kvm_page_format *fmt,
 					u64 pa_bits_rsvd, bool execonly,
 					int huge_page_level)
 {
@@ -5500,18 +5500,18 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
 	if (huge_page_level < PG_LEVEL_2M)
 		large_2m_rsvd = rsvd_bits(7, 7);
 
-	rsvd_check->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
-	rsvd_check->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
-	rsvd_check->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
-	rsvd_check->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
-	rsvd_check->rsvd_bits_mask[0][0] = high_bits_rsvd;
+	fmt->rsvd_bits_mask[0][4] = high_bits_rsvd | rsvd_bits(3, 7);
+	fmt->rsvd_bits_mask[0][3] = high_bits_rsvd | rsvd_bits(3, 7);
+	fmt->rsvd_bits_mask[0][2] = high_bits_rsvd | rsvd_bits(3, 6) | large_1g_rsvd;
+	fmt->rsvd_bits_mask[0][1] = high_bits_rsvd | rsvd_bits(3, 6) | large_2m_rsvd;
+	fmt->rsvd_bits_mask[0][0] = high_bits_rsvd;
 
 	/* large page */
-	rsvd_check->rsvd_bits_mask[1][4] = rsvd_check->rsvd_bits_mask[0][4];
-	rsvd_check->rsvd_bits_mask[1][3] = rsvd_check->rsvd_bits_mask[0][3];
-	rsvd_check->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
-	rsvd_check->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
-	rsvd_check->rsvd_bits_mask[1][0] = rsvd_check->rsvd_bits_mask[0][0];
+	fmt->rsvd_bits_mask[1][4] = fmt->rsvd_bits_mask[0][4];
+	fmt->rsvd_bits_mask[1][3] = fmt->rsvd_bits_mask[0][3];
+	fmt->rsvd_bits_mask[1][2] = high_bits_rsvd | rsvd_bits(12, 29) | large_1g_rsvd;
+	fmt->rsvd_bits_mask[1][1] = high_bits_rsvd | rsvd_bits(12, 20) | large_2m_rsvd;
+	fmt->rsvd_bits_mask[1][0] = fmt->rsvd_bits_mask[0][0];
 
 	bad_mt_xwr = 0xFFull << (2 * 8);	/* bits 3..5 must not be 2 */
 	bad_mt_xwr |= 0xFFull << (3 * 8);	/* bits 3..5 must not be 3 */
@@ -5522,13 +5522,13 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
 		/* bits 0..2 must not be 100 unless VMX capabilities allow it */
 		bad_mt_xwr |= REPEAT_BYTE(1ull << 4);
 	}
-	rsvd_check->bad_mt_xwr = bad_mt_xwr;
+	fmt->bad_mt_xwr = bad_mt_xwr;
 }
 
 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
 		bool execonly, int huge_page_level)
 {
-	__reset_rsvds_bits_mask_ept(&vcpu->arch.ngpa_walk.fmt.guest_rsvd_check,
+	__reset_rsvds_bits_mask_ept(&vcpu->arch.ngpa_walk.fmt,
 				    vcpu->arch.reserved_gpa_bits, execonly,
 				    huge_page_level);
 }
@@ -5550,13 +5550,13 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
 	bool is_amd = true;
 	/* KVM doesn't use 2-level page tables for the shadow MMU. */
 	bool is_pse = false;
-	struct rsvd_bits_validate *shadow_zero_check;
+	struct kvm_page_format *fmt;
 	int i;
 
 	WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
 
-	shadow_zero_check = &context->shadow_zero_check;
-	__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+	fmt = &context->fmt;
+	__reset_rsvds_bits_mask(fmt, reserved_hpa_bits(),
 				context->root_role.level,
 				context->root_role.efer_nx,
 				guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
@@ -5572,10 +5572,10 @@ static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
 		 * Bits in shadow_me_mask but not in shadow_me_value are
 		 * not allowed to be set.
 		 */
-		shadow_zero_check->rsvd_bits_mask[0][i] |= shadow_me_mask;
-		shadow_zero_check->rsvd_bits_mask[1][i] |= shadow_me_mask;
-		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_value;
-		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_value;
+		fmt->rsvd_bits_mask[0][i] |= shadow_me_mask;
+		fmt->rsvd_bits_mask[1][i] |= shadow_me_mask;
+		fmt->rsvd_bits_mask[0][i] &= ~shadow_me_value;
+		fmt->rsvd_bits_mask[1][i] &= ~shadow_me_value;
 	}
 
 }
@@ -5592,18 +5592,18 @@ static inline bool boot_cpu_is_amd(void)
  */
 static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
 {
-	struct rsvd_bits_validate *shadow_zero_check;
+	struct kvm_page_format *fmt;
 	int i;
 
-	shadow_zero_check = &context->shadow_zero_check;
+	fmt = &context->fmt;
 
 	if (boot_cpu_is_amd())
-		__reset_rsvds_bits_mask(shadow_zero_check, reserved_hpa_bits(),
+		__reset_rsvds_bits_mask(fmt, reserved_hpa_bits(),
 					context->root_role.level, true,
 					boot_cpu_has(X86_FEATURE_GBPAGES),
 					false, true);
 	else
-		__reset_rsvds_bits_mask_ept(shadow_zero_check,
+		__reset_rsvds_bits_mask_ept(fmt,
 					    reserved_hpa_bits(), false,
 					    max_huge_page_level);
 
@@ -5611,8 +5611,8 @@ static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
 		return;
 
 	for (i = context->root_role.level; --i >= 0;) {
-		shadow_zero_check->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
-		shadow_zero_check->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
+		fmt->rsvd_bits_mask[0][i] &= ~shadow_me_mask;
+		fmt->rsvd_bits_mask[1][i] &= ~shadow_me_mask;
 	}
 }
 
@@ -5623,7 +5623,7 @@ static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
 static void
 reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
 {
-	__reset_rsvds_bits_mask_ept(&context->shadow_zero_check,
+	__reset_rsvds_bits_mask_ept(&context->fmt,
 				    reserved_hpa_bits(), execonly,
 				    max_huge_page_level);
 }
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 58397f58320f..e73fc09ec4db 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -138,19 +138,19 @@ static inline int FNAME(is_present_gpte)(struct kvm_pagewalk *w,
 #endif
 }
 
-static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte)
+static bool FNAME(is_bad_mt_xwr)(struct kvm_page_format *fmt, u64 gpte)
 {
 #if PTTYPE != PTTYPE_EPT
 	return false;
 #else
-	return __is_bad_mt_xwr(rsvd_check, gpte);
+	return __is_bad_mt_xwr(fmt, gpte);
 #endif
 }
 
 static bool FNAME(is_rsvd_bits_set)(struct kvm_page_format *fmt, u64 gpte, int level)
 {
-	return __is_rsvd_bits_set(&fmt->guest_rsvd_check, gpte, level) ||
-	       FNAME(is_bad_mt_xwr)(&fmt->guest_rsvd_check, gpte);
+	return __is_rsvd_bits_set(fmt, gpte, level) ||
+	       FNAME(is_bad_mt_xwr)(fmt, gpte);
 }
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/mmu/spte.c b/arch/x86/kvm/mmu/spte.c
index d2f5f7dd8fe1..bdf72a98c19c 100644
--- a/arch/x86/kvm/mmu/spte.c
+++ b/arch/x86/kvm/mmu/spte.c
@@ -280,9 +280,9 @@ bool make_spte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 	if (prefetch && !synchronizing)
 		spte = mark_spte_for_access_track(spte);
 
-	WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->shadow_zero_check, spte, level),
+	WARN_ONCE(is_rsvd_spte(&vcpu->arch.mmu->fmt, spte, level),
 		  "spte = 0x%llx, level = %d, rsvd bits = 0x%llx", spte, level,
-		  get_rsvd_bits(&vcpu->arch.mmu->shadow_zero_check, spte, level));
+		  get_rsvd_bits(&vcpu->arch.mmu->fmt, spte, level));
 
 	/*
 	 * Mark the memslot dirty *after* modifying it for access tracking.
diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h
index 13eea94dd212..918533e61b98 100644
--- a/arch/x86/kvm/mmu/spte.h
+++ b/arch/x86/kvm/mmu/spte.h
@@ -378,33 +378,33 @@ static inline bool is_accessed_spte(u64 spte)
 	return spte & shadow_accessed_mask;
 }
 
-static inline u64 get_rsvd_bits(struct rsvd_bits_validate *rsvd_check, u64 pte,
+static inline u64 get_rsvd_bits(struct kvm_page_format *fmt, u64 pte,
 				int level)
 {
 	int bit7 = (pte >> 7) & 1;
 
-	return rsvd_check->rsvd_bits_mask[bit7][level-1];
+	return fmt->rsvd_bits_mask[bit7][level-1];
 }
 
-static inline bool __is_rsvd_bits_set(struct rsvd_bits_validate *rsvd_check,
+static inline bool __is_rsvd_bits_set(struct kvm_page_format *fmt,
 				      u64 pte, int level)
 {
-	return pte & get_rsvd_bits(rsvd_check, pte, level);
+	return pte & get_rsvd_bits(fmt, pte, level);
 }
 
-static inline bool __is_bad_mt_xwr(struct rsvd_bits_validate *rsvd_check,
+static inline bool __is_bad_mt_xwr(struct kvm_page_format *fmt,
 				   u64 pte)
 {
 	if (pte & VMX_EPT_USER_EXECUTABLE_MASK)
 		pte |= VMX_EPT_EXECUTABLE_MASK;
-	return rsvd_check->bad_mt_xwr & BIT_ULL(pte & 0x3f);
+	return fmt->bad_mt_xwr & BIT_ULL(pte & 0x3f);
 }
 
-static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check,
+static __always_inline bool is_rsvd_spte(struct kvm_page_format *fmt,
 					 u64 spte, int level)
 {
-	return __is_bad_mt_xwr(rsvd_check, spte) ||
-	       __is_rsvd_bits_set(rsvd_check, spte, level);
+	return __is_bad_mt_xwr(fmt, spte) ||
+	       __is_rsvd_bits_set(fmt, spte, level);
 }
 
 /*
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index aded7039bd3e..3681d565f177 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -8719,7 +8719,7 @@ __init int vmx_hardware_setup(void)
 
 	/*
 	 * Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
-	 * bits to shadow_zero_check.
+	 * bits into the MMU's struct kvm_page_format.
 	 */
 	vmx_setup_me_spte_mask();
 
-- 
2.52.0



^ permalink raw reply related

* [PATCH 18/19] KVM: x86/mmu: parameterize update_permission_bitmask()
From: Paolo Bonzini @ 2026-06-24 21:31 UTC (permalink / raw)
  To: linux-kernel, kvm
In-Reply-To: <20260624213102.71082-1-pbonzini@redhat.com>

Make it possible to apply the computation loop to both guest
and shadow PTE formats; the latter do not have an extended role, so
pass the four parameters to the function one by one.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 03e0bd3c8490..11d86e34acf4 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5658,18 +5658,15 @@ reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
 	 (14 & (access) ? 1 << 14 : 0) | \
 	 (15 & (access) ? 1 << 15 : 0))
 
-static void update_permission_bitmask(struct kvm_pagewalk *pw, bool tdp, bool ept)
+static void __update_permission_bitmask(struct kvm_page_format *fmt, bool tdp,
+					bool ept, bool cr4_smep, bool cr4_smap,
+					bool cr0_wp, bool efer_nx)
 {
 	unsigned index;
 
 	const u16 w = ACC_BITS_MASK(ACC_WRITE_MASK);
 	const u16 r = ACC_BITS_MASK(ACC_READ_MASK);
 
-	bool cr4_smep = is_cr4_smep(pw);
-	bool cr4_smap = is_cr4_smap(pw);
-	bool cr0_wp = is_cr0_wp(pw);
-	bool efer_nx = is_efer_nx(pw);
-
 	/*
 	 * In hardware, page fault error codes are generated (as the name
 	 * suggests) on any kind of page fault.  permission_fault() and
@@ -5682,7 +5679,7 @@ static void update_permission_bitmask(struct kvm_pagewalk *pw, bool tdp, bool ep
 	 * permission_fault() to indicate accesses that are *not* subject to
 	 * SMAP restrictions.
 	 */
-	for (index = 0; index < ARRAY_SIZE(pw->fmt.permissions); ++index) {
+	for (index = 0; index < ARRAY_SIZE(fmt->permissions); ++index) {
 		unsigned pfec = index << 1;
 
 		/*
@@ -5756,10 +5753,17 @@ static void update_permission_bitmask(struct kvm_pagewalk *pw, bool tdp, bool ep
 				smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
 		}
 
-		pw->fmt.permissions[index] = ff | uf | wf | rf | smapf;
+		fmt->permissions[index] = ff | uf | wf | rf | smapf;
 	}
 }
 
+static void update_permission_bitmask(struct kvm_pagewalk *w, bool tdp, bool ept)
+{
+	__update_permission_bitmask(&w->fmt, tdp, ept,
+				    is_cr4_smep(w), is_cr4_smap(w),
+				    is_cr0_wp(w), is_efer_nx(w));
+}
+
 /*
 * PKU is an additional mechanism by which the paging controls access to
 * user-mode addresses based on the value in the PKRU register.  Protection
-- 
2.52.0



^ permalink raw reply related

* [PATCH 13/19] KVM: x86/mmu: pull struct kvm_pagewalk out of struct kvm_mmu
From: Paolo Bonzini @ 2026-06-24 21:30 UTC (permalink / raw)
  To: linux-kernel, kvm
In-Reply-To: <20260624213102.71082-1-pbonzini@redhat.com>

Replace kvm_mmu's w field with a pointer to an external instance of
struct kvm_pagewalk.  This is the first step towards using a single
kvm_pagewalk struct for all GVA walks, whether nested or not.

With this patch, non-MMU code basically does not use kvm_mmu anymore:
it does care about page walks, but it funnels (almost) all interactions
with the TLB to mmu.c.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |   8 ++-
 arch/x86/kvm/mmu.h              |   2 +-
 arch/x86/kvm/mmu/mmu.c          | 113 ++++++++++++++++++--------------
 arch/x86/kvm/mmu/paging_tmpl.h  |  14 ++--
 arch/x86/kvm/svm/nested.c       |  11 ++--
 arch/x86/kvm/vmx/nested.c       |  13 ++--
 arch/x86/kvm/x86.c              |   2 +-
 7 files changed, 88 insertions(+), 75 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 25288d73cce7..8b9cf364c9f6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -545,11 +545,11 @@ struct kvm_pagewalk {
 };
 
 struct kvm_mmu {
-	struct kvm_pagewalk w;
-
 	int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 	int (*sync_spte)(struct kvm_vcpu *vcpu,
 			 struct kvm_mmu_page *sp, int i);
+	struct kvm_pagewalk *w;
+
 	struct kvm_mmu_root_info root;
 	hpa_t mirror_root_hpa;
 	union kvm_mmu_page_role root_role;
@@ -905,9 +905,11 @@ struct kvm_vcpu_arch {
 
 	/* Non-nested MMU for L1 */
 	struct kvm_mmu root_mmu;
+	struct kvm_pagewalk root_gva_walk;
 
-	/* L1 MMU when running nested */
+	/* L1 TDP when running nested */
 	struct kvm_mmu guest_mmu;
+	struct kvm_pagewalk ngpa_walk;
 
 	/*
 	 * Paging state of an L2 guest (used for nested npt)
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 2610b996e144..1631fd43c9a1 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -257,7 +257,7 @@ static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
 	 * need to refresh ngva_walk, a.k.a. the walker used to translate L2
 	 * GVAs to GPAs, so as to honor L2's CR0.WP.
 	 */
-	if (!tdp_enabled || w == &vcpu->arch.guest_mmu.w)
+	if (!tdp_enabled || w == &vcpu->arch.ngpa_walk)
 		return;
 
 	__kvm_mmu_refresh_passthrough_bits(vcpu, w);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8b5ffd78565b..3ffaa48b566e 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -2478,12 +2478,14 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
 					struct kvm_vcpu *vcpu, hpa_t root,
 					u64 addr)
 {
+	struct kvm_pagewalk *w = vcpu->arch.mmu->w;
+
 	iterator->addr = addr;
 	iterator->shadow_addr = root;
 	iterator->level = vcpu->arch.mmu->root_role.level;
 
 	if (iterator->level >= PT64_ROOT_4LEVEL &&
-	    vcpu->arch.mmu->w.cpu_role.base.level < PT64_ROOT_4LEVEL &&
+	    w->cpu_role.base.level < PT64_ROOT_4LEVEL &&
 	    !vcpu->arch.mmu->root_role.direct)
 		iterator->level = PT32E_ROOT_LEVEL;
 
@@ -4090,12 +4092,13 @@ static int mmu_first_shadow_root_alloc(struct kvm *kvm)
 static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
+	struct kvm_pagewalk *w = mmu->w;
 	u64 pdptrs[4], pm_mask;
 	gfn_t root_gfn, root_pgd;
 	int quadrant, i, r;
 	hpa_t root;
 
-	root_pgd = kvm_mmu_get_guest_pgd(vcpu, &mmu->w);
+	root_pgd = kvm_mmu_get_guest_pgd(vcpu, w);
 	root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
 
 	if (!kvm_vcpu_is_visible_gfn(vcpu, root_gfn)) {
@@ -4107,9 +4110,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	 * On SVM, reading PDPTRs might access guest memory, which might fault
 	 * and thus might sleep.  Grab the PDPTRs before acquiring mmu_lock.
 	 */
-	if (mmu->w.cpu_role.base.level == PT32E_ROOT_LEVEL) {
+	if (w->cpu_role.base.level == PT32E_ROOT_LEVEL) {
 		for (i = 0; i < 4; ++i) {
-			pdptrs[i] = mmu->w.get_pdptr(vcpu, i);
+			pdptrs[i] = w->get_pdptr(vcpu, i);
 			if (!(pdptrs[i] & PT_PRESENT_MASK))
 				continue;
 
@@ -4131,7 +4134,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	 * Do we shadow a long mode page table? If so we need to
 	 * write-protect the guests page table root.
 	 */
-	if (mmu->w.cpu_role.base.level >= PT64_ROOT_4LEVEL) {
+	if (w->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
 		root = mmu_alloc_root(vcpu, root_gfn, 0,
 				      mmu->root_role.level);
 		mmu->root.hpa = root;
@@ -4170,7 +4173,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 	for (i = 0; i < 4; ++i) {
 		WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
 
-		if (mmu->w.cpu_role.base.level == PT32E_ROOT_LEVEL) {
+		if (w->cpu_role.base.level == PT32E_ROOT_LEVEL) {
 			if (!(pdptrs[i] & PT_PRESENT_MASK)) {
 				mmu->pae_root[i] = INVALID_PAE_ROOT;
 				continue;
@@ -4184,7 +4187,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 		 * directory. Othwerise each PAE page direct shadows one guest
 		 * PAE page directory so that quadrant should be 0.
 		 */
-		quadrant = (mmu->w.cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
+		quadrant = (w->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : 0;
 
 		root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
 		mmu->pae_root[i] = root | pm_mask;
@@ -4208,6 +4211,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
 static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu *mmu = vcpu->arch.mmu;
+	struct kvm_pagewalk *w = mmu->w;
 	bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
 	u64 *pml5_root = NULL;
 	u64 *pml4_root = NULL;
@@ -4220,7 +4224,7 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
 	 * on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
 	 */
 	if (mmu->root_role.direct ||
-	    mmu->w.cpu_role.base.level >= PT64_ROOT_4LEVEL ||
+	    w->cpu_role.base.level >= PT64_ROOT_4LEVEL ||
 	    mmu->root_role.level < PT64_ROOT_4LEVEL)
 		return 0;
 
@@ -4325,7 +4329,7 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
 
 	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
 
-	if (vcpu->arch.mmu->w.cpu_role.base.level >= PT64_ROOT_4LEVEL) {
+	if (vcpu->arch.mmu->w->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
 		hpa_t root = vcpu->arch.mmu->root.hpa;
 
 		if (!is_unsync_root(root))
@@ -4567,7 +4571,7 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu,
 	if (arch.direct_map)
 		arch.cr3 = (unsigned long)INVALID_GPA;
 	else
-		arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, &vcpu->arch.mmu->w);
+		arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu->w);
 
 	return kvm_setup_async_pf(vcpu, fault->addr,
 				  kvm_vcpu_gfn_to_hva(vcpu, fault->gfn), &arch);
@@ -5110,7 +5114,7 @@ void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
 		return;
 
 	if (!vcpu->arch.mmu->root_role.direct &&
-	      work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, &vcpu->arch.mmu->w))
+	      work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, vcpu->arch.mmu->w))
 		return;
 
 	r = kvm_mmu_do_page_fault(vcpu, work->cr2_or_gpa, work->arch.error_code,
@@ -5208,7 +5212,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_tdp_mmu_map_private_pfn);
 static void nonpaging_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = nonpaging_page_fault;
-	context->w.gva_to_gpa = nonpaging_gva_to_gpa;
+	context->w->gva_to_gpa = nonpaging_gva_to_gpa;
 	context->sync_spte = NULL;
 }
 
@@ -5523,9 +5527,9 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
 }
 
 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
-		struct kvm_mmu *context, bool execonly, int huge_page_level)
+		bool execonly, int huge_page_level)
 {
-	__reset_rsvds_bits_mask_ept(&context->w.guest_rsvd_check,
+	__reset_rsvds_bits_mask_ept(&vcpu->arch.ngpa_walk.guest_rsvd_check,
 				    vcpu->arch.reserved_gpa_bits, execonly,
 				    huge_page_level);
 }
@@ -5832,21 +5836,21 @@ static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
 		return;
 
 	reset_guest_rsvds_bits_mask(vcpu, w);
-	update_permission_bitmask(w, w == &vcpu->arch.guest_mmu.w, false);
+	update_permission_bitmask(w, w == &vcpu->arch.ngpa_walk, false);
 	update_pkru_bitmask(w);
 }
 
 static void paging64_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = paging64_page_fault;
-	context->w.gva_to_gpa = paging64_gva_to_gpa;
+	context->w->gva_to_gpa = paging64_gva_to_gpa;
 	context->sync_spte = paging64_sync_spte;
 }
 
 static void paging32_init_context(struct kvm_mmu *context)
 {
 	context->page_fault = paging32_page_fault;
-	context->w.gva_to_gpa = paging32_gva_to_gpa;
+	context->w->gva_to_gpa = paging32_gva_to_gpa;
 	context->sync_spte = paging32_sync_spte;
 }
 
@@ -5961,27 +5965,27 @@ static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
 	union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
 
-	if (cpu_role.as_u64 == context->w.cpu_role.as_u64 &&
+	if (cpu_role.as_u64 == context->w->cpu_role.as_u64 &&
 	    root_role.word == context->root_role.word)
 		return;
 
-	context->w.cpu_role.as_u64 = cpu_role.as_u64;
+	context->w->cpu_role.as_u64 = cpu_role.as_u64;
 	context->root_role.word = root_role.word;
 	context->page_fault = kvm_tdp_page_fault;
 	context->sync_spte = NULL;
 
-	context->w.inject_page_fault = kvm_inject_page_fault;
-	context->w.get_pdptr = kvm_pdptr_read;
-	context->w.get_guest_pgd = get_guest_cr3;
+	context->w->inject_page_fault = kvm_inject_page_fault;
+	context->w->get_pdptr = kvm_pdptr_read;
+	context->w->get_guest_pgd = get_guest_cr3;
 
-	if (!is_cr0_pg(&context->w))
-		context->w.gva_to_gpa = nonpaging_gva_to_gpa;
-	else if (is_cr4_pae(&context->w))
-		context->w.gva_to_gpa = paging64_gva_to_gpa;
+	if (!is_cr0_pg(context->w))
+		context->w->gva_to_gpa = nonpaging_gva_to_gpa;
+	else if (is_cr4_pae(context->w))
+		context->w->gva_to_gpa = paging64_gva_to_gpa;
 	else
-		context->w.gva_to_gpa = paging32_gva_to_gpa;
+		context->w->gva_to_gpa = paging32_gva_to_gpa;
 
-	reset_guest_paging_metadata(vcpu, &context->w);
+	reset_guest_paging_metadata(vcpu, context->w);
 	reset_tdp_shadow_zero_bits_mask(context);
 }
 
@@ -5989,21 +5993,21 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 				    union kvm_cpu_role cpu_role,
 				    union kvm_mmu_page_role root_role)
 {
-	if (cpu_role.as_u64 == context->w.cpu_role.as_u64 &&
+	if (cpu_role.as_u64 == context->w->cpu_role.as_u64 &&
 	    root_role.word == context->root_role.word)
 		return;
 
-	context->w.cpu_role.as_u64 = cpu_role.as_u64;
+	context->w->cpu_role.as_u64 = cpu_role.as_u64;
 	context->root_role.word = root_role.word;
 
-	if (!is_cr0_pg(&context->w))
+	if (!is_cr0_pg(context->w))
 		nonpaging_init_context(context);
-	else if (is_cr4_pae(&context->w))
+	else if (is_cr4_pae(context->w))
 		paging64_init_context(context);
 	else
 		paging32_init_context(context);
 
-	reset_guest_paging_metadata(vcpu, &context->w);
+	reset_guest_paging_metadata(vcpu, context->w);
 	reset_shadow_zero_bits_mask(vcpu, context);
 }
 
@@ -6095,18 +6099,20 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
 						   execonly, level, mbec);
 
-	if (new_mode.as_u64 != context->w.cpu_role.as_u64) {
+	struct kvm_pagewalk *ngpa_walk = &vcpu->arch.ngpa_walk;
+
+	if (new_mode.as_u64 != ngpa_walk->cpu_role.as_u64) {
 		/* EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. */
-		context->w.cpu_role.as_u64 = new_mode.as_u64;
+		ngpa_walk->cpu_role.as_u64 = new_mode.as_u64;
 		context->root_role.word = new_mode.base.word;
 
 		context->page_fault = ept_page_fault;
-		context->w.gva_to_gpa = ept_gva_to_gpa;
+		ngpa_walk->gva_to_gpa = ept_gva_to_gpa;
 		context->sync_spte = ept_sync_spte;
 
-		update_permission_bitmask(&context->w, true, true);
-		context->w.pkru_mask = 0;
-		reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
+		update_permission_bitmask(ngpa_walk, true, true);
+		ngpa_walk->pkru_mask = 0;
+		reset_rsvds_bits_mask_ept(vcpu, execonly, huge_page_level);
 		reset_ept_shadow_zero_bits_mask(context, execonly);
 	}
 
@@ -6121,9 +6127,9 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
 
 	kvm_init_shadow_mmu(vcpu, cpu_role);
 
-	context->w.inject_page_fault = kvm_inject_page_fault;
-	context->w.get_pdptr         = kvm_pdptr_read;
-	context->w.get_guest_pgd     = get_guest_cr3;
+	context->w->inject_page_fault = kvm_inject_page_fault;
+	context->w->get_pdptr         = kvm_pdptr_read;
+	context->w->get_guest_pgd     = get_guest_cr3;
 }
 
 static void init_kvm_ngva_walk(struct kvm_vcpu *vcpu,
@@ -6189,8 +6195,8 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	 */
 	vcpu->arch.root_mmu.root_role.invalid = 1;
 	vcpu->arch.guest_mmu.root_role.invalid = 1;
-	vcpu->arch.root_mmu.w.cpu_role.ext.valid = 0;
-	vcpu->arch.guest_mmu.w.cpu_role.ext.valid = 0;
+	vcpu->arch.root_gva_walk.cpu_role.ext.valid = 0;
+	vcpu->arch.ngpa_walk.cpu_role.ext.valid = 0;
 	vcpu->arch.ngva_walk.cpu_role.ext.valid = 0;
 	kvm_mmu_reset_context(vcpu);
 
@@ -6687,7 +6693,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
 	WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
 
 	/* It's actually a GPA for vcpu->arch.guest_mmu.  */
-	if (w != &vcpu->arch.guest_mmu.w) {
+	if (w == vcpu->arch.gva_walk) {
 		/* INVLPG on a non-canonical address is a NOP according to the SDM.  */
 		if (is_noncanonical_invlpg_address(addr, vcpu))
 			return;
@@ -6695,9 +6701,13 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
 		kvm_x86_call(flush_tlb_gva)(vcpu, addr);
 		if (w == &vcpu->arch.ngva_walk)
 			return;
+
+		mmu = &vcpu->arch.root_mmu;
+	} else {
+		mmu = &vcpu->arch.guest_mmu;
 	}
 
-	mmu = container_of(w, struct kvm_mmu, w);
+	/* Invalidate shadow pages, whether GVA->GPA or nGPA->GPA.  */
 	if (!mmu->sync_spte)
 		return;
 
@@ -6745,7 +6755,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 	}
 
 	if (roots)
-		kvm_mmu_invalidate_addr(vcpu, &mmu->w, gva, roots);
+		kvm_mmu_invalidate_addr(vcpu, mmu->w, gva, roots);
 	++vcpu->stat.invlpg;
 
 	/*
@@ -6790,11 +6800,12 @@ static void free_mmu_pages(struct kvm_mmu *mmu)
 	free_page((unsigned long)mmu->pml5_root);
 }
 
-static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu)
+static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, struct kvm_pagewalk *w)
 {
 	struct page *page;
 	int i;
 
+	mmu->w = w;
 	mmu->root.hpa = INVALID_PAGE;
 	mmu->root.pgd = 0;
 	mmu->mirror_root_hpa = INVALID_PAGE;
@@ -6860,13 +6871,13 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 		vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
 
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
-	vcpu->arch.gva_walk = &vcpu->arch.root_mmu.w;
+	vcpu->arch.gva_walk = &vcpu->arch.root_gva_walk;
 
-	ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
+	ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu, &vcpu->arch.ngpa_walk);
 	if (ret)
 		return ret;
 
-	ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu);
+	ret = __kvm_mmu_create(vcpu, &vcpu->arch.root_mmu, &vcpu->arch.root_gva_walk);
 	if (ret)
 		goto fail_allocate_root;
 
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index 9cfae71cd3e6..115f0fd2d4ba 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -157,7 +157,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 				  struct kvm_mmu_page *sp, u64 *spte,
 				  u64 gpte)
 {
-	struct kvm_pagewalk *w = &vcpu->arch.mmu->w;
+	struct kvm_pagewalk *w = vcpu->arch.mmu->w;
 
 	if (!FNAME(is_present_gpte)(w, gpte))
 		goto no_present;
@@ -563,7 +563,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 static int FNAME(walk_addr)(struct guest_walker *walker,
 			    struct kvm_vcpu *vcpu, gpa_t addr, u64 access)
 {
-	return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu->w, addr,
+	return FNAME(walk_addr_generic)(walker, vcpu, vcpu->arch.mmu->w, addr,
 					access);
 }
 
@@ -579,7 +579,7 @@ FNAME(prefetch_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
 
 	gfn = gpte_to_gfn(gpte);
 	pte_access = sp->role.access & FNAME(gpte_access)(gpte);
-	FNAME(protect_clean_gpte)(&vcpu->arch.mmu->w, &pte_access, gpte);
+	FNAME(protect_clean_gpte)(vcpu->arch.mmu->w, &pte_access, gpte);
 
 	return kvm_mmu_prefetch_sptes(vcpu, gfn, spte, 1, pte_access);
 }
@@ -662,7 +662,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault,
 	WARN_ON_ONCE(gw->gfn != base_gfn);
 	direct_access = gw->pte_access;
 
-	top_level = vcpu->arch.mmu->w.cpu_role.base.level;
+	top_level = vcpu->arch.mmu->w->cpu_role.base.level;
 	if (top_level == PT32E_ROOT_LEVEL)
 		top_level = PT32_ROOT_LEVEL;
 	/*
@@ -851,7 +851,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 	 * otherwise KVM will cache incorrect access information in the SPTE.
 	 */
 	if (fault->write && !(walker.pte_access & ACC_WRITE_MASK) &&
-	    !is_cr0_wp(&vcpu->arch.mmu->w) && !fault->user && fault->slot) {
+	    !is_cr0_wp(vcpu->arch.mmu->w) && !fault->user && fault->slot) {
 		walker.pte_access |= ACC_WRITE_MASK;
 		walker.pte_access &= ~ACC_USER_MASK;
 
@@ -861,7 +861,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault
 		 * then we should prevent the kernel from executing it
 		 * if SMEP is enabled.
 		 */
-		if (is_cr4_smep(&vcpu->arch.mmu->w))
+		if (is_cr4_smep(vcpu->arch.mmu->w))
 			walker.pte_access &= ~ACC_EXEC_MASK;
 	}
 #endif
@@ -959,7 +959,7 @@ static int FNAME(sync_spte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, int
 	gfn = gpte_to_gfn(gpte);
 	pte_access = sp->role.access;
 	pte_access &= FNAME(gpte_access)(gpte);
-	FNAME(protect_clean_gpte)(&vcpu->arch.mmu->w, &pte_access, gpte);
+	FNAME(protect_clean_gpte)(vcpu->arch.mmu->w, &pte_access, gpte);
 
 	if (sync_mmio_spte(vcpu, &sp->spt[i], gfn, pte_access))
 		return 0;
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 713b2508b8ca..97d3fabb8c0d 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -114,17 +114,16 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 				svm->nested.ctl.nested_cr3,
 				svm->nested.ctl.misc_ctl);
 
-	vcpu->arch.mmu->w.get_guest_pgd     = nested_svm_get_tdp_cr3;
-	vcpu->arch.mmu->w.get_pdptr       = nested_svm_get_tdp_pdptr;
-
-	vcpu->arch.mmu->w.inject_page_fault = nested_svm_inject_npf_exit;
+	vcpu->arch.ngpa_walk.get_guest_pgd     = nested_svm_get_tdp_cr3;
+	vcpu->arch.ngpa_walk.get_pdptr         = nested_svm_get_tdp_pdptr;
+	vcpu->arch.ngpa_walk.inject_page_fault = nested_svm_inject_npf_exit;
 	vcpu->arch.gva_walk              = &vcpu->arch.ngva_walk;
 }
 
 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
-	vcpu->arch.gva_walk = &vcpu->arch.root_mmu.w;
+	vcpu->arch.gva_walk = vcpu->arch.root_mmu.w;
 }
 
 static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
@@ -2153,7 +2152,7 @@ static gpa_t svm_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
 				      u64 pte_access)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
-	struct kvm_pagewalk *w = &vcpu->arch.mmu->w;
+	struct kvm_pagewalk *w = &vcpu->arch.ngpa_walk;
 
 	if (WARN_ON_ONCE(!mmu_is_nested(vcpu)))
 		return gpa;
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 345ee3323a93..3596d15ae405 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -408,7 +408,7 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
 			roots |= KVM_MMU_ROOT_PREVIOUS(i);
 	}
 	if (roots)
-		kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.guest_mmu.w, addr, roots);
+		kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.ngpa_walk, addr, roots);
 }
 
 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
@@ -512,10 +512,10 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.mmu = &vcpu->arch.guest_mmu;
 	nested_ept_new_eptp(vcpu);
-	vcpu->arch.mmu->w.get_guest_pgd     = nested_ept_get_eptp;
-	vcpu->arch.mmu->w.get_pdptr       = kvm_pdptr_read;
+	vcpu->arch.ngpa_walk.get_guest_pgd     = nested_ept_get_eptp;
+	vcpu->arch.ngpa_walk.get_pdptr       = kvm_pdptr_read;
 
-	vcpu->arch.mmu->w.inject_page_fault = nested_ept_inject_page_fault;
+	vcpu->arch.ngpa_walk.inject_page_fault = nested_ept_inject_page_fault;
 
 	vcpu->arch.gva_walk              = &vcpu->arch.ngva_walk;
 }
@@ -523,7 +523,7 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
-	vcpu->arch.gva_walk = &vcpu->arch.root_mmu.w;
+	vcpu->arch.gva_walk = vcpu->arch.root_mmu.w;
 }
 
 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
@@ -7465,12 +7465,13 @@ __init int nested_vmx_hardware_setup(int (*exit_handlers[])(struct kvm_vcpu *))
 	return 0;
 }
 
+
 static gpa_t vmx_translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa,
 				      u64 access,
 				      struct x86_exception *exception,
 				      u64 pte_access)
 {
-	struct kvm_pagewalk *w = &vcpu->arch.mmu->w;
+	struct kvm_pagewalk *w = &vcpu->arch.ngpa_walk;
 
 	if (WARN_ON_ONCE(!mmu_is_nested(vcpu)))
 		return gpa;
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 43625cc9e934..d6ab17f17d69 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -586,7 +586,7 @@ void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 
 	WARN_ON_ONCE(fault->vector != PF_VECTOR);
 
-	fault_walk = fault->nested_page_fault ? &vcpu->arch.mmu->w :
+	fault_walk = fault->nested_page_fault ? &vcpu->arch.ngpa_walk :
 						vcpu->arch.gva_walk;
 
 	/*
-- 
2.52.0



^ permalink raw reply related

* [PATCH 16/19] KVM: x86/mmu: pull page format to a new struct
From: Paolo Bonzini @ 2026-06-24 21:30 UTC (permalink / raw)
  To: linux-kernel, kvm
In-Reply-To: <20260624213102.71082-1-pbonzini@redhat.com>

KVM performs reserved-bit checks on both guest and host page tables,
though the latter is only a consistency check.  Create a new struct
to hold the data used by this common code, as well as all the data
that is extracted from the CPU role.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h | 25 +++++++++++++++----------
 arch/x86/kvm/mmu.h              |  7 ++++---
 arch/x86/kvm/mmu/mmu.c          | 16 ++++++++--------
 arch/x86/kvm/mmu/paging_tmpl.h  | 10 +++++-----
 4 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 8a2126ca49c4..08aa1e2da582 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -516,16 +516,7 @@ struct kvm_page_fault;
  * and 2-level 32-bit).  The kvm_pagewalk structure abstracts the details of the
  * current mmu mode.
  */
-struct kvm_pagewalk {
-	unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
-	u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
-	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
-				  struct x86_exception *fault,
-				  bool from_hardware);
-	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
-			    gpa_t gva_or_gpa, u64 access,
-			    struct x86_exception *exception);
-	union kvm_cpu_role cpu_role;
+struct kvm_page_format {
 	struct rsvd_bits_validate guest_rsvd_check;
 
 	/*
@@ -544,6 +535,20 @@ struct kvm_pagewalk {
 	u16 permissions[16];
 };
 
+struct kvm_pagewalk {
+	unsigned long (*get_guest_pgd)(struct kvm_vcpu *vcpu);
+	u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index);
+	void (*inject_page_fault)(struct kvm_vcpu *vcpu,
+				  struct x86_exception *fault,
+				  bool from_hardware);
+	gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
+			    gpa_t gva_or_gpa, u64 access,
+			    struct x86_exception *exception);
+
+	union kvm_cpu_role cpu_role;
+	struct kvm_page_format fmt;
+};
+
 struct kvm_mmu {
 	int (*page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault);
 	int (*sync_spte)(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 9d00d0eb230b..c9f628b97dae 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -294,15 +294,16 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
 	u64 implicit_access = access & PFERR_IMPLICIT_ACCESS;
 	bool not_smap = ((rflags & X86_EFLAGS_AC) | implicit_access) == X86_EFLAGS_AC;
 	int index = (pfec | (not_smap ? PFERR_RSVD_MASK : 0)) >> 1;
+	struct kvm_page_format *fmt = &w->fmt;
 	u32 errcode = PFERR_PRESENT_MASK;
 	bool fault;
 
 	kvm_mmu_refresh_passthrough_bits(vcpu, w);
 
-	fault = (w->permissions[index] >> pte_access) & 1;
+	fault = (fmt->permissions[index] >> pte_access) & 1;
 
 	WARN_ON_ONCE(pfec & (PFERR_PK_MASK | PFERR_SS_MASK | PFERR_RSVD_MASK));
-	if (unlikely(w->pkru_mask)) {
+	if (unlikely(fmt->pkru_mask)) {
 		u32 pkru_bits, offset;
 
 		/*
@@ -316,7 +317,7 @@ static inline u8 permission_fault(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
 		/* clear present bit, replace PFEC.RSVD with ACC_USER_MASK. */
 		offset = (pfec & ~1) | ((pte_access & PT_USER_MASK) ? PFERR_RSVD_MASK : 0);
 
-		pkru_bits &= w->pkru_mask >> offset;
+		pkru_bits &= fmt->pkru_mask >> offset;
 		errcode |= -pkru_bits & PFERR_PK_MASK;
 		fault |= (pkru_bits != 0);
 	}
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 71ad933d34c9..f70892b15680 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5479,7 +5479,7 @@ static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
 static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
 					struct kvm_pagewalk *w)
 {
-	__reset_rsvds_bits_mask(&w->guest_rsvd_check,
+	__reset_rsvds_bits_mask(&w->fmt.guest_rsvd_check,
 				vcpu->arch.reserved_gpa_bits,
 				w->cpu_role.base.level, is_efer_nx(w),
 				guest_cpu_cap_has(vcpu, X86_FEATURE_GBPAGES),
@@ -5528,7 +5528,7 @@ static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
 static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
 		bool execonly, int huge_page_level)
 {
-	__reset_rsvds_bits_mask_ept(&vcpu->arch.ngpa_walk.guest_rsvd_check,
+	__reset_rsvds_bits_mask_ept(&vcpu->arch.ngpa_walk.fmt.guest_rsvd_check,
 				    vcpu->arch.reserved_gpa_bits, execonly,
 				    huge_page_level);
 }
@@ -5682,7 +5682,7 @@ static void update_permission_bitmask(struct kvm_pagewalk *pw, bool tdp, bool ep
 	 * permission_fault() to indicate accesses that are *not* subject to
 	 * SMAP restrictions.
 	 */
-	for (index = 0; index < ARRAY_SIZE(pw->permissions); ++index) {
+	for (index = 0; index < ARRAY_SIZE(pw->fmt.permissions); ++index) {
 		unsigned pfec = index << 1;
 
 		/*
@@ -5756,7 +5756,7 @@ static void update_permission_bitmask(struct kvm_pagewalk *pw, bool tdp, bool ep
 				smapf = (pfec & (PFERR_RSVD_MASK|PFERR_FETCH_MASK)) ? 0 : kf;
 		}
 
-		pw->permissions[index] = ff | uf | wf | rf | smapf;
+		pw->fmt.permissions[index] = ff | uf | wf | rf | smapf;
 	}
 }
 
@@ -5789,14 +5789,14 @@ static void update_pkru_bitmask(struct kvm_pagewalk *w)
 	unsigned bit;
 	bool wp;
 
-	w->pkru_mask = 0;
+	w->fmt.pkru_mask = 0;
 
 	if (!is_cr4_pke(w))
 		return;
 
 	wp = is_cr0_wp(w);
 
-	for (bit = 0; bit < ARRAY_SIZE(w->permissions); ++bit) {
+	for (bit = 0; bit < ARRAY_SIZE(w->fmt.permissions); ++bit) {
 		unsigned pfec, pkey_bits;
 		bool check_pkey, check_write, ff, uf, wf, pte_user;
 
@@ -5824,7 +5824,7 @@ static void update_pkru_bitmask(struct kvm_pagewalk *w)
 		/* PKRU.WD stops write access. */
 		pkey_bits |= (!!check_write) << 1;
 
-		w->pkru_mask |= (pkey_bits & 3) << pfec;
+		w->fmt.pkru_mask |= (pkey_bits & 3) << pfec;
 	}
 }
 
@@ -6113,7 +6113,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 		context->sync_spte = ept_sync_spte;
 
 		update_permission_bitmask(ngpa_walk, true, true);
-		ngpa_walk->pkru_mask = 0;
+		ngpa_walk->fmt.pkru_mask = 0;
 		reset_rsvds_bits_mask_ept(vcpu, execonly, huge_page_level);
 		reset_ept_shadow_zero_bits_mask(context, execonly);
 	}
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index a46384b7080f..58397f58320f 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -147,10 +147,10 @@ static bool FNAME(is_bad_mt_xwr)(struct rsvd_bits_validate *rsvd_check, u64 gpte
 #endif
 }
 
-static bool FNAME(is_rsvd_bits_set)(struct kvm_pagewalk *w, u64 gpte, int level)
+static bool FNAME(is_rsvd_bits_set)(struct kvm_page_format *fmt, u64 gpte, int level)
 {
-	return __is_rsvd_bits_set(&w->guest_rsvd_check, gpte, level) ||
-	       FNAME(is_bad_mt_xwr)(&w->guest_rsvd_check, gpte);
+	return __is_rsvd_bits_set(&fmt->guest_rsvd_check, gpte, level) ||
+	       FNAME(is_bad_mt_xwr)(&fmt->guest_rsvd_check, gpte);
 }
 
 static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
@@ -167,7 +167,7 @@ static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
 	    !(gpte & PT_GUEST_ACCESSED_MASK))
 		goto no_present;
 
-	if (FNAME(is_rsvd_bits_set)(w, gpte, PG_LEVEL_4K))
+	if (FNAME(is_rsvd_bits_set)(&w->fmt, gpte, PG_LEVEL_4K))
 		goto no_present;
 
 	return false;
@@ -427,7 +427,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 		if (unlikely(!FNAME(is_present_gpte)(w, pte)))
 			goto error;
 
-		if (unlikely(FNAME(is_rsvd_bits_set)(w, pte, walker->level))) {
+		if (unlikely(FNAME(is_rsvd_bits_set)(&w->fmt, pte, walker->level))) {
 			errcode = PFERR_RSVD_MASK | PFERR_PRESENT_MASK;
 			goto error;
 		}
-- 
2.52.0



^ permalink raw reply related

* [PATCH 11/19] KVM: x86/mmu: change walk_mmu to struct kvm_pagewalk
From: Paolo Bonzini @ 2026-06-24 21:30 UTC (permalink / raw)
  To: linux-kernel, kvm
In-Reply-To: <20260624213102.71082-1-pbonzini@redhat.com>

Now that walk_mmu is only accessed for its "w" member, store the
pointer to that member directly.  This also means that nested_mmu
is only accessed for its "w" member.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/hyperv.c           |  2 +-
 arch/x86/kvm/mmu/mmu.c          |  4 ++--
 arch/x86/kvm/mmu/paging_tmpl.h  |  4 ++--
 arch/x86/kvm/regs.c             |  7 ++++---
 arch/x86/kvm/svm/nested.c       |  4 ++--
 arch/x86/kvm/vmx/nested.c       |  4 ++--
 arch/x86/kvm/x86.c              | 37 +++++++++++++++++----------------
 8 files changed, 33 insertions(+), 31 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index bf1151c91372..a64deb5c05eb 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -923,7 +923,7 @@ struct kvm_vcpu_arch {
 	 * Pointer to the mmu context currently used for
 	 * gva_to_gpa translations.
 	 */
-	struct kvm_mmu *walk_mmu;
+	struct kvm_pagewalk *gva_walk;
 
 	u64 pdptrs[4]; /* pae */
 
diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index e4a0ca0f9fd4..51d812babe73 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -2046,7 +2046,7 @@ static u64 kvm_hv_flush_tlb(struct kvm_vcpu *vcpu, struct kvm_hv_hcall *hc)
 	 * read with kvm_read_guest().
 	 */
 	if (!hc->fast) {
-		hc->ingpa = kvm_translate_gpa(vcpu, &vcpu->arch.walk_mmu->w, hc->ingpa,
+		hc->ingpa = kvm_translate_gpa(vcpu, vcpu->arch.gva_walk, hc->ingpa,
 					      PFERR_GUEST_FINAL_MASK, NULL, 0);
 		if (unlikely(hc->ingpa == INVALID_GPA))
 			return HV_STATUS_INVALID_HYPERCALL_INPUT;
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 30774b562fa1..8ed9876cf3b8 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6730,7 +6730,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 	 * be synced when switching to that new cr3, so nothing needs to be
 	 * done here for them.
 	 */
-	kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.walk_mmu->w, gva, KVM_MMU_ROOTS_ALL);
+	kvm_mmu_invalidate_addr(vcpu, vcpu->arch.gva_walk, gva, KVM_MMU_ROOTS_ALL);
 	++vcpu->stat.invlpg;
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invlpg);
@@ -6867,7 +6867,7 @@ int kvm_mmu_create(struct kvm_vcpu *vcpu)
 		vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
 
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
-	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+	vcpu->arch.gva_walk = &vcpu->arch.root_mmu.w;
 
 	ret = __kvm_mmu_create(vcpu, &vcpu->arch.guest_mmu);
 	if (ret)
diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h
index e04b646f00d2..9cfae71cd3e6 100644
--- a/arch/x86/kvm/mmu/paging_tmpl.h
+++ b/arch/x86/kvm/mmu/paging_tmpl.h
@@ -548,7 +548,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
 	}
 #endif
 	walker->fault.address = addr;
-	walker->fault.nested_page_fault = w != &vcpu->arch.walk_mmu->w;
+	walker->fault.nested_page_fault = w != vcpu->arch.gva_walk;
 	walker->fault.async_page_fault = false;
 
 #if PTTYPE != PTTYPE_EPT
@@ -906,7 +906,7 @@ static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
 
 #ifndef CONFIG_X86_64
 	/* A 64-bit GVA should be impossible on 32-bit KVM. */
-	WARN_ON_ONCE((addr >> 32) && w == &vcpu->arch.walk_mmu->w);
+	WARN_ON_ONCE((addr >> 32) && w == vcpu->arch.gva_walk);
 #endif
 
 	r = FNAME(walk_addr_generic)(&walker, vcpu, w, addr, access);
diff --git a/arch/x86/kvm/regs.c b/arch/x86/kvm/regs.c
index 3b3515fd77e6..02adaa4ef64e 100644
--- a/arch/x86/kvm/regs.c
+++ b/arch/x86/kvm/regs.c
@@ -154,7 +154,7 @@ static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
  */
 int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 {
-	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+	struct kvm_pagewalk *w = vcpu->arch.gva_walk;
 	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
 	gpa_t real_gpa;
 	int i;
@@ -165,7 +165,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 	 * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
 	 * to an L1 GPA.
 	 */
-	real_gpa = kvm_translate_gpa(vcpu, &mmu->w, gfn_to_gpa(pdpt_gfn),
+	real_gpa = kvm_translate_gpa(vcpu, w, gfn_to_gpa(pdpt_gfn),
 				     PFERR_USER_MASK | PFERR_WRITE_MASK |
 				     PFERR_GUEST_PAGE_MASK, NULL, 0);
 	if (real_gpa == INVALID_GPA)
@@ -189,7 +189,8 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
 	 * Shadow page roots need to be reconstructed instead.
 	 */
 	if (!tdp_enabled && memcmp(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)))
-		kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);
+		kvm_mmu_free_roots(vcpu->kvm, &vcpu->arch.root_mmu,
+				   KVM_MMU_ROOT_CURRENT);
 
 	memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs));
 	kvm_register_mark_dirty(vcpu, VCPU_REG_PDPTR);
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 55eace8aa2c8..188e4b06a279 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -118,13 +118,13 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu->w.get_pdptr       = nested_svm_get_tdp_pdptr;
 
 	vcpu->arch.mmu->w.inject_page_fault = nested_svm_inject_npf_exit;
-	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+	vcpu->arch.gva_walk              = &vcpu->arch.nested_mmu.w;
 }
 
 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
-	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+	vcpu->arch.gva_walk = &vcpu->arch.root_mmu.w;
 }
 
 static bool nested_vmcb_needs_vls_intercept(struct vcpu_svm *svm)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 6c4f38cc9896..5adc6a7c6af4 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -517,13 +517,13 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.mmu->w.inject_page_fault = nested_ept_inject_page_fault;
 
-	vcpu->arch.walk_mmu              = &vcpu->arch.nested_mmu;
+	vcpu->arch.gva_walk              = &vcpu->arch.nested_mmu.w;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
 {
 	vcpu->arch.mmu = &vcpu->arch.root_mmu;
-	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
+	vcpu->arch.gva_walk = &vcpu->arch.root_mmu.w;
 }
 
 static bool nested_vmx_is_page_fault_vmexit(struct vmcs12 *vmcs12,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 0f76e52e2695..43625cc9e934 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -582,11 +582,12 @@ void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 				      struct x86_exception *fault,
 				      bool from_hardware)
 {
-	struct kvm_mmu *fault_mmu;
+	struct kvm_pagewalk *fault_walk;
+
 	WARN_ON_ONCE(fault->vector != PF_VECTOR);
 
-	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
-					       vcpu->arch.walk_mmu;
+	fault_walk = fault->nested_page_fault ? &vcpu->arch.mmu->w :
+						vcpu->arch.gva_walk;
 
 	/*
 	 * Invalidate the TLB entry for the faulting address, if it exists,
@@ -594,10 +595,10 @@ void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 	 */
 	if ((fault->error_code & PFERR_PRESENT_MASK) &&
 	    !(fault->error_code & PFERR_RSVD_MASK))
-		kvm_mmu_invalidate_addr(vcpu, &fault_mmu->w, fault->address,
+		kvm_mmu_invalidate_addr(vcpu, fault_walk, fault->address,
 					KVM_MMU_ROOT_CURRENT);
 
-	fault_mmu->w.inject_page_fault(vcpu, fault, from_hardware);
+	fault_walk->inject_page_fault(vcpu, fault, from_hardware);
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(__kvm_inject_emulated_page_fault);
 
@@ -4768,7 +4769,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
 gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
 			      struct x86_exception *exception)
 {
-	struct kvm_pagewalk *gva_walk = &vcpu->arch.walk_mmu->w;
+	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
 
 	u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	return gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, exception);
@@ -4778,7 +4779,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_read);
 gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
 			       struct x86_exception *exception)
 {
-	struct kvm_pagewalk *gva_walk = &vcpu->arch.walk_mmu->w;
+	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
 
 	u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	access |= PFERR_WRITE_MASK;
@@ -4790,7 +4791,7 @@ EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_gva_to_gpa_write);
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
 				struct x86_exception *exception)
 {
-	struct kvm_pagewalk *gva_walk = &vcpu->arch.walk_mmu->w;
+	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
 
 	return gva_walk->gva_to_gpa(vcpu, gva_walk, gva, 0, exception);
 }
@@ -4799,7 +4800,7 @@ static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
 				      struct kvm_vcpu *vcpu, u64 access,
 				      struct x86_exception *exception)
 {
-	struct kvm_pagewalk *gva_walk = &vcpu->arch.walk_mmu->w;
+	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
 
@@ -4832,7 +4833,7 @@ static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
 				struct x86_exception *exception)
 {
 	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
-	struct kvm_pagewalk *gva_walk = &vcpu->arch.walk_mmu->w;
+	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
 	u64 access = (kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0;
 	unsigned offset;
 	int ret;
@@ -4891,7 +4892,7 @@ static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes
 				      struct kvm_vcpu *vcpu, u64 access,
 				      struct x86_exception *exception)
 {
-	struct kvm_pagewalk *gva_walk = &vcpu->arch.walk_mmu->w;
+	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
 	void *data = val;
 	int r = X86EMUL_CONTINUE;
 
@@ -4997,7 +4998,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 				gpa_t *gpa, struct x86_exception *exception,
 				bool write)
 {
-	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
 	u64 access = ((kvm_x86_call(get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0)
 		     | (write ? PFERR_WRITE_MASK : 0);
 
@@ -5007,7 +5008,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 	 * shadow page table for L2 guest.
 	 */
 	if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) ||
-	    !permission_fault(vcpu, &vcpu->arch.walk_mmu->w,
+	    !permission_fault(vcpu, gva_walk,
 			      vcpu->arch.mmio_access, 0, access))) {
 		*gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT |
 					(gva & (PAGE_SIZE - 1));
@@ -5015,7 +5016,7 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva,
 		return 1;
 	}
 
-	*gpa = mmu->w.gva_to_gpa(vcpu, &mmu->w, gva, access, exception);
+	*gpa = gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, exception);
 
 	if (*gpa == INVALID_GPA)
 		return -1;
@@ -10600,15 +10601,15 @@ void kvm_arch_gmem_invalidate(kvm_pfn_t start, kvm_pfn_t end)
 
 void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
 {
-	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
+	struct kvm_pagewalk *gva_walk = vcpu->arch.gva_walk;
 	struct x86_exception fault;
 	u64 access = error_code &
 		(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);
 
 	if (!(error_code & PFERR_PRESENT_MASK) ||
-	    mmu->w.gva_to_gpa(vcpu, &mmu->w, gva, access, &fault) != INVALID_GPA) {
+	    gva_walk->gva_to_gpa(vcpu, gva_walk, gva, access, &fault) != INVALID_GPA) {
 		/*
-		 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
+		 * If gva_walk->gva_to_gpa succeeded, the page
 		 * tables probably do not match the TLB.  Just proceed
 		 * with the error code that the processor gave.
 		 */
@@ -10619,7 +10620,7 @@ void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_c
 		fault.address = gva;
 		fault.async_page_fault = false;
 	}
-	vcpu->arch.walk_mmu->w.inject_page_fault(vcpu, &fault, true);
+	gva_walk->inject_page_fault(vcpu, &fault, true);
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_fixup_and_inject_pf_error);
 
-- 
2.52.0



^ permalink raw reply related

* [PATCH 15/19] KVM: x86/mmu: cleanup functions that initialize shadow MMU
From: Paolo Bonzini @ 2026-06-24 21:30 UTC (permalink / raw)
  To: linux-kernel, kvm
In-Reply-To: <20260624213102.71082-1-pbonzini@redhat.com>

Now that the GVA->GPA page walker is initialized independently,
init_kvm_softmmu() does nothing more than call
kvm_init_shadow_mmu(), so eliminate it from the call chain.
At the same time, rename kvm_init_shadow_mmu() to
init_kvm_shadow_mmu() for consistency with init_kvm_tdp_mmu().

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu/mmu.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index a464e3ec26ee..71ad933d34c9 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -5990,7 +5990,7 @@ static void shadow_mmu_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *conte
 	reset_shadow_zero_bits_mask(vcpu, context);
 }
 
-static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
+static void init_kvm_shadow_mmu(struct kvm_vcpu *vcpu,
 				union kvm_cpu_role cpu_role)
 {
 	struct kvm_mmu *context = &vcpu->arch.root_mmu;
@@ -6122,12 +6122,6 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_shadow_ept_mmu);
 
-static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
-			     union kvm_cpu_role cpu_role)
-{
-	kvm_init_shadow_mmu(vcpu, cpu_role);
-}
-
 void kvm_init_mmu(struct kvm_vcpu *vcpu)
 {
 	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
@@ -6139,7 +6133,7 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu)
 		if (tdp_enabled)
 			init_kvm_tdp_mmu(vcpu, cpu_role);
 		else
-			init_kvm_softmmu(vcpu, cpu_role);
+			init_kvm_shadow_mmu(vcpu, cpu_role);
 	}
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_init_mmu);
-- 
2.52.0



^ permalink raw reply related

* [PATCH 12/19] KVM: x86/mmu: change nested_mmu.w to ngva_walk
From: Paolo Bonzini @ 2026-06-24 21:30 UTC (permalink / raw)
  To: linux-kernel, kvm
In-Reply-To: <20260624213102.71082-1-pbonzini@redhat.com>

nested_mmu is now only used for its w member.  Rename it to
ngva_walk, and change its type from struct kvm_mmu to
struct kvm_pagewalk.

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/include/asm/kvm_host.h |  5 ++--
 arch/x86/kvm/mmu.h              |  6 ++---
 arch/x86/kvm/mmu/mmu.c          | 41 ++++++++++++++-------------------
 arch/x86/kvm/svm/nested.c       |  2 +-
 arch/x86/kvm/vmx/nested.c       |  2 +-
 5 files changed, 24 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a64deb5c05eb..25288d73cce7 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -917,11 +917,10 @@ struct kvm_vcpu_arch {
 	 * walking and not for faulting since we never handle l2 page faults on
 	 * the host.
 	 */
-	struct kvm_mmu nested_mmu;
+	struct kvm_pagewalk ngva_walk;
 
 	/*
-	 * Pointer to the mmu context currently used for
-	 * gva_to_gpa translations.
+	 * Pagewalk context used for gva_to_gpa translations.
 	 */
 	struct kvm_pagewalk *gva_walk;
 
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 4f48e2ca2dac..2610b996e144 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -254,8 +254,8 @@ static inline void kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
 	 * be stale.  Refresh CR0.WP and the metadata on-demand when checking
 	 * for permission faults.  Exempt nested MMUs, i.e. MMUs for shadowing
 	 * nEPT and nNPT, as CR0.WP is ignored in both cases.  Note, KVM does
-	 * need to refresh nested_mmu, a.k.a. the walker used to translate L2
-	 * GVAs to GPAs, as that "MMU" needs to honor L2's CR0.WP.
+	 * need to refresh ngva_walk, a.k.a. the walker used to translate L2
+	 * GVAs to GPAs, so as to honor L2's CR0.WP.
 	 */
 	if (!tdp_enabled || w == &vcpu->arch.guest_mmu.w)
 		return;
@@ -382,7 +382,7 @@ static inline gpa_t kvm_translate_gpa(struct kvm_vcpu *vcpu,
 				      struct x86_exception *exception,
 				      u64 pte_access)
 {
-	if (w != &vcpu->arch.nested_mmu.w)
+	if (w != &vcpu->arch.ngva_walk)
 		return gpa;
 	return kvm_x86_ops.nested_ops->translate_nested_gpa(vcpu, gpa, access,
 							    exception,
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 8ed9876cf3b8..8b5ffd78565b 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6126,43 +6126,37 @@ static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
 	context->w.get_guest_pgd     = get_guest_cr3;
 }
 
-static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
+static void init_kvm_ngva_walk(struct kvm_vcpu *vcpu,
 				union kvm_cpu_role new_mode)
 {
-	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
+	struct kvm_pagewalk *g_context = &vcpu->arch.ngva_walk;
 
-	if (new_mode.as_u64 == g_context->w.cpu_role.as_u64)
+	if (new_mode.as_u64 == g_context->cpu_role.as_u64)
 		return;
 
-	g_context->w.cpu_role.as_u64   = new_mode.as_u64;
-	g_context->w.inject_page_fault = kvm_inject_page_fault;
-	g_context->w.get_pdptr         = kvm_pdptr_read;
-	g_context->w.get_guest_pgd     = get_guest_cr3;
-
-	/*
-	 * L2 page tables are never shadowed, so there is no need to sync
-	 * SPTEs.
-	 */
-	g_context->sync_spte         = NULL;
+	g_context->cpu_role.as_u64   = new_mode.as_u64;
+	g_context->inject_page_fault = kvm_inject_page_fault;
+	g_context->get_pdptr         = kvm_pdptr_read;
+	g_context->get_guest_pgd     = get_guest_cr3;
 
 	/*
 	 * Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
 	 * L1's nested page tables (e.g. EPT12). The nested translation
-	 * of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
+	 * of l2_gva to l1_gpa is done by arch.ngva_walk.gva_to_gpa using
 	 * L2's page tables as the first level of translation and L1's
 	 * nested page tables as the second level of translation. Basically
-	 * the gva_to_gpa functions between mmu and nested_mmu are swapped.
+	 * the gva_to_gpa functions between mmu and ngva_walk are swapped.
 	 */
 	if (!is_paging(vcpu))
-		g_context->w.gva_to_gpa = nonpaging_gva_to_gpa;
+		g_context->gva_to_gpa = nonpaging_gva_to_gpa;
 	else if (is_long_mode(vcpu))
-		g_context->w.gva_to_gpa = paging64_gva_to_gpa;
+		g_context->gva_to_gpa = paging64_gva_to_gpa;
 	else if (is_pae(vcpu))
-		g_context->w.gva_to_gpa = paging64_gva_to_gpa;
+		g_context->gva_to_gpa = paging64_gva_to_gpa;
 	else
-		g_context->w.gva_to_gpa = paging32_gva_to_gpa;
+		g_context->gva_to_gpa = paging32_gva_to_gpa;
 
-	reset_guest_paging_metadata(vcpu, &g_context->w);
+	reset_guest_paging_metadata(vcpu, g_context);
 }
 
 void kvm_init_mmu(struct kvm_vcpu *vcpu)
@@ -6171,7 +6165,7 @@ void kvm_init_mmu(struct kvm_vcpu *vcpu)
 	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, &regs);
 
 	if (mmu_is_nested(vcpu))
-		init_kvm_nested_mmu(vcpu, cpu_role);
+		init_kvm_ngva_walk(vcpu, cpu_role);
 	else if (tdp_enabled)
 		init_kvm_tdp_mmu(vcpu, cpu_role);
 	else
@@ -6195,10 +6189,9 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
 	 */
 	vcpu->arch.root_mmu.root_role.invalid = 1;
 	vcpu->arch.guest_mmu.root_role.invalid = 1;
-	vcpu->arch.nested_mmu.root_role.invalid = 1;
 	vcpu->arch.root_mmu.w.cpu_role.ext.valid = 0;
 	vcpu->arch.guest_mmu.w.cpu_role.ext.valid = 0;
-	vcpu->arch.nested_mmu.w.cpu_role.ext.valid = 0;
+	vcpu->arch.ngva_walk.cpu_role.ext.valid = 0;
 	kvm_mmu_reset_context(vcpu);
 
 	KVM_BUG_ON(!kvm_can_set_cpuid_and_feature_msrs(vcpu), vcpu->kvm);
@@ -6700,7 +6693,7 @@ void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
 			return;
 
 		kvm_x86_call(flush_tlb_gva)(vcpu, addr);
-		if (w == &vcpu->arch.nested_mmu.w)
+		if (w == &vcpu->arch.ngva_walk)
 			return;
 	}
 
diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c
index 188e4b06a279..713b2508b8ca 100644
--- a/arch/x86/kvm/svm/nested.c
+++ b/arch/x86/kvm/svm/nested.c
@@ -118,7 +118,7 @@ static void nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
 	vcpu->arch.mmu->w.get_pdptr       = nested_svm_get_tdp_pdptr;
 
 	vcpu->arch.mmu->w.inject_page_fault = nested_svm_inject_npf_exit;
-	vcpu->arch.gva_walk              = &vcpu->arch.nested_mmu.w;
+	vcpu->arch.gva_walk              = &vcpu->arch.ngva_walk;
 }
 
 static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 5adc6a7c6af4..345ee3323a93 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -517,7 +517,7 @@ static void nested_ept_init_mmu_context(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.mmu->w.inject_page_fault = nested_ept_inject_page_fault;
 
-	vcpu->arch.gva_walk              = &vcpu->arch.nested_mmu.w;
+	vcpu->arch.gva_walk              = &vcpu->arch.ngva_walk;
 }
 
 static void nested_ept_uninit_mmu_context(struct kvm_vcpu *vcpu)
-- 
2.52.0



^ permalink raw reply related

* [PATCH 10/19] KVM: x86/mmu: pass struct kvm_pagewalk to kvm_mmu_invalidate_addr
From: Paolo Bonzini @ 2026-06-24 21:30 UTC (permalink / raw)
  To: linux-kernel, kvm
In-Reply-To: <20260624213102.71082-1-pbonzini@redhat.com>

kvm_mmu_invalidate_addr only needs to know if what's being invalidated
is a GVA or GPA.  This will ultimately be represented by two different
kvm_pagewalk structs, so adjust the type of the parameter.

For now the GVA case is represented by both root_mmu and nested_mmu.
Since nested_mmu never has a sync_spte callback, the function would
return at the !mmu->sync_spte check anyway; but really nested_mmu should
not be a kvm_mmu in the first place and the container_of() would be
bogus, so introduce a separate check for whether the invalidation is
happening for a nested GVA.  In that case nothing is needed beyond
kvm_x86_call(flush_tlb_gva).

Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/x86/kvm/mmu.h        |  2 +-
 arch/x86/kvm/mmu/mmu.c    | 12 ++++++++----
 arch/x86/kvm/vmx/nested.c |  2 +-
 arch/x86/kvm/x86.c        |  2 +-
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index c404ed9dcff2..4f48e2ca2dac 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -151,7 +151,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
 		       void *insn, int insn_len);
 void kvm_mmu_print_sptes(struct kvm_vcpu *vcpu, gpa_t gpa, const char *msg);
 void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
-void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
 			     u64 addr, unsigned long roots);
 void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid);
 void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd);
diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c
index 29f2948bd38d..30774b562fa1 100644
--- a/arch/x86/kvm/mmu/mmu.c
+++ b/arch/x86/kvm/mmu/mmu.c
@@ -6685,22 +6685,26 @@ static void __kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu
 	write_unlock(&vcpu->kvm->mmu_lock);
 }
 
-void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
+void kvm_mmu_invalidate_addr(struct kvm_vcpu *vcpu, struct kvm_pagewalk *w,
 			     u64 addr, unsigned long roots)
 {
+	struct kvm_mmu *mmu;
 	int i;
 
 	WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
 
 	/* It's actually a GPA for vcpu->arch.guest_mmu.  */
-	if (mmu != &vcpu->arch.guest_mmu) {
+	if (w != &vcpu->arch.guest_mmu.w) {
 		/* INVLPG on a non-canonical address is a NOP according to the SDM.  */
 		if (is_noncanonical_invlpg_address(addr, vcpu))
 			return;
 
 		kvm_x86_call(flush_tlb_gva)(vcpu, addr);
+		if (w == &vcpu->arch.nested_mmu.w)
+			return;
 	}
 
+	mmu = container_of(w, struct kvm_mmu, w);
 	if (!mmu->sync_spte)
 		return;
 
@@ -6726,7 +6730,7 @@ void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
 	 * be synced when switching to that new cr3, so nothing needs to be
 	 * done here for them.
 	 */
-	kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
+	kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.walk_mmu->w, gva, KVM_MMU_ROOTS_ALL);
 	++vcpu->stat.invlpg;
 }
 EXPORT_SYMBOL_FOR_KVM_INTERNAL(kvm_mmu_invlpg);
@@ -6748,7 +6752,7 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid)
 	}
 
 	if (roots)
-		kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
+		kvm_mmu_invalidate_addr(vcpu, &mmu->w, gva, roots);
 	++vcpu->stat.invlpg;
 
 	/*
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 466f838a05bc..6c4f38cc9896 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -408,7 +408,7 @@ static void nested_ept_invalidate_addr(struct kvm_vcpu *vcpu, gpa_t eptp,
 			roots |= KVM_MMU_ROOT_PREVIOUS(i);
 	}
 	if (roots)
-		kvm_mmu_invalidate_addr(vcpu, vcpu->arch.mmu, addr, roots);
+		kvm_mmu_invalidate_addr(vcpu, &vcpu->arch.guest_mmu.w, addr, roots);
 }
 
 static void nested_ept_inject_page_fault(struct kvm_vcpu *vcpu,
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 484ed409e420..0f76e52e2695 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -594,7 +594,7 @@ void __kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
 	 */
 	if ((fault->error_code & PFERR_PRESENT_MASK) &&
 	    !(fault->error_code & PFERR_RSVD_MASK))
-		kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
+		kvm_mmu_invalidate_addr(vcpu, &fault_mmu->w, fault->address,
 					KVM_MMU_ROOT_CURRENT);
 
 	fault_mmu->w.inject_page_fault(vcpu, fault, from_hardware);
-- 
2.52.0



^ permalink raw reply related


This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.