LinuxPPC-Dev Archive on lore.kernel.org

LinuxPPC-Dev Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v3 2/9] pseries/papr-hvpipe: Prevent kernel stack memory leak to userspace
From: Ritesh Harjani (IBM) @ 2026-05-01  4:11 UTC (permalink / raw)
  To: linuxppc-dev, Haren Myneni
  Cc: Madhavan Srinivasan, Christophe Leroy, Venkat Rao Bagalkote,
	Nicholas Piggin, linux-kernel, Ritesh Harjani (IBM), stable
In-Reply-To: <cover.1777606826.git.ritesh.list@gmail.com>

The hdr variable is allocated on the stack and only hdr.version and
hdr.flags are initialized explicitly. Because struct papr_hvpipe_hdr
also contains reserved padding bytes (reserved[3] and reserved2[40]),
those uninitialized bytes could be leaked to userspace by copy_to_user().

This patch fixes that by initializing the whole struct to 0.

Cc: stable@vger.kernel.org
Fixes: cebdb522fd3ed ("powerpc/pseries: Receive payload with ibm,receive-hvpipe-msg RTAS")
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 arch/powerpc/platforms/pseries/papr-hvpipe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c
index c41d45e1986d..3392874ebdf6 100644
--- a/arch/powerpc/platforms/pseries/papr-hvpipe.c
+++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c
@@ -327,7 +327,7 @@ static ssize_t papr_hvpipe_handle_read(struct file *file,
 {
 
 	struct hvpipe_source_info *src_info = file->private_data;
-	struct papr_hvpipe_hdr hdr;
+	struct papr_hvpipe_hdr hdr = {};
 	long ret;
 
 	/*
-- 
2.39.5



^ permalink raw reply related

* [PATCH v3 1/9] pseries/papr-hvpipe: Fix race with interrupt handler
From: Ritesh Harjani (IBM) @ 2026-05-01  4:11 UTC (permalink / raw)
  To: linuxppc-dev, Haren Myneni
  Cc: Madhavan Srinivasan, Christophe Leroy, Venkat Rao Bagalkote,
	Nicholas Piggin, linux-kernel, Ritesh Harjani (IBM), stable
In-Reply-To: <cover.1777606826.git.ritesh.list@gmail.com>

While executing ->ioctl handler or ->release handler, if an interrupt
fires on the same cpu, then we can enter into a deadlock.

This patch fixes both these handlers to take spin_lock_irq{save|restore}
versions of the lock to prevent this deadlock.

Cc: stable@vger.kernel.org
Fixes: 814ef095f12c9 ("powerpc/pseries: Add papr-hvpipe char driver for HVPIPE interfaces")
Signed-off-by: Ritesh Harjani (IBM) <ritesh.list@gmail.com>
---
 arch/powerpc/platforms/pseries/papr-hvpipe.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/papr-hvpipe.c b/arch/powerpc/platforms/pseries/papr-hvpipe.c
index 14ae480d060a..c41d45e1986d 100644
--- a/arch/powerpc/platforms/pseries/papr-hvpipe.c
+++ b/arch/powerpc/platforms/pseries/papr-hvpipe.c
@@ -444,13 +444,14 @@ static int papr_hvpipe_handle_release(struct inode *inode,
 				struct file *file)
 {
 	struct hvpipe_source_info *src_info;
+	unsigned long flags;
 
 	/*
 	 * Hold the lock, remove source from src_list, reset the
 	 * hvpipe status and release the lock to prevent any race
 	 * with message event IRQ.
 	 */
-	spin_lock(&hvpipe_src_list_lock);
+	spin_lock_irqsave(&hvpipe_src_list_lock, flags);
 	src_info = file->private_data;
 	list_del(&src_info->list);
 	file->private_data = NULL;
@@ -461,10 +462,10 @@ static int papr_hvpipe_handle_release(struct inode *inode,
 	 */
 	if (src_info->hvpipe_status & HVPIPE_MSG_AVAILABLE) {
 		src_info->hvpipe_status = 0;
-		spin_unlock(&hvpipe_src_list_lock);
+		spin_unlock_irqrestore(&hvpipe_src_list_lock, flags);
 		hvpipe_rtas_recv_msg(NULL, 0);
 	} else
-		spin_unlock(&hvpipe_src_list_lock);
+		spin_unlock_irqrestore(&hvpipe_src_list_lock, flags);
 
 	kfree(src_info);
 	return 0;
@@ -480,20 +481,21 @@ static const struct file_operations papr_hvpipe_handle_ops = {
 static int papr_hvpipe_dev_create_handle(u32 srcID)
 {
 	struct hvpipe_source_info *src_info __free(kfree) = NULL;
+	unsigned long flags;
 
-	spin_lock(&hvpipe_src_list_lock);
+	spin_lock_irqsave(&hvpipe_src_list_lock, flags);
 	/*
 	 * Do not allow more than one process communicates with
 	 * each source.
 	 */
 	src_info = hvpipe_find_source(srcID);
 	if (src_info) {
-		spin_unlock(&hvpipe_src_list_lock);
+		spin_unlock_irqrestore(&hvpipe_src_list_lock, flags);
 		pr_err("pid(%d) is already using the source(%d)\n",
 				src_info->tsk->pid, srcID);
 		return -EALREADY;
 	}
-	spin_unlock(&hvpipe_src_list_lock);
+	spin_unlock_irqrestore(&hvpipe_src_list_lock, flags);
 
 	src_info = kzalloc_obj(*src_info, GFP_KERNEL_ACCOUNT);
 	if (!src_info)
@@ -510,18 +512,18 @@ static int papr_hvpipe_dev_create_handle(u32 srcID)
 		return fdf.err;
 
 	retain_and_null_ptr(src_info);
-	spin_lock(&hvpipe_src_list_lock);
+	spin_lock_irqsave(&hvpipe_src_list_lock, flags);
 	/*
 	 * If two processes are executing ioctl() for the same
 	 * source ID concurrently, prevent the second process to
 	 * acquire FD.
 	 */
 	if (hvpipe_find_source(srcID)) {
-		spin_unlock(&hvpipe_src_list_lock);
+		spin_unlock_irqrestore(&hvpipe_src_list_lock, flags);
 		return -EALREADY;
 	}
 	list_add(&src_info->list, &hvpipe_src_list);
-	spin_unlock(&hvpipe_src_list_lock);
+	spin_unlock_irqrestore(&hvpipe_src_list_lock, flags);
 	return fd_publish(fdf);
 }
 
-- 
2.39.5



^ permalink raw reply related

* [PATCH v3 0/9] pseries/papr-hvpipe: Fix deadlock, races and misc cleanups
From: Ritesh Harjani (IBM) @ 2026-05-01  4:11 UTC (permalink / raw)
  To: linuxppc-dev, Haren Myneni
  Cc: Madhavan Srinivasan, Christophe Leroy, Venkat Rao Bagalkote,
	Nicholas Piggin, linux-kernel, Ritesh Harjani (IBM)

While going over the papr-hvpipe code, a few issues were identified.
This patch series is an attempt to fix those along with some misc cleanups.
Haren and I are trying to get these patches verified on real HW. The tests
are not straightforward and we are waiting for the results.
Will update on the test results once we hear back from the internal test team.

v2->v3:
======
1. Rearranged the patches in such a way that it is easier to backport the fixes
   if required.
2. Clubbed patch-8 and patch-10 (of v2) since they both were changing the same function.
3. Handled the ret>=0 case in the copy_to_user patch, where the user may itself
   request 0 effective bytes (after the HDR_LEN).

[v2]: https://lore.kernel.org/linuxppc-dev/cover.1775648406.git.ritesh.list@gmail.com/

v1->v2:
========
1. Fix a possible deadlock due to use of spin_lock instead of spin_lock_irqsave.
2. Prevent kernel stack uninit memory leak to userspace
3. Fix the race condition in the null-ptr-deref case where there may be a
   msg pending to be consumed from the hvpipe.
4. Fixed error handling in init routine in patch-10

[v1]: https://lore.kernel.org/linuxppc-dev/cover.1775569027.git.ritesh.list@gmail.com/#t

Ritesh Harjani (IBM) (9):
  pseries/papr-hvpipe: Fix race with interrupt handler
  pseries/papr-hvpipe: Prevent kernel stack memory leak to userspace
  pseries/papr-hvpipe: Fix null ptr deref in papr_hvpipe_dev_create_handle()
  pseries/papr-hvpipe: Fix & simplify error handling in papr_hvpipe_init()
  pseries/papr-hvpipe: Fix the usage of copy_to_user()
  pseries/papr-hvpipe: Simplify spin unlock usage in papr_hvpipe_handle_release()
  pseries/papr-hvpipe: Kill task_struct pointer from struct hvpipe_source_info
  pseries/papr-hvpipe: Refactor and simplify hvpipe_rtas_recv_msg()
  pseries/papr-hvpipe: Fix style and checkpatch issues in enable_hvpipe_IRQ()

 arch/powerpc/platforms/pseries/papr-hvpipe.c | 181 ++++++++++---------
 arch/powerpc/platforms/pseries/papr-hvpipe.h |   1 -
 2 files changed, 97 insertions(+), 85 deletions(-)

--
2.39.5

^ permalink raw reply

* Re: [PATCH v2 0/5] ASoC: fsl-asoc-card: Add some improvements
From: Mark Brown @ 2026-04-30 12:08 UTC (permalink / raw)
  To: shengjiu.wang, Xiubo.Lee, festevam, nicoleotsuka, lgirdwood,
	perex, tiwai, linux-sound, linuxppc-dev, linux-kernel,
	Shengjiu Wang
In-Reply-To: <20260429100028.2739711-1-shengjiu.wang@nxp.com>

On Wed, 29 Apr 2026 18:00:22 +0800, Shengjiu Wang wrote:
> ASoC: fsl-asoc-card: Add some improvements
> 
> This patch series addresses several issues in the Freescale Generic ASoC
> Sound Card driver related to hardware limitations, DPCM path switching,
> and codec-specific constraints.
> 
> The fsl-asoc-card driver provides a generic machine driver for i.MX SoCs,
> supporting various codecs and optional ASRC (Asynchronous Sample Rate
> Converter) for sample rate conversion. During testing several issues were
> identified:
> 
> [...]

Applied to

   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-7.2

Thanks!

[1/5] ASoC: fsl-asoc-card: enable dpcm_merged_chan flag for ASRC frontend
      https://git.kernel.org/broonie/sound/c/3b9c088aeabf
[2/5] ASoC: fsl-asoc-card: enable ignore_pmdown_time for ASRC case
      https://git.kernel.org/broonie/sound/c/d611feb52de8
[3/5] ASoC: fsl-asoc-card: add channel and rate constraints for CS42888
      https://git.kernel.org/broonie/sound/c/e78abe395d30
[4/5] ASoC: fsl-asoc-card: exclude S20_3LE format for WM8960/WM8962 + SAI
      https://git.kernel.org/broonie/sound/c/fa7d8ea56c58
[5/5] ASoC: fsl-asoc-card: reduce WM8904 PLL ratio to meet frequency limit
      https://git.kernel.org/broonie/sound/c/99b5316f08f3

All being well this means that it will be integrated into the linux-next
tree (usually sometime in the next 24 hours) and sent to Linus during
the next merge window (or sooner if it is a bug fix), however if
problems are discovered then the patch may be dropped or reverted.

You may get further e-mails resulting from automated or manual testing
and review of the tree, please engage with people reporting problems and
send followup patches addressing any issues that are reported if needed.

If any updates are required or you are submitting further changes they
should be sent as incremental updates against current git, existing
patches will not be replaced.

Please add any relevant lists and maintainers to the CCs when replying
to this mail.

Thanks,
Mark

^ permalink raw reply

* Re: [PATCH v6 04/24] PCI/sysfs: Use BAR length in pci_llseek_resource() when attr->size is zero
From: Krzysztof Wilczyński @ 2026-04-30 23:32 UTC (permalink / raw)
  To: Bjorn Helgaas
  Cc: Bjorn Helgaas, Manivannan Sadhasivam, Lorenzo Pieralisi,
	Magnus Lindholm, Matt Turner, Richard Henderson, Christophe Leroy,
	Madhavan Srinivasan, Michael Ellerman, Nicholas Piggin,
	Dexuan Cui, Krzysztof Hałasa, Lukas Wunner,
	Oliver O'Halloran, Saurabh Singh Sengar, Shuan He,
	Srivatsa Bhat, Ilpo Järvinen, linux-pci, linux-alpha,
	linuxppc-dev
In-Reply-To: <20260429203625.GA3724801@rocinante>

Hello,

> The alternative would be separate llseek callbacks for both the legacy
> and resource attributes, which we can add if this would be the preference
> here.

If we were to do this, then it would be as follows:

  static loff_t pci_llseek_resource(struct file *filep,
                                    struct kobject *kobj,
                                    const struct bin_attribute *attr,
                                    loff_t offset, int whence)
  {
        struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
        int bar = (unsigned long)attr->private;

        return fixed_size_llseek(filep, offset, whence,
                                 pci_resource_len(pdev, bar));
  }

  static loff_t pci_llseek_resource_legacy(struct file *filep,
					   struct kobject *kobj __always_unused,
                                           const struct bin_attribute *attr,
                                           loff_t offset, int whence)
  {
        return fixed_size_llseek(filep, offset, whence, attr->size);
  }

Each callback would be placed within the corresponding #ifdef block, so one
for HAVE_PCI_MMAP or ARCH_GENERIC_PCI_MMAP_RESOURCE, and the other for the
legacy attributes, so behind the HAVE_PCI_LEGACY guard.

Note, the names need to be different, as some architectures offer both
types of resource files, like PowerPC, which defines both HAVE_PCI_LEGACY
and HAVE_PCI_MMAP.

With this split, we can also drop the __maybe_unused annotation.

While I wanted to keep the changes to only what was needed for
pci_llseek_resource() to cover both types of resources, it would
also be fine to have two distinct callbacks.

Thank you!

	Krzysztof


^ permalink raw reply

* Re: [PATCH v2 0/5] mm: reduce mmap_lock contention and improve page fault performance
From: Barry Song @ 2026-04-30 22:49 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: akpm, linux-mm, david, ljs, liam, vbabka, rppt, surenb, mhocko,
	jack, pfalcato, wanglian, chentao, lianux.mm, kunwu.chan,
	liyangouwen1, chrisl, kasong, shikemeng, nphamcs, bhe,
	youngjun.park, linux-arm-kernel, linux-kernel, loongarch,
	linuxppc-dev, linux-riscv, linux-s390
In-Reply-To: <afNM-gIqxpyJ6ro7@casper.infradead.org>

On Thu, Apr 30, 2026 at 8:37 PM Matthew Wilcox <willy@infradead.org> wrote:
>
> On Thu, Apr 30, 2026 at 12:04:22PM +0800, Barry Song (Xiaomi) wrote:
> > (1) If we need to wait for I/O completion, we still drop the per-VMA lock, as
> > current page fault handling already does. Holding it for too long may introduce
> > various priority inversion issues on mobile devices. After I/O completes, we
> > retry the page fault with the per-VMA lock, rather than falling back to
> > mmap_lock.
>
> You're going to have to do better than that.  You know I hate the
> additional complexity you're adding.  You need to explain why my idea of
> ripping out all the complexity now that we have per-VMA locks doesn't
> work.

Yep, I know you don’t like the added complexity, but I would rather prioritize
user experience over simplicity. Let me try to explain in more detail.

1. There is no deterministic latency for I/O completion. It depends on
both the hardware and the software stack (bio/request queues and the
block scheduler). Sometimes the latency is short; at other times it can
be quite long. In such cases, a high-priority thread performing operations
such as mprotect, unmap, prctl_set_vma, or madvise may be forced to wait
for an unpredictable amount of time. For example, if low-priority tasks
trigger page faults and issue low-priority I/O, a high-priority task
requiring the write lock may end up waiting for an unknown amount of time,
depending on the block layer and filesystem behavior.

As a result, high-priority tasks are exposed to unpredictable I/O latency
introduced by many low-priority tasks that may generate a large number of
page faults.

On Android, latency in certain tasks can significantly affect user experience,
such as interactive threads. Priority inversion is particularly problematic and
should be avoided, especially since we have no clear bound on how long we may
have to wait for I/O from other tasks.

Meanwhile, priority inversion can propagate through a long chain: a writer
waiting on I/O from multiple concurrent page faults may end up blocking other
writers and readers as well. A long-waiting writer can also amplify
mmap_lock contention, which we still rely on in many cases.

2. VMA sizes can be highly uneven: some VMAs may be very large while others are
small. We used to have many reasons to release mmap_lock when we did not have a
per-VMA lock. Since VMA sizes are not uniform, those same considerations may
still apply to the per-VMA lock when a small number of VMAs account for most
of a process’s address space. I recall that Suren also mentioned this[1].

So I would prefer that we hold only the per-VMA lock and avoid retrying the
page fault when we are reasonably sure that I/O has already completed and we
are only waiting for short-lived conditions. Uncertainties in the block layer,
filesystem, and GC behavior, as well as latency-induced priority inversion
chains and potentially amplified mmap_lock contention, can significantly hurt
Android user experience.

[1] https://lore.kernel.org/linux-mm/CAJuCfpFVQJtvbj5fV2fmm4APhNZDL1qPg-YExw7gO1pmngC3Rw@mail.gmail.com/

Thanks
Barry

^ permalink raw reply

* Re: [BUG] [powerpc] [next-20260216/17] nfsd: use-after-free in cache_check_rcu() triggered by sosreport on ppc64le
From: Misbah Anjum N @ 2026-04-30 20:45 UTC (permalink / raw)
  To: Linux Kernel, Linux Nfs
  Cc: Linuxppc Dev, chuck.lever, jlayton, venkat88, Linux Next
In-Reply-To: <dcd371d3a95815a84ba7de52cef447b8@linux.ibm.com>

Hi,

Following up on my bug report, I have completed a git bisect and have 
critical new findings to report.
Ref: 
https://lore.kernel.org/linux-next/dcd371d3a95815a84ba7de52cef447b8@linux.ibm.com/T/#u

Current Status: Bug Has Propagated from linux-next to Mainline.
First Bad commit identified: da6b5aae84beb0917ecb0c9fbc71169d145397ff

The use-after-free bug in cache_check_rcu() that I originally reported 
in linux-next (6.19.0-next-20260216/17) has now propagated into mainline 
and is confirmed present in:
- mainline (Tested on Latest kernel as of 2026-04-30 - commit 
08d0d3466664)
- linux-next (Tested on Latest kernel as of 2026-04-30)

This bug is causing failures on ppc64le systems:
1. Kernel panics: 100% reproducible crashes when sosreport runs
2. CI/Testing failures: All automated Avocado-vt KVM testing on ppc64le 
is failing
3. Use-after-free corruption: Memory corruption with corrupted pointers 
containing
    ASCII strings ("libz.so.", "export_cap") or poison patterns 
(0xcccccccccccccccc)

Test Environment:
Architecture: ppc64le (IBM Power11 and IBM Power10)
Hypervisor: phyp (PowerVM)
Distribution: Fedora 42 (Server Edition Prerelease)
Reproducible: 100%

Reproduction Steps:
On ppc64le system with latest kernel:
1. Run: modprobe nfsd
2. Run: sosreport
System crashes (typically within 30-60 seconds)

First bad commit:
commit da6b5aae84beb0917ecb0c9fbc71169d145397ff
Merge: b69e478512080 344bf523d441d
Author: Linus Torvalds <torvalds@linux-foundation.org>
Date:   Mon Apr 20 10:15:32 2026 -0700
     Merge tag 'platform-drivers-x86-v7.1-1' of
     
git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86
     Pull x86 platform driver updates from Ilpo Järvinen:
      "asus-wmi:
        - Retain battery charge threshold during boot which avoids
          unsolicited change to 100%. Return -ENODATA when the limit
          is not yet known
        - Improve screenpad power/brightness handling consistency
        - Fix screenpad brightness range
       barco-p50-gpio:
        - Normalize gpio_get return values
       bitland-mifs-wmi:
        - Add driver for Bitland laptops (supports platform profile,
          hwmon, kbd backlight, gpu mode, hotkeys, and fan boost)
       dell_rbu:
        - Fix using uninitialized value in sysfs write function
       dell-wmi-sysman:
        - Respect destination length when constructing enum strings
       hp-wmi:
        - Propagate fan setting apply failures and log an error
        - Fix sysfs write vs work handler cancel_delayed_work_sync() 
deadlock
        - Correct keepalive schedule_delayed_work() to mod_delayed_work()
        - Fix u8 underflows in GPU delta calculation
        - Use mutex to protect fan pwm/mode
        - Ignore kbd backlight and FnLock key events that are handled by 
FW
        - Fix fan table parsing (use correct field)
        - Add support for Omen 14-fb0xxx, 16-n0xxx, 16-wf1xxx, and
          Omen MAX 16-ak0xxxx
       input: trackpoint & thinkpad_acpi:
        - Enable doubletap by default and add sysfs enable/disable
       int3472:
        - Add support for GPIO type 0x02 (IR flood LED)
       intel-speed-select: (updated to v1.26)
        - Avoid using current base frequency as maximum
        - Fix CPU extended family ID decoding
        - Fix exit code
        - Improve error reporting
       intel/vsec:
        - Refactor to support ACPI-enumerated PMT endpoints.
       pcengines-apuv2:
        - Attach software node to the gpiochip
       uniwill:
        - Refactor hwmon to smaller parts to accomodate HW diversity
        - Support USB-C power/performance priority switch through sysfs
        - Add another XMG Fusion 15 (L19) DMI vendor
        - Enable fine-grained features to device lineup mapping
       wmi:
        - Perform output size check within WMI core to allow simpler WMI
          drivers
       misc:
        - acpi_driver -> platform driver conversions (a large number of
          changes from Rafael J. Wysocki)
        - cleanups / refactoring / improvements"
     * tag 'platform-drivers-x86-v7.1-1' of 
git://git.kernel.org/pub/scm/linux/kernel/git/pdx86/platform-drivers-x86: 
(106 commits)
       platform/x86: hp-wmi: Add support for Omen 16-wf1xxx (8C77)
       platform/x86: hp-wmi: Add support for Omen 16-n0xxx (8A44)
       platform/x86: hp-wmi: Add support for OMEN MAX 16-ak0xxx (8D87)
       platform/x86: hp-wmi: fix fan table parsing
       platform/x86: hp-wmi: add Omen 14-fb0xxx (board 8C58) support
       platform/wmi: Replace .no_notify_data with .min_event_size
       platform/wmi: Extend wmidev_query_block() to reject undersized 
data
       platform/wmi: Extend wmidev_invoke_method() to reject undersized 
data
       platform/wmi: Prepare to reject undersized unmarshalling results
       platform/wmi: Convert drivers to use wmidev_invoke_procedure()
       platform/wmi: Add wmidev_invoke_procedure()
       platform/x86: int3472: Add support for GPIO type 0x02 (IR flood 
LED)
       platform/x86: int3472: Parameterize LED con_id in registration
       platform/x86: int3472: Rename pled to led in LED registration code
       platform/x86: int3472: Use local variable for LED struct access
       platform/x86: thinkpad_acpi: remove obsolete TODO comment
       platform/x86: dell-wmi-sysman: bound enumeration string 
aggregation
       platform/x86: hp-wmi: Ignore backlight and FnLock events
       platform/x86: uniwill-laptop: Fix signedness bug
       platform/x86: dell_rbu: avoid uninit value usage in 
packet_size_write()
       ...
  .../sysfs-driver-uniwill-laptop       |  27 +
  .../laptops/thinkpad-acpi.rst         |  21 +
  .../laptops/uniwill-laptop.rst        |  12 +
  .../wmi/devices/bitland-mifs-wmi.rst  | 207 +++
  .../wmi/driver-development-guide.rst  |  11 +-
  drivers/gpu/drm/xe/xe_debugfs.c       |   2 +-
  drivers/gpu/drm/xe/xe_hwmon.c         |   2 +-
  drivers/gpu/drm/xe/xe_vsec.c          |   7 +-
  drivers/gpu/drm/xe/xe_vsec.h          |   4 +-
  drivers/input/mouse/trackpoint.c      |  46 +
  drivers/input/mouse/trackpoint.h      |   5 +
  .../platform/mellanox/nvsw-sn2201.c   |   1 -
  .../surface/surface_hotplug.c         |   2 +-
  .../surface/surfacepro3_button.c      |  71 +-
  drivers/platform/wmi/core.c           |  89 +-
  drivers/platform/wmi/internal.h       |   3 +-
  drivers/platform/wmi/marshalling.c    |   6 +-
  .../wmi/tests/marshalling_kunit.c     |  24 +-
  drivers/platform/x86/Kconfig          |  18 +
  drivers/platform/x86/Makefile         |   1 +
  drivers/platform/x86/acer-wireless.c  |  48 +-
  drivers/platform/x86/asus-laptop.c    |  44 +-
  drivers/platform/x86/asus-wireless.c  |  55 +-
  drivers/platform/x86/asus-wmi.c       |  77 +-
  drivers/platform/x86/barco-p50-gpio.c |  23 +-
  .../platform/x86/bitland-mifs-wmi.c   | 837 +++++++++++++
  drivers/platform/x86/dell/dell-rbtn.c | 142 ++-
  .../platform/x86/dell/dell-wmi-base.c |   1 +
  .../dell-wmi-sysman/dell-wmi-sysman.h |   4 +-
  .../dell-wmi-sysman/enum-attributes.c |  34 +-
  .../x86/dell/dell-wmi-sysman/sysman.c |  68 +-
  drivers/platform/x86/dell/dell_rbu.c  |   6 +-
  drivers/platform/x86/eeepc-laptop.c   |  45 +-
  drivers/platform/x86/fujitsu-laptop.c | 489 ++++----
  drivers/platform/x86/fujitsu-tablet.c |  30 +-
  drivers/platform/x86/hp/hp-wmi.c      | 125 +-
  .../x86/intel/int3472/discrete.c      |  13 +-
  .../platform/x86/intel/int3472/led.c  |  55 +-
  drivers/platform/x86/intel/pmc/core.c |   4 +-
  .../x86/intel/pmc/ssram_telemetry.c   |   2 +-
  .../platform/x86/intel/pmt/class.c    |   8 +-
  .../platform/x86/intel/pmt/class.h    |   5 +-
  .../x86/intel/pmt/discovery.c         |   4 +-
  .../x86/intel/pmt/telemetry.c         |  13 +-
  .../x86/intel/pmt/telemetry.h         |  12 +-
  drivers/platform/x86/intel/rst.c      |  23 +-
  drivers/platform/x86/intel/sdsi.c     |   5 +-
  .../platform/x86/intel/smartconnect.c |  23 +-
  drivers/platform/x86/intel/vsec.c     | 121 +-
  .../platform/x86/intel/vsec_tpmi.c    |  12 +-
  .../x86/intel/wmi/sbl-fw-update.c     |   7 +-
  .../x86/intel/wmi/thunderbolt.c       |   2 +-
  .../x86/lenovo/ideapad-laptop.c       |   1 +
  .../x86/lenovo/thinkpad_acpi.c        | 193 ++-
  .../platform/x86/lenovo/wmi-camera.c  |   1 +
  .../platform/x86/lenovo/wmi-events.c  |   1 +
  drivers/platform/x86/lenovo/ymc.c     |   1 +
  .../platform/x86/lenovo/yogabook.c    |   2 +-
  drivers/platform/x86/lg-laptop.c      |  51 +-
  drivers/platform/x86/mxm-wmi.c        |  12 -
  .../platform/x86/panasonic-laptop.c   |  79 +-
  .../platform/x86/pcengines-apuv2.c    |   3 +-
  drivers/platform/x86/redmi-wmi.c      |   1 +
  drivers/platform/x86/sony-laptop.c    | 122 +-
  drivers/platform/x86/system76_acpi.c  |  63 +-
  drivers/platform/x86/topstar-laptop.c |  43 +-
  drivers/platform/x86/toshiba_acpi.c   | 182 +--
  .../platform/x86/toshiba_bluetooth.c  |  74 +-
  drivers/platform/x86/toshiba_haps.c   |  57 +-
  .../x86/uniwill/uniwill-acpi.c        | 440 +++++--
  .../x86/uniwill/uniwill-wmi.c         |   1 +
  .../platform/x86/wireless-hotkey.c    |  49 +-
  drivers/platform/x86/wmi-bmof.c       |   2 +-
  drivers/platform/x86/xiaomi-wmi.c     |   1 +
  include/linux/intel_vsec.h            |  39 +-
  .../linux/platform_data/x86/int3472.h |  12 +-
  include/linux/wmi.h                   |  15 +-
  .../intel-speed-select/isst-config.c  |  41 +-
  78 files changed, 3073 insertions(+), 1309 deletions(-)
  create mode 100644 Documentation/wmi/devices/bitland-mifs-wmi.rst
  create mode 100644 drivers/platform/x86/bitland-mifs-wmi.c

Complete Bisect Log:
git bisect start
# good: [eb0d6d97c27c29cd7392c8fd74f46edf7dff7ec2] Merge tag 'bpf-fixes'
git bisect good eb0d6d97c27c29cd7392c8fd74f46edf7dff7ec2
# bad: [d46dd0d88341e45f8e0226fdef5462f5270898fc] Merge tag 
'f2fs-for-7.1-rc1'
git bisect bad d46dd0d88341e45f8e0226fdef5462f5270898fc
# good: [99ef60d119f3b2621067dd5fc1ea4a37360709e4] Merge tag 
'usb-7.1-rc1'
git bisect good 99ef60d119f3b2621067dd5fc1ea4a37360709e4
# good: [b69e478512080f9bb03ed3e812b759bb73e2837b] Merge tag 
'backlight-next-7.1'
git bisect good b69e478512080f9bb03ed3e812b759bb73e2837b
# bad: [a85d6ff99411eb21536a750ad02205e8a97894c6] Merge tag 'scsi-misc'
git bisect bad a85d6ff99411eb21536a750ad02205e8a97894c6
# bad: [ce9e93383ad71da468dafb9944a539808bf91c06] Merge tag 
'sh-for-v7.1-tag1'
git bisect bad ce9e93383ad71da468dafb9944a539808bf91c06
# good: [378500dc1313e2c06a2f675bb00ab5d7880433ba] platform/x86: 
asus-laptop: Register ACPI notify handler directly
git bisect good 378500dc1313e2c06a2f675bb00ab5d7880433ba
# good: [9d317a54e46d3b6420567dc5b63e9d7ff5c064a3] platform/x86: hp-wmi: 
fix fan table parsing
git bisect good 9d317a54e46d3b6420567dc5b63e9d7ff5c064a3
# bad: [b66cb4f156fe47f52065e70eb1b2f12ccd0c2884] Merge tag 
'printk-for-7.1'
git bisect bad b66cb4f156fe47f52065e70eb1b2f12ccd0c2884
# good: [add9d911be9b141706ccf41d17b4043ed1bc12a1] Merge branch 
'rework/prb-fixes' into for-linus
git bisect good add9d911be9b141706ccf41d17b4043ed1bc12a1
# bad: [da6b5aae84beb0917ecb0c9fbc71169d145397ff] Merge tag 
'platform-drivers-x86-v7.1-1'
git bisect bad da6b5aae84beb0917ecb0c9fbc71169d145397ff
# good: [899225257e78585e2e10b0f7ba472b3c212a8d16] platform/x86: hp-wmi: 
Add support for Omen 16-n0xxx (8A44)
git bisect good 899225257e78585e2e10b0f7ba472b3c212a8d16
# good: [344bf523d441d44c75c429ea6cdcfa8f12efde4d] platform/x86: hp-wmi: 
Add support for Omen 16-wf1xxx (8C77)
git bisect good 344bf523d441d44c75c429ea6cdcfa8f12efde4d
# first bad commit: [da6b5aae84beb0917ecb0c9fbc71169d145397ff] Merge tag 
'platform-drivers-x86-v7.1-1'

Crash Log Call Trace:
[ 1721.304746] BUG: Unable to handle kernel data access on read at 
0x50000004e
[ 1721.304751] Faulting instruction address: 0xc008000015b11d9c
[ 1721.304756] Oops: Kernel access of bad area, sig: 11 [#1]
[ 1721.304760] LE PAGE_SIZE=64K MMU=Radix  SMP NR_CPUS=2048 NUMA pSeries
[ 1721.304767] Modules linked in: nft_masq nft_ct nft_reject_ipv4 
nf_reject_ipv4 nft_reject act_csum cls_u32 sch_htb nft_chain_nat nf_nat 
nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables bridge stp llc 
binfmt_misc rpcrdma rdma_cm iw_cm ib_cm kvm_hv ib_core kvm bonding 
rfkill pseries_rng vmx_crypto nfsd auth_rpcgss nfs_acl drm lockd grace 
loop drm_panel_orientation_quirks nfnetlink vsock_loopback 
vmw_vsock_virtio_transport_common vsock zram xfs dm_service_time sd_mod 
ibmvscsi ibmveth scsi_transport_srp ipr btrfs xor libblake2b raid6_pq 
zstd_compress sunrpc dm_mirror dm_region_hash dm_log be2iscsi bnx2i cnic 
uio cxgb4i cxgb4 tls libcxgbi libcxgb qla4xxx iscsi_boot_sysfs iscsi_tcp 
libiscsi_tcp libiscsi scsi_transport_iscsi dm_multipath fuse dm_mod
[ 1721.304844] CPU: 32 UID: 0 PID: 7187 Comm: sosreport Not tainted 
7.0.0-12182-gda6b5aae84be #17 PREEMPTLAZY
[ 1721.304849] Hardware name: IBM,9080-HEX POWER10 (architected) 
0x800200 0xf000006 of:IBM,FW1060.70 (NH1060_166) hv:phyp pSeries
[ 1721.304854] NIP:  c008000015b11d9c LR: c008000015b121a0 CTR: 
c008000015b12138
[ 1721.304858] REGS: c0000010bfef7750 TRAP: 0300   Not tainted  
(7.0.0-12182-gda6b5aae84be)
[ 1721.304862] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR: 
28044402  XER: 00000000
[ 1721.304871] CFAR: c008000015b1219c DAR: 000000050000004e DSISR: 
40000000 IRQMASK: 0
[ 1721.304871] GPR00: c008000015b121a0 c0000010bfef79f0 c008000014737a00 
c00000002091f400
[ 1721.304871] GPR04: 0000000500000026 0000000000000000 0000000000000000 
c0000000a66ce800
[ 1721.304871] GPR08: c00000002091f400 0000000000000000 0000000000400cc0 
0000000000000000
[ 1721.304871] GPR12: c008000015b12138 c000001bfffff300 0000000000000000 
0000000000000000
[ 1721.304871] GPR16: 0000000000000000 0000000000000000 0000000000000000 
0000000000000000
[ 1721.304871] GPR20: 0000000000000000 0000000000000000 c00000101bb29f08 
c00000101bb29ef8
[ 1721.304871] GPR24: 000000007fff0000 0000000000000000 fffffffffffff000 
0000000000000000
[ 1721.304871] GPR28: c00000002091f400 0000000000000000 c00000101bb29ed0 
0000000500000026
[ 1721.304911] NIP [c008000015b11d9c] cache_check_rcu+0x44/0x2c0 
[sunrpc]
[ 1721.304950] LR [c008000015b121a0] c_show+0x68/0x1c0 [sunrpc]
[ 1721.304984] Call Trace:
[ 1721.304986] [c0000010bfef79f0] [c0000010bfef7a30] 0xc0000010bfef7a30 
(unreliable)
[ 1721.304992] [c0000010bfef7aa0] [c008000015b121a0] c_show+0x68/0x1c0 
[sunrpc]
[ 1721.305027] [c0000010bfef7b50] [c0000000007b9b28] 
seq_read_iter+0x1a8/0x680
[ 1721.305034] [c0000010bfef7c20] [c0000000007ba104] 
seq_read+0x104/0x150
[ 1721.305038] [c0000010bfef7cc0] [c000000000863920] 
proc_reg_read+0xf0/0x160
[ 1721.305043] [c0000010bfef7cf0] [c000000000768b00] vfs_read+0xe0/0x3d0
[ 1721.305049] [c0000010bfef7db0] [c000000000769a08] 
ksys_read+0x78/0x140
[ 1721.305054] [c0000010bfef7e00] [c000000000034908] 
system_call_exception+0x128/0x360
[ 1721.305061] [c0000010bfef7e50] [c00000000000d6a0] 
system_call_common+0x160/0x2e4
[ 1721.305066] ---- interrupt: c00 at 0x7fffba6b9fc8
[ 1721.305069] NIP:  00007fffba6b9fc8 LR: 00007fffba6a8438 CTR: 
0000000000000000
[ 1721.305072] REGS: c0000010bfef7e80 TRAP: 0c00   Not tainted  
(7.0.0-12182-gda6b5aae84be)
[ 1721.305075] MSR:  800000000280f033 
<SF,VEC,VSX,EE,PR,FP,ME,IR,DR,RI,LE>  CR: 28044404  XER: 00000000
[ 1721.305085] IRQMASK: 0
[ 1721.305085] GPR00: 0000000000000003 00007fffa77ed9b0 00007fffba847c00 
0000000000000007
[ 1721.305085] GPR04: 00007fff940230f0 0000000000010000 0000000000000000 
0000000000000000
[ 1721.305085] GPR08: 0000000000000000 0000000000000000 0000000000000000 
0000000000000000
[ 1721.305085] GPR12: 0000000000000000 00007fffa77f6880 0000000000000000 
0000000000000000
[ 1721.305085] GPR16: 0000000000000000 0000000000000000 00007fffb87f0828 
00007fffa77edf68
[ 1721.305085] GPR20: 00007fffb87f0830 00007fffbaded480 00007fffb87f0838 
00007fffbae0d480
[ 1721.305085] GPR24: 00007fffbaf8e0f0 00007fff940230f0 0000000000000007 
00007fffac001290
[ 1721.305085] GPR28: 0000000000000000 00007fffa77ef8b0 00007fffa4590b40 
0000000000010000
[ 1721.305120] NIP [00007fffba6b9fc8] 0x7fffba6b9fc8
[ 1721.305122] LR [00007fffba6a8438] 0x7fffba6a8438
[ 1721.305125] ---- interrupt: c00
[ 1721.305127] Code: fba1ffe8 fbe1fff8 fb61ffd8 fbc1fff0 7c9f2378 
7c7c1b78 7cbd2b78 f8010010 f821ff51 e92d0c78 f9210078 39200000 
<e9240028> 71290001 418201cc fb410080
[ 1721.305141] ---[ end trace 0000000000000000 ]---
[ 1721.307464] pstore: backend (nvram) writing error (-1)
[ 1721.307468]
[ 1722.307472] Kernel panic - not syncing: Fatal exception
[ 1722.321570] Rebooting in 10 seconds..

Thanks,
Misbah Anjum N <misanjum@linux.ibm.com>


On 2026-02-19 18:57, Misbah Anjum N wrote:
> Hi,
> 
> I'm reporting a critical use-after-free bug in linux-next NFS server
> code that causes kernel crashes when sosreport reads /proc/fs/nfsd/*
> files. This appears to be a recent regression affecting ppc64le
> systems.
> The bug is 100% reproducible and shows corrupted pointers containing
> ASCII strings (library names, export cache names) instead of valid
> kernel addresses, indicating freed memory has been reallocated.
> 
> Thanks,
> Misbah Anjum N
> 
> Bug Description:
> The kernel crashes with use-after-free in cache_check_rcu() [sunrpc]
> when sosreport reads NFS export information from /proc. The bug is
> highly reproducible and consistently shows corrupted pointers
> containing ASCII strings (library names, export cache names,
> filesystem paths) instead of valid kernel addresses.
> This is a critical regression in linux-next that needs to be fixed
> before reaching mainline.
> 
> System Information:
> Kernel: 6.19.0-next-20260216 and 6.19.0-next-20260217
> Architecture: ppc64le (IBM Power11, 9080-HEX)
> Hardware: IBM,9080-HEX Power11 (architected) 0x820200 0xf000007
> Firmware: IBM,FW1110.11 (NH1110_102)
> Hypervisor: phyp (PowerVM)
> Distribution: Fedora 42 (Server Edition Prerelease)
> Reproducible: 100%
> 
> Reproduction Steps:
> On ppc64le system with kernel 6.19.0-next-20260216/17:
> 1. Run: modprobe nfsd
> 2. Run: sosreport
> System crashes (typically within 30-60 seconds)
> 
> Important notes:
> 1. Direct cat /proc/fs/nfsd/exports does NOT trigger the crash
> 2. The crash is triggered by sosreport's specific access pattern to
> /proc/fs/nfsd/* files
> 3. No NFS exports or active NFS server configuration needed
> 4. Reproducible 100% of the time with sosreport
> 
> Kernel Configuration:
> Relevant NFS configuration options:
> CONFIG_NFSD=m
> CONFIG_NFSD_V3_ACL=y
> CONFIG_NFSD_V4=y
> CONFIG_NFSD_PNFS=y
> CONFIG_NFSD_SCSILAYOUT=y
> CONFIG_NFSD_V4_2_INTER_SSC=y
> CONFIG_NFSD_V4_SECURITY_LABEL=y
> CONFIG_NFS_FS=m
> CONFIG_NFS_V3=m
> CONFIG_NFS_V3_ACL=y
> CONFIG_NFS_V4=m
> CONFIG_NFS_V4_1=y
> CONFIG_NFS_V4_2=y
> CONFIG_NFS_V4_SECURITY_LABEL=y
> CONFIG_NFS_FSCACHE=y
> CONFIG_NFS_DEBUG=y
> CONFIG_NFS_DISABLE_UDP_SUPPORT=y
> CONFIG_NFS_ACL_SUPPORT=m
> CONFIG_NFS_COMMON=y
> CONFIG_SUNRPC=m
> CONFIG_SUNRPC_DEBUG=y
> 
> Detailed Crash Traces:
> Crash #1 - cache_check_rcu() with "export_cap" pointer 
> (6.19.0-next-20260216)
> [ 3162.071511] BUG: Unable to handle kernel data access at 
> 0x657079745f70618b
> [ 3162.071529] Faulting instruction address: 0xc0080000083322bc
> [ 3162.071534] Oops: Kernel access of bad area, sig: 11 [#1]
> [ 3162.071537] LE PAGE_SIZE=64K MMU=Radix  SMP NR_CPUS=2048 NUMA 
> pSeries
> [ 3162.071542] Modules linked in: binfmt_misc vhost_net vhost
> vhost_iotlb tap tun nft_masq nft_ct nft_reject_ipv4 nf_reject_ipv4
> nft_reject act_csum cls_u32 sch_htb nft_chain_nat nf_nat nf_conntrack
> nf_defrag_ipv6 nf_defrag_ipv4 nf_tables bridge stp llc rpcrdma rdma_cm
> iw_cm kvm_hv ib_cm ib_core kvm bonding rfkill nfsd auth_rpcgss nfs_acl
> lockd grace pseries_rng vmx_crypto drm loop
> drm_panel_orientation_quirks nfnetlink vsock_loopback
> vmw_vsock_virtio_transport_common vsock zram xfs dm_service_time
> sd_mod ibmvscsi ibmveth scsi_transport_srp tg3 ipr btrfs xor
> libblake2b raid6_pq zstd_compress sunrpc dm_mirror dm_region_hash
> dm_log be2iscsi bnx2i cnic uio cxgb4i cxgb4 tls libcxgbi libcxgb
> qla4xxx iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi
> scsi_transport_iscsi dm_multipath fuse dm_mod
> [ 3162.071618] CPU: 51 UID: 0 PID: 52936 Comm: sosreport Kdump: loaded
> Not tainted 6.19.0-next-20260216 #1 PREEMPTLAZY
> [ 3162.071623] Hardware name: IBM,9080-HEX Power11 (architected)
> 0x820200 0xf000007 of:IBM,FW1110.11 (NH1110_102) hv:phyp pSeries
> [ 3162.071627] NIP:  c0080000083322bc LR: c0080000115f6b48 CTR: 
> c008000008332278
> [ 3162.071631] REGS: c0000000b353f7c0 TRAP: 0380   Not tainted
> (6.19.0-next-20260216)
> [ 3162.071635] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR:
> 48044402  XER: 00000000
> [ 3162.071643] CFAR: c00800001164e15c IRQMASK: 0
> [ 3162.071643] GPR00: c0080000115f6b48 c0000000b353fa60
> c008000008397600 c00000012a758700
> [ 3162.071643] GPR04: 657079745f706163 0000000000000000
> 0000000000000000 c000000144b4d000
> [ 3162.071643] GPR08: c00000012a758700 0000000000000000
> 0000000000400cc0 c00800001164e148
> [ 3162.071643] GPR12: c008000008332278 c0000027fde49f00
> 0000000000000000 0000000000000000
> [ 3162.071643] GPR16: 0000000000000000 0000000000000000
> 0000000000000000 0000000000000000
> [ 3162.071643] GPR20: 0000000000000000 0000000000000000
> c000000145433788 c000000145433778
> [ 3162.071643] GPR24: 000000007fff0000 0000000000000000
> fffffffffffff000 0000000000000000
> [ 3162.071643] GPR28: c00000012a758700 0000000000000000
> c00000012a758700 657079745f706163
> [ 3162.071682] NIP [c0080000083322bc] cache_check_rcu+0x44/0x2c0 
> [sunrpc]
> [ 3162.071716] LR [c0080000115f6b48] e_show+0x40/0x260 [nfsd]
> [ 3162.071747] Call Trace:
> [ 3162.071749] [c0000000b353fa60] [c0000000b353fb50]
> 0xc0000000b353fb50 (unreliable)
> [ 3162.071754] [c0000000b353fb10] [c0080000115f6b48] e_show+0x40/0x260 
> [nfsd]
> [ 3162.071780] [c0000000b353fb50] [c0000000007a7468] 
> seq_read_iter+0x1a8/0x680
> [ 3162.071787] [c0000000b353fc20] [c0000000007a7a44] 
> seq_read+0x104/0x150
> [ 3162.071791] [c0000000b353fcc0] [c00000000084ecb0] 
> proc_reg_read+0xf0/0x160
> [ 3162.071796] [c0000000b353fcf0] [c000000000756b00] 
> vfs_read+0xe0/0x3d0
> [ 3162.071800] [c0000000b353fdb0] [c000000000757a08] 
> ksys_read+0x78/0x140
> [ 3162.071804] [c0000000b353fe00] [c0000000000348c8]
> system_call_exception+0x128/0x350
> [ 3162.071809] [c0000000b353fe50] [c00000000000d6a0]
> system_call_common+0x160/0x2e4
> [ 3162.071815] ---- interrupt: c00 at 0x7fff7ecb9fc8
> [ 3162.071818] NIP:  00007fff7ecb9fc8 LR: 00007fff7eca8438 CTR: 
> 0000000000000000
> [ 3162.071821] REGS: c0000000b353fe80 TRAP: 0c00   Not tainted
> (6.19.0-next-20260216)
> [ 3162.071824] MSR:  800000000280f033
> <SF,VEC,VSX,EE,PR,FP,ME,IR,DR,RI,LE>  CR: 28044404  XER: 00000000
> [ 3162.071834] IRQMASK: 0
> [ 3162.071834] GPR00: 0000000000000003 00007fff6afdd9d0
> 00007fff7ee47c00 0000000000000005
> [ 3162.071834] GPR04: 00007fff5c0223c0 0000000000010000
> 0000000000000000 0000000000000000
> [ 3162.071834] GPR08: 0000000000000000 0000000000000000
> 0000000000000000 0000000000000000
> [ 3162.071834] GPR12: 0000000000000000 00007fff6afe68a0
> 0000000000000000 0000000000000000
> [ 3162.071834] GPR16: 0000000000000000 0000000000000000
> 00007fff7d800828 00007fff6afddf88
> [ 3162.071834] GPR20: 00007fff7d800830 00007fff7f3ed480
> 00007fff7d800838 00007fff7f40d480
> [ 3162.071834] GPR24: 00007fff7f58e0f0 00007fff5c0223c0
> 0000000000000005 00007fff6c001290
> [ 3162.071834] GPR28: 0000000000000000 00007fff6afdf8d0
> 00007fff79db3140 0000000000010000
> [ 3162.071870] NIP [00007fff7ecb9fc8] 0x7fff7ecb9fc8
> [ 3162.071872] LR [00007fff7eca8438] 0x7fff7eca8438
> [ 3162.071875] ---- interrupt: c00
> [ 3162.071877] Code: fba1ffe8 fbe1fff8 fb61ffd8 fbc1fff0 7c9f2378
> 7c7c1b78 7cbd2b78 f8010010 f821ff51 e92d0c78 f9210078 39200000
> <e9240028> 71290001 418201cc fb410080
> [ 3162.071890] ---[ end trace 0000000000000000 ]---
> 
> Crash #2 - d_path() NULL pointer dereference (6.19.0-next-20260217)
> [ 5489.374563] Kernel attempted to read user page (60) - exploit
> attempt? (uid: 0)
> [ 5489.374582] BUG: Kernel NULL pointer dereference on read at 
> 0x00000060
> [ 5489.374586] Faulting instruction address: 0xc0000000007cb354
> [ 5489.374590] Oops: Kernel access of bad area, sig: 11 [#1]
> [ 5489.374593] LE PAGE_SIZE=64K MMU=Radix  SMP NR_CPUS=2048 NUMA 
> pSeries
> [ 5489.374598] Modules linked in: binfmt_misc vhost_net vhost
> vhost_iotlb tap tun nft_masq nft_ct nft_reject_ipv4 nf_reject_ipv4
> nft_reject act_csum cls_u32 sch_htb nft_chain_nat nf_nat nf_conntrack
> nf_defrag_ipv6 nf_defrag_ipv4 nf_tables bridge stp llc rpcrdma rdma_cm
> iw_cm kvm_hv ib_cm kvm ib_core bonding rfkill nfsd auth_rpcgss nfs_acl
> lockd grace pseries_rng vmx_crypto drm loop
> drm_panel_orientation_quirks nfnetlink vsock_loopback
> vmw_vsock_virtio_transport_common vsock zram xfs dm_service_time
> sd_mod ibmvscsi tg3 ibmveth scsi_transport_srp ipr btrfs xor
> libblake2b raid6_pq zstd_compress sunrpc dm_mirror dm_region_hash
> dm_log be2iscsi bnx2i cnic uio cxgb4i cxgb4 tls libcxgbi libcxgb
> qla4xxx iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi
> scsi_transport_iscsi dm_multipath fuse dm_mod
> [ 5489.374671] CPU: 2 UID: 0 PID: 45718 Comm: sosreport Kdump: loaded
> Not tainted 6.19.0-next-20260217 #1 PREEMPTLAZY
> [ 5489.374676] Hardware name: IBM,9080-HEX Power11 (architected)
> 0x820200 0xf000007 of:IBM,FW1110.11 (NH1110_102) hv:phyp pSeries
> [ 5489.374680] NIP:  c0000000007cb354 LR: c0000000007a7ed0 CTR: 
> c0000000007a7e60
> [ 5489.374683] REGS: c00000026f2676b0 TRAP: 0300   Not tainted
> (6.19.0-next-20260217)
> [ 5489.374688] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR:
> 88044408  XER: 00000000
> [ 5489.374696] CFAR: c0000000007a7ecc DAR: 0000000000000060 DSISR:
> 40000000 IRQMASK: 0
> [ 5489.374696] GPR00: c0000000007a7ed0 c00000026f267950
> c000000001868100 0000000000000000
> [ 5489.374696] GPR04: c0000012e1350002 000000000000fffe
> c00800000ee360f0 c0000012e1350002
> [ 5489.374696] GPR08: 000000000000fffe c000000146400840
> c0000012e1360000 0000000000000000
> [ 5489.374696] GPR12: c0000000007a7e60 c0000027ffffdf00
> 0000000000000000 0000000000000000
> [ 5489.374696] GPR16: 0000000000000000 0000000000000000
> 0000000000000000 0000000000000000
> [ 5489.374696] GPR20: 0000000000000000 0000000000000000
> c0000000bbca06c8 c0000000bbca06b8
> [ 5489.374696] GPR24: 000000007fff0000 0000000000000000
> fffffffffffff000 0000000000000000
> [ 5489.374696] GPR28: c00000026f267c50 c000000140db5800
> c000000146400800 c0000012e1350002
> [ 5489.374736] NIP [c0000000007cb354] d_path+0x44/0x210
> [ 5489.374742] LR [c0000000007a7ed0] seq_path+0x70/0x160
> [ 5489.374747] Call Trace:
> [ 5489.374749] [c00000026f267950] [0000000000000006] 0x6 (unreliable)
> [ 5489.374755] [c00000026f2679b0] [c0000000007a7ed0] 
> seq_path+0x70/0x160
> [ 5489.374759] [c00000026f2679f0] [c00800001144673c]
> svc_export_show+0x1d4/0x5a0 [nfsd]
> [ 5489.374789] [c00000026f267aa0] [c008000004a126fc] c_show+0xa4/0x1c0 
> [sunrpc]
> [ 5489.374819] [c00000026f267b50] [c0000000007a7468] 
> seq_read_iter+0x1a8/0x680
> [ 5489.374824] [c00000026f267c20] [c0000000007a7a44] 
> seq_read+0x104/0x150
> [ 5489.374829] [c00000026f267cc0] [c00000000084ecb0] 
> proc_reg_read+0xf0/0x160
> [ 5489.374833] [c00000026f267cf0] [c000000000756af0] 
> vfs_read+0xe0/0x3d0
> [ 5489.374837] [c00000026f267db0] [c0000000007579f8] 
> ksys_read+0x78/0x140
> [ 5489.374841] [c00000026f267e00] [c0000000000348c8]
> system_call_exception+0x128/0x350
> [ 5489.374846] [c00000026f267e50] [c00000000000d6a0]
> system_call_common+0x160/0x2e4
> [ 5489.374852] ---- interrupt: c00 at 0x7fff866b9fc8
> [ 5489.374855] NIP:  00007fff866b9fc8 LR: 00007fff866a8438 CTR: 
> 0000000000000000
> [ 5489.374858] REGS: c00000026f267e80 TRAP: 0c00   Not tainted
> (6.19.0-next-20260217)
> [ 5489.374861] MSR:  800000000280f033
> <SF,VEC,VSX,EE,PR,FP,ME,IR,DR,RI,LE>  CR: 28044404  XER: 00000000
> [ 5489.374871] IRQMASK: 0
> [ 5489.374871] GPR00: 0000000000000003 00007fff71fbd9d0
> 00007fff86847c00 0000000000000008
> [ 5489.374871] GPR04: 00007fff600228e0 0000000000010000
> 0000000000000000 0000000000000000
> [ 5489.374871] GPR08: 0000000000000000 0000000000000000
> 0000000000000000 0000000000000000
> [ 5489.374871] GPR12: 0000000000000000 00007fff71fc68a0
> 0000000000000000 0000000000000000
> [ 5489.374871] GPR16: 0000000000000000 0000000000000000
> 00007fff847f0828 00007fff71fbdf88
> [ 5489.374871] GPR20: 00007fff847f0830 00007fff86ded480
> 00007fff847f0838 00007fff86e0d480
> [ 5489.374871] GPR24: 00007fff86f8e0f0 00007fff600228e0
> 0000000000000008 00007fff6c0016a0
> [ 5489.374871] GPR28: 0000000000000000 00007fff71fbf8d0
> 00007fff80548c40 0000000000010000
> [ 5489.374906] NIP [00007fff866b9fc8] 0x7fff866b9fc8
> [ 5489.374909] LR [00007fff866a8438] 0x7fff866a8438
> [ 5489.374912] ---- interrupt: c00
> [ 5489.374914] Code: f8010010 f821ffa1 f8410018 e92d0c78 f9210058
> 39200000 91410044 7c691b78 7d442a14 f9410038 e8630008 90a10040
> <e9430060> 2c2a0000 41820064 e98a0048
> [ 5489.374927] ---[ end trace 0000000000000000 ]---
> 
> Crash #3 - cache_check_rcu() with "libz.so." pointer 
> (6.19.0-next-20260217)
> [   63.748591] BUG: Unable to handle kernel data access at 
> 0x2e6f732e7a626994
> [   63.748601] Faulting instruction address: 0xc008000009de22bc
> [   63.748606] Oops: Kernel access of bad area, sig: 11 [#1]
> [   63.748609] LE PAGE_SIZE=64K MMU=Radix  SMP NR_CPUS=2048 NUMA 
> pSeries
> [   63.748614] Modules linked in: nft_masq nft_ct nft_reject_ipv4
> nf_reject_ipv4 nft_reject act_csum cls_u32 sch_htb nft_chain_nat
> nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 nf_tables bridge stp
> llc binfmt_misc rpcrdma rdma_cm iw_cm kvm_hv ib_cm kvm ib_core bonding
> rfkill nfsd auth_rpcgss nfs_acl lockd grace pseries_rng vmx_crypto drm
> loop drm_panel_orientation_quirks nfnetlink vsock_loopback
> vmw_vsock_virtio_transport_common vsock zram xfs dm_service_time
> sd_mod tg3 ibmvscsi ibmveth scsi_transport_srp ipr btrfs xor
> libblake2b raid6_pq zstd_compress sunrpc dm_mirror dm_region_hash
> dm_log be2iscsi bnx2i cnic uio cxgb4i cxgb4 tls libcxgbi libcxgb
> qla4xxx iscsi_boot_sysfs iscsi_tcp libiscsi_tcp libiscsi
> scsi_transport_iscsi dm_multipath fuse dm_mod
> [   63.748680] CPU: 58 UID: 0 PID: 5675 Comm: sosreport Kdump: loaded
> Not tainted 6.19.0-next-20260217 #1 PREEMPTLAZY
> [   63.748686] Hardware name: IBM,9080-HEX Power11 (architected)
> 0x820200 0xf000007 of:IBM,FW1110.11 (NH1110_102) hv:phyp pSeries
> [   63.748690] NIP:  c008000009de22bc LR: c00800000f086b48 CTR: 
> c008000009de2278
> [   63.748693] REGS: c0000000a3a4f7c0 TRAP: 0380   Not tainted
> (6.19.0-next-20260217)
> [   63.748697] MSR:  8000000000009033 <SF,EE,ME,IR,DR,RI,LE>  CR:
> 48044402  XER: 00000000
> [   63.748706] CFAR: c00800000f0de15c IRQMASK: 0
> [   63.748706] GPR00: c00800000f086b48 c0000000a3a4fa60
> c008000006f47600 c0000000b70f9b00
> [   63.748706] GPR04: 2e6f732e7a62696c 0000000000000000
> 0000000000000000 c000000152f70800
> [   63.748706] GPR08: c0000000b70f9b00 0000000000000000
> 0000000000400cc0 c00800000f0de148
> [   63.748706] GPR12: c008000009de2278 c0000027fde40700
> 0000000000000000 0000000000000000
> [   63.748706] GPR16: 0000000000000000 0000000000000000
> 0000000000000000 0000000000000000
> [   63.748706] GPR20: 0000000000000000 0000000000000000
> c0000000e2e17b08 c0000000e2e17af8
> [   63.748706] GPR24: 000000007fff0000 0000000000000000
> fffffffffffff000 0000000000000000
> [   63.748706] GPR28: c0000000b70f9b00 0000000000000000
> c0000000b70f9b00 2e6f732e7a62696c
> [   63.748744] NIP [c008000009de22bc] cache_check_rcu+0x44/0x2c0 
> [sunrpc]
> [   63.748776] LR [c00800000f086b48] e_show+0x40/0x260 [nfsd]
> [   63.748805] Call Trace:
> [   63.748807] [c0000000a3a4fa60] [c0000000a3a4fb50]
> 0xc0000000a3a4fb50 (unreliable)
> [   63.748812] [c0000000a3a4fb10] [c00800000f086b48] e_show+0x40/0x260 
> [nfsd]
> [   63.748839] [c0000000a3a4fb50] [c0000000007a7468] 
> seq_read_iter+0x1a8/0x680
> [   63.748845] [c0000000a3a4fc20] [c0000000007a7a44] 
> seq_read+0x104/0x150
> [   63.748850] [c0000000a3a4fcc0] [c00000000084ecb0] 
> proc_reg_read+0xf0/0x160
> [   63.748855] [c0000000a3a4fcf0] [c000000000756af0] 
> vfs_read+0xe0/0x3d0
> [   63.748859] [c0000000a3a4fdb0] [c0000000007579f8] 
> ksys_read+0x78/0x140
> [   63.748862] [c0000000a3a4fe00] [c0000000000348c8]
> system_call_exception+0x128/0x350
> [   63.748868] [c0000000a3a4fe50] [c00000000000d6a0]
> system_call_common+0x160/0x2e4
> [   63.748873] ---- interrupt: c00 at 0x7fffa74b9fc8
> [   63.748876] NIP:  00007fffa74b9fc8 LR: 00007fffa74a8438 CTR: 
> 0000000000000000
> [   63.748879] REGS: c0000000a3a4fe80 TRAP: 0c00   Not tainted
> (6.19.0-next-20260217)
> [   63.748882] MSR:  800000000280f033
> <SF,VEC,VSX,EE,PR,FP,ME,IR,DR,RI,LE>  CR: 28044404  XER: 00000000
> [   63.748892] IRQMASK: 0
> [   63.748892] GPR00: 0000000000000003 00007fff8b7ed9d0
> 00007fffa7647c00 0000000000000008
> [   63.748892] GPR04: 00007fff7c021af0 0000000000010000
> 0000000000000000 0000000000000000
> [   63.748892] GPR08: 0000000000000000 0000000000000000
> 0000000000000000 0000000000000000
> [   63.748892] GPR12: 0000000000000000 00007fff8b7f68a0
> 0000000000000000 0000000000000000
> [   63.748892] GPR16: 0000000000000000 0000000000000000
> 00007fffa55f0828 00007fff8b7edf88
> [   63.748892] GPR20: 00007fffa55f0830 00007fffa7bed480
> 00007fffa55f0838 00007fffa7c0d480
> [   63.748892] GPR24: 00007fffa7d8e0f0 00007fff7c021af0
> 0000000000000008 00007fff94001290
> [   63.748892] GPR28: 0000000000000000 00007fff8b7ef8d0
> 00007fffa062be00 0000000000010000
> [   63.748927] NIP [00007fffa74b9fc8] 0x7fffa74b9fc8
> [   63.748930] LR [00007fffa74a8438] 0x7fffa74a8438
> [   63.748933] ---- interrupt: c00
> [   63.748935] Code: fba1ffe8 fbe1fff8 fb61ffd8 fbc1fff0 7c9f2378
> 7c7c1b78 7cbd2b78 f8010010 f821ff51 e92d0c78 f9210078 39200000
> <e9240028> 71290001 418201cc fb410080
> [   63.748948] ---[ end trace 0000000000000000 ]---
> 
> Next Steps:
> I have vmcore dumps from multiple crashes and am working on:
> 1. Crash utility analysis to examine the corrupted cache structures
> 2. Git bisect to identify the problematic commit


^ permalink raw reply

* Re: [PATCH 0/5] ibmvfc: make ibmvfc support FPIN messages
From: Martin K. Petersen @ 2026-04-30 16:25 UTC (permalink / raw)
  To: Dave Marquardt via B4 Relay
  Cc: James E.J. Bottomley, Martin K. Petersen, Madhavan Srinivasan,
	Michael Ellerman, Nicholas Piggin, Christophe Leroy (CS GROUP),
	Tyrel Datwyler, davemarq, linux-kernel, linux-scsi, linuxppc-dev,
	Brian King, Greg Joyce, Kyle Mahlkuch
In-Reply-To: <20260408-ibmvfc-fpin-support-v1-0-52b06c464e03@linux.ibm.com>


Dave,

> This patch series adds FPIN (fabric performance impact notification)
> support to the ibmvfc (IBM Virtual Fibre Channel) driver. This comes
> in three flavors:

https://sashiko.dev/#/patchset/20260408-ibmvfc-fpin-support-v1-0-52b06c464e03%40linux.ibm.com

-- 
Martin K. Petersen


^ permalink raw reply

* Re: [mainline][BUG] Observed Workqueue lockups on offline CPUs.
From: Samir M @ 2026-04-30 15:36 UTC (permalink / raw)
  To: paulmck
  Cc: Shrikanth Hegde, Tejun Heo, Boqun Feng, LKML, RCU, linuxppc-dev,
	Boqun Feng
In-Reply-To: <16676927-5a4a-4007-9ece-593a4699adb0@paulmck-laptop>


On 30/04/26 8:30 pm, Paul E. McKenney wrote:
> On Thu, Apr 30, 2026 at 11:49:02AM +0530, Samir M wrote:
>> On 29/04/26 11:21 pm, Shrikanth Hegde wrote:
>>> Hi Samir.
>>>
>>> On 4/29/26 3:46 PM, Samir M wrote:
>>>> Hi Boqun,
>>>>
>>>> Thank you for pointing me to the existing patches. I have tested
>>>> both Paul's patch [1] and TJ's workqueue patch [2] on my PowerPC
>>>> system (80 CPUs), and can confirm that the workqueue lockup issue is
>>>> not observed.
>>>>
>>> Can you try only paul's patch and confirm if the issue is fixed?
>>>
>> Hi Shrikanth,
>>
>> I have verified Paul’s patch alone, and it resolves the issue: no workqueue
>> lockups are observed.
> Thank you, Samir!  May I add your Tested-by?
>
> 							Thanx, Paul

Hi Paul,

Yes, you may add:
Tested-by: Samir <samir@linux.ibm.com>

Thanks,
Samir
>> Regards,
>> Samir.
>>
>>
>>>> Test Environment:
>>>> - System: PowerPC with 80 CPUs ( e.g. PowerPC LPARs with 80 online
>>>> and 384 possible CPUs)
>>>> - Kernel version: Latest upstream (7.1-rc1)
>>>>
>>>> Regression Testing Results:
>>>> All tests completed successfully with no issues observed:
>>>> - Hackbench
>>>> - Kernel selftests
>>>> - LTP scheduler tests
>>>>
>>>> The workqueue lockup that was previously occurring is no longer
>>>> present with the patches applied.
>>>>
>>>> References:
>>>> [1]: https://lore.kernel.org/rcu/ed1fa6cd-7343-4ca3-8b9d-
>>>> d699ca496f83@paulmck-laptop/
>>>> [2]: https://lore.kernel.org/rcu/adlHKowvhn8AGXCc@slm.duckdns.org/
>>>>
>>>> Best regards,
>>>> Samir


^ permalink raw reply

* Re: [mainline][BUG] Observed Workqueue lockups on offline CPUs.
From: Paul E. McKenney @ 2026-04-30 15:00 UTC (permalink / raw)
  To: Samir M
  Cc: Shrikanth Hegde, Tejun Heo, Boqun Feng, LKML, RCU, linuxppc-dev,
	Boqun Feng
In-Reply-To: <6c51cebc-3b98-439d-adcd-a0f382b98f40@linux.ibm.com>

On Thu, Apr 30, 2026 at 11:49:02AM +0530, Samir M wrote:
> 
> On 29/04/26 11:21 pm, Shrikanth Hegde wrote:
> > Hi Samir.
> > 
> > On 4/29/26 3:46 PM, Samir M wrote:
> > > 
> > > Hi Boqun,
> > > 
> > > Thank you for pointing me to the existing patches. I have tested
> > > both Paul's patch [1] and TJ's workqueue patch [2] on my PowerPC
> > > system (80 CPUs), and can confirm that the workqueue lockup issue is
> > > not observed.
> > > 
> > 
> > Can you try only paul's patch and confirm if the issue is fixed?
> > 
> 
> Hi Shrikanth,
> 
> I have verified Paul’s patch alone, and it resolves the issue: no workqueue
> lockups are observed.

Thank you, Samir!  May I add your Tested-by?

							Thanx, Paul

> Regards,
> Samir.
> 
> 
> > > Test Environment:
> > > - System: PowerPC with 80 CPUs ( e.g. PowerPC LPARs with 80 online
> > > and 384 possible CPUs)
> > > - Kernel version: Latest upstream (7.1-rc1)
> > > 
> > > Regression Testing Results:
> > > All tests completed successfully with no issues observed:
> > > - Hackbench
> > > - Kernel selftests
> > > - LTP scheduler tests
> > > 
> > > The workqueue lockup that was previously occurring is no longer
> > > present with the patches applied.
> > > 
> > > References:
> > > [1]: https://lore.kernel.org/rcu/ed1fa6cd-7343-4ca3-8b9d-
> > > d699ca496f83@paulmck-laptop/
> > > [2]: https://lore.kernel.org/rcu/adlHKowvhn8AGXCc@slm.duckdns.org/
> > > 
> > > Best regards,
> > > Samir
> > 


^ permalink raw reply

* Re: [PATCH v2 0/5] mm: reduce mmap_lock contention and improve page fault performance
From: Matthew Wilcox @ 2026-04-30 12:37 UTC (permalink / raw)
  To: Barry Song (Xiaomi)
  Cc: akpm, linux-mm, david, ljs, liam, vbabka, rppt, surenb, mhocko,
	jack, pfalcato, wanglian, chentao, lianux.mm, kunwu.chan,
	liyangouwen1, chrisl, kasong, shikemeng, nphamcs, bhe,
	youngjun.park, linux-arm-kernel, linux-kernel, loongarch,
	linuxppc-dev, linux-riscv, linux-s390
In-Reply-To: <20260430040427.4672-1-baohua@kernel.org>

On Thu, Apr 30, 2026 at 12:04:22PM +0800, Barry Song (Xiaomi) wrote:
> (1) If we need to wait for I/O completion, we still drop the per-VMA lock, as
> current page fault handling already does. Holding it for too long may introduce
> various priority inversion issues on mobile devices. After I/O completes, we
> retry the page fault with the per-VMA lock, rather than falling back to
> mmap_lock.

You're going to have to do better than that.  You know I hate the
additional complexity you're adding.  You need to explain why my idea of
ripping out all the complexity now that we have per-VMA locks doesn't
work.


^ permalink raw reply

* Re: [PATCH v2 4/5] mm: Don't retry page fault if folio is uptodate during swap-in
From: Matthew Wilcox @ 2026-04-30 12:35 UTC (permalink / raw)
  To: Barry Song (Xiaomi)
  Cc: akpm, linux-mm, david, ljs, liam, vbabka, rppt, surenb, mhocko,
	jack, pfalcato, wanglian, chentao, lianux.mm, kunwu.chan,
	liyangouwen1, chrisl, kasong, shikemeng, nphamcs, bhe,
	youngjun.park, linux-arm-kernel, linux-kernel, loongarch,
	linuxppc-dev, linux-riscv, linux-s390
In-Reply-To: <20260430040427.4672-5-baohua@kernel.org>

On Thu, Apr 30, 2026 at 12:04:26PM +0800, Barry Song (Xiaomi) wrote:
> If we are waiting for long I/O to complete, it makes sense to
> avoid holding locks for too long. However, if the folio is
> uptodate, we are likely only waiting for a concurrent PTE
> update to finish. Retrying the entire page fault seems
> excessive.

I think the idea is good, but the implementation is misplaced.
The check for folio_uptodate() should be inside folio_lock_or_retry()
rather than tampering with FAULT_FLAG_ALLOW_RETRY in its caller.

Similarly for your next patch.

> Signed-off-by: Barry Song (Xiaomi) <baohua@kernel.org>
> ---
>  mm/memory.c | 7 +++++++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/mm/memory.c b/mm/memory.c
> index 0c740ca363cc..a2e4f2d87ec8 100644
> --- a/mm/memory.c
> +++ b/mm/memory.c
> @@ -4949,6 +4949,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
>  	}
>  
>  	swapcache = folio;
> +	/*
> +	 * If the folio is uptodate, we are likely only waiting for
> +	 * another concurrent PTE mapping to complete, which should
> +	 * be brief. No need to drop the lock and retry the fault.
> +	 */
> +	if (folio_test_uptodate(folio))
> +		vmf->flags &= ~FAULT_FLAG_ALLOW_RETRY;
>  	ret |= folio_lock_or_retry(folio, vmf);
>  	if (ret & VM_FAULT_RETRY) {
>  		if (fault_flag_allow_retry_first(vmf->flags) &&
> -- 
> 2.39.3 (Apple Git-146)
> 
> 


^ permalink raw reply

* Re: [PATCH 2/2] powerpc/text-patching: Fix possible stringop-overread compilation error
From: Christophe Leroy (CS GROUP) @ 2026-04-30  9:41 UTC (permalink / raw)
  To: Xie Yuanbin, andriy.shevchenko, maddy, mpe, npiggin, kees, andy
  Cc: linuxppc-dev, linux-kernel, linux-hardening, lilinjie8, liaohua4
In-Reply-To: <20260430072913.62348-1-xieyuanbin1@huawei.com>



Le 30/04/2026 à 09:28, Xie Yuanbin a écrit :
> Hi, Christophe Leroy!
> 
> On Mon, 9 Feb 2026 14:41:51 +0100, Christophe Leroy (CS GROUP) wrote:
>> On 09/02/2026 14:25, Xie Yuanbin wrote:
>>> When CONFIG_PPC64_ELF_ABI_V1=y, it seems that the attempt to look up
>>> the original non-dot symbol is missing.
>>>
>>> What about this (Only the compilation test is performed):
>>> ```c
>>> static inline unsigned long __ppc_kallsyms_lookup_name(const char *name)
>>> {
>>> 	unsigned long addr = kallsyms_lookup_name(name);
>>>
>>> 	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && addr)
>>> 		addr = ppc_function_entry((void *)addr);
>>>
>>> 	return addr;
>>> }
>>>
>>> #define ppc_kallsyms_lookup_name(x) ({					\
>>> 		unsigned long addr = 0;					\
>>> 		if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1))		\
>>> 			addr = __ppc_kallsyms_lookup_name("." x);	\
>>> 		if (!addr)						\
>>> 			addr = __ppc_kallsyms_lookup_name(x);		\
>>> 		addr;							\
>>> 	})
>>> ```
>>
>> Good point.
>>
>> To avoid duplicating the string I'd suggest:
>>
>> static inline unsigned long __ppc_kallsyms_lookup_name(const char *name)
>> {
>> 	unsigned long addr = kallsyms_lookup_name(name);
>>
>> 	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1) && !addr)
>> 		addr = kallsyms_lookup_name(name + 1);
>> 	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && addr)
>> 		addr = ppc_function_entry((void *)addr);
>>
>> 	return addr;
>> }
>>
>> #ifdef CONFIG_PPC64_ELF_ABI_V1
>> #define ppc_kallsyms_lookup_name(x)	__ppc_kallsyms_lookup_name("." x);
>> #else
>> #define ppc_kallsyms_lookup_name(x)	__ppc_kallsyms_lookup_name(x)
>> #endif
> 
> I would like to submit the modification as a new patch, and include you
> as Co-developer. Do you agree?

Add me as Suggested-by: instead.

Thanks
Christophe

> 
> Full of the patch:
> ```patch
>  From fbf07e5c1a97da7c8572435537f2b92213ede39d Mon Sep 17 00:00:00 2001
> From: Xie Yuanbin <xieyuanbin1@huawei.com>
> Date: Thu, 30 Apr 2026 14:15:26 +0800
> Subject: [PATCH V2] powerpc/text-patching: simplify the implementation of ppc_kallsyms_lookup_name()
> 
> ppc_kallsyms_lookup_name() is called only twice in the kernel code, and
> the parameters are all constant strings. strnlen(name, KSYM_NAME_LEN) is
> called inside ppc_kallsyms_lookup_name(), when the compiler detects that
> KSYM_NAME_LEN is larger than the constant strings,
> the following error will be triggered:
> ```log
>    CC      arch/powerpc/kernel/optprobes.o
> In file included from ./arch/powerpc/include/asm/kprobes.h:24,
>                   from ./include/linux/kprobes.h:31,
>                   from arch/powerpc/kernel/optprobes.c:8:
> In function ‘ppc_kallsyms_lookup_name’,
>      inlined from ‘arch_prepare_optimized_kprobe’ at arch/powerpc/kernel/optprobes.c:209:21:
> ./arch/powerpc/include/asm/text-patching.h:232:13: error: ‘strnlen’ specified bound 512 exceeds source size 19 [-Werror=stringop-overread]
>    232 |         if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
>        |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
> In function ‘ppc_kallsyms_lookup_name’,
>      inlined from ‘arch_prepare_optimized_kprobe’ at arch/powerpc/kernel/optprobes.c:210:22:
> ./arch/powerpc/include/asm/text-patching.h:232:13: error: ‘strnlen’ specified bound 512 exceeds source size 13 [-Werror=stringop-overread]
>    232 |         if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
>        |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
> cc1: all warnings being treated as errors
> ```
> 
> The error can be reproduced in the following ways:
> Use latest linux-next source, change ppc_kallsyms_lookup_name() to
> __always_inline, use default ppc64_defconfig, set CONFIG_EXPERT=y,
> CONFIG_PPC64_BIG_ENDIAN_ELF_ABI_V2=n, CONFIG_CC_OPTIMIZE_FOR_SIZE=y,
> and use gcc-14 or a later version for compilation.
> 
> Since ppc_kallsyms_lookup_name() is called only twice in the kernel,
> and the parameters are all constant strings, simplify the implementation
> of ppc_kallsyms_lookup_name() and avoid calling strnlen().
> 
> Cc: Andy Shevchenko <andriy.shevchenko@intel.com>
> Cc: Kees Cook <kees@kernel.org>
> Co-developed-by: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
> Signed-off-by: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
> Signed-off-by: Xie Yuanbin <xieyuanbin1@huawei.com>
> ---
> v1->v2: https://eur01.safelinks.protection.outlook.com/?url=https%3A%2F%2Flore.kernel.org%2F20260205100517.292858-2-xieyuanbin1%40huawei.com&data=05%7C02%7Cchristophe.leroy%40csgroup.eu%7C525ecaba66ea4e3b15f208dea68a3fa1%7C8b87af7d86474dc78df45f69a2011bb5%7C0%7C0%7C639131309913493421%7CUnknown%7CTWFpbGZsb3d8eyJFbXB0eU1hcGkiOnRydWUsIlYiOiIwLjAuMDAwMCIsIlAiOiJXaW4zMiIsIkFOIjoiTWFpbCIsIldUIjoyfQ%3D%3D%7C0%7C%7C%7C&sdata=aaK9PO2mGdxM3vuJBVP0lcl6VKfqs3VKih5VPcG0IxI%3D&reserved=0
>    - Not use strlen()
> 
>   arch/powerpc/include/asm/text-patching.h | 42 ++++++++----------------
>   1 file changed, 17 insertions(+), 33 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/text-patching.h b/arch/powerpc/include/asm/text-patching.h
> index e7f14720f630..2d3f698cb4f1 100644
> --- a/arch/powerpc/include/asm/text-patching.h
> +++ b/arch/powerpc/include/asm/text-patching.h
> @@ -221,39 +221,23 @@ static inline unsigned long ppc_global_function_entry(void *func)
>    * - For ABIv1, we lookup the dot variant.
>    * - For ABIv2, we return the local entry point.
>    */
> -static inline unsigned long ppc_kallsyms_lookup_name(const char *name)
> -{
> -	unsigned long addr;
> -#ifdef CONFIG_PPC64_ELF_ABI_V1
> -	/* check for dot variant */
> -	char dot_name[1 + KSYM_NAME_LEN];
> -	bool dot_appended = false;
> -
> -	if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
> -		return 0;
> -
> -	if (name[0] != '.') {
> -		dot_name[0] = '.';
> -		dot_name[1] = '\0';
> -		strlcat(dot_name, name, sizeof(dot_name));
> -		dot_appended = true;
> -	} else {
> -		dot_name[0] = '\0';
> -		strlcat(dot_name, name, sizeof(dot_name));
> -	}
> -	addr = kallsyms_lookup_name(dot_name);
> -	if (!addr && dot_appended)
> -		/* Let's try the original non-dot symbol lookup	*/
> -		addr = kallsyms_lookup_name(name);
> -#elif defined(CONFIG_PPC64_ELF_ABI_V2)
> -	addr = kallsyms_lookup_name(name);
> -	if (addr)
> -		addr = ppc_function_entry((void *)addr);
> -#else
> -	addr = kallsyms_lookup_name(name);
> -#endif
> -	return addr;
> -}
> +static inline unsigned long __ppc_kallsyms_lookup_name(const char *name)
> +{
> +	unsigned long addr = kallsyms_lookup_name(name);
> +
> +	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1) && !addr)
> +		addr = kallsyms_lookup_name(name + 1);
> +	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && addr)
> +		addr = ppc_function_entry((void *)addr);
> +
> +	return addr;
> +}
> +
> +#ifdef CONFIG_PPC64_ELF_ABI_V1
> +#define ppc_kallsyms_lookup_name(x)	__ppc_kallsyms_lookup_name("." x)
> +#else
> +#define ppc_kallsyms_lookup_name(x)	__ppc_kallsyms_lookup_name(x)
> +#endif
>   
>   /*
>    * Some instruction encodings commonly used in dynamic ftracing



^ permalink raw reply

* Re: [PATCH 2/2] powerpc/text-patching: Fix possible stringop-overread compilation error
From: Xie Yuanbin @ 2026-04-30  7:28 UTC (permalink / raw)
  To: chleroy, andriy.shevchenko, maddy, mpe, npiggin, kees, andy
  Cc: linuxppc-dev, linux-kernel, linux-hardening, lilinjie8, liaohua4,
	xieyuanbin1
In-Reply-To: <5cfae419-427a-471b-8bbe-645f56442e2c@kernel.org>

Hi, Christophe Leroy!

On Mon, 9 Feb 2026 14:41:51 +0100, Christophe Leroy (CS GROUP) wrote:
> On 09/02/2026 14:25, Xie Yuanbin wrote:
>> When CONFIG_PPC64_ELF_ABI_V1=y, it seems that the attempt to look up
>> the original non-dot symbol is missing.
>>
>> What about this (Only the compilation test is performed):
>> ```c
>> static inline unsigned long __ppc_kallsyms_lookup_name(const char *name)
>> {
>> 	unsigned long addr = kallsyms_lookup_name(name);
>>
>> 	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && addr)
>> 		addr = ppc_function_entry((void *)addr);
>>
>> 	return addr;
>> }
>>
>> #define ppc_kallsyms_lookup_name(x) ({					\
>> 		unsigned long addr = 0;					\
>> 		if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1))		\
>> 			addr = __ppc_kallsyms_lookup_name("." x);	\
>> 		if (!addr)						\
>> 			addr = __ppc_kallsyms_lookup_name(x);		\
>> 		addr;							\
>> 	})
>> ```
>
> Good point.
>
> To avoid duplicating the string I'd suggest:
>
> static inline unsigned long __ppc_kallsyms_lookup_name(const char *name)
> {
> 	unsigned long addr = kallsyms_lookup_name(name);
>
> 	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1) && !addr)
> 		addr = kallsyms_lookup_name(name + 1);
> 	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && addr)
> 		addr = ppc_function_entry((void *)addr);
>
> 	return addr;
> }
>
> #ifdef CONFIG_PPC64_ELF_ABI_V1
> #define ppc_kallsyms_lookup_name(x)	__ppc_kallsyms_lookup_name("." x);
> #else
> #define ppc_kallsyms_lookup_name(x)	__ppc_kallsyms_lookup_name(x)
> #endif

I would like to submit the modification as a new patch, and include you
as Co-developer. Do you agree?

Full of the patch:
```patch
From fbf07e5c1a97da7c8572435537f2b92213ede39d Mon Sep 17 00:00:00 2001
From: Xie Yuanbin <xieyuanbin1@huawei.com>
Date: Thu, 30 Apr 2026 14:15:26 +0800
Subject: [PATCH V2] powerpc/text-patching: simplify the implementation of ppc_kallsyms_lookup_name()

ppc_kallsyms_lookup_name() is called only twice in the kernel code, and
the parameters are all constant strings. strnlen(name, KSYM_NAME_LEN) is
called inside ppc_kallsyms_lookup_name(), when the compiler detects that
KSYM_NAME_LEN is larger than the constant strings,
the following error will be triggered:
```log
  CC      arch/powerpc/kernel/optprobes.o
In file included from ./arch/powerpc/include/asm/kprobes.h:24,
                 from ./include/linux/kprobes.h:31,
                 from arch/powerpc/kernel/optprobes.c:8:
In function ‘ppc_kallsyms_lookup_name’,
    inlined from ‘arch_prepare_optimized_kprobe’ at arch/powerpc/kernel/optprobes.c:209:21:
./arch/powerpc/include/asm/text-patching.h:232:13: error: ‘strnlen’ specified bound 512 exceeds source size 19 [-Werror=stringop-overread]
  232 |         if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
In function ‘ppc_kallsyms_lookup_name’,
    inlined from ‘arch_prepare_optimized_kprobe’ at arch/powerpc/kernel/optprobes.c:210:22:
./arch/powerpc/include/asm/text-patching.h:232:13: error: ‘strnlen’ specified bound 512 exceeds source size 13 [-Werror=stringop-overread]
  232 |         if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
      |             ^~~~~~~~~~~~~~~~~~~~~~~~~~~~
cc1: all warnings being treated as errors
```

The error can be reproduced in the following ways:
Use the latest linux-next source, change ppc_kallsyms_lookup_name() to
__always_inline, use default ppc64_defconfig, set CONFIG_EXPERT=y,
CONFIG_PPC64_BIG_ENDIAN_ELF_ABI_V2=n, CONFIG_CC_OPTIMIZE_FOR_SIZE=y,
and use gcc-14 or a later version for compilation.

Since ppc_kallsyms_lookup_name() is called only twice in the kernel,
and the parameters are all constant strings, simplify the implementation
of ppc_kallsyms_lookup_name() and avoid calling strnlen().

Cc: Andy Shevchenko <andriy.shevchenko@intel.com>
Cc: Kees Cook <kees@kernel.org>
Co-developed-by: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
Signed-off-by: Christophe Leroy (CS GROUP) <chleroy@kernel.org>
Signed-off-by: Xie Yuanbin <xieyuanbin1@huawei.com>
---
v1->v2: https://lore.kernel.org/20260205100517.292858-2-xieyuanbin1@huawei.com
  - Not use strlen()

 arch/powerpc/include/asm/text-patching.h | 42 ++++++++----------------
 1 file changed, 17 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/include/asm/text-patching.h b/arch/powerpc/include/asm/text-patching.h
index e7f14720f630..2d3f698cb4f1 100644
--- a/arch/powerpc/include/asm/text-patching.h
+++ b/arch/powerpc/include/asm/text-patching.h
@@ -221,39 +221,23 @@ static inline unsigned long ppc_global_function_entry(void *func)
  * - For ABIv1, we lookup the dot variant.
  * - For ABIv2, we return the local entry point.
  */
-static inline unsigned long ppc_kallsyms_lookup_name(const char *name)
-{
-	unsigned long addr;
-#ifdef CONFIG_PPC64_ELF_ABI_V1
-	/* check for dot variant */
-	char dot_name[1 + KSYM_NAME_LEN];
-	bool dot_appended = false;
-
-	if (strnlen(name, KSYM_NAME_LEN) >= KSYM_NAME_LEN)
-		return 0;
-
-	if (name[0] != '.') {
-		dot_name[0] = '.';
-		dot_name[1] = '\0';
-		strlcat(dot_name, name, sizeof(dot_name));
-		dot_appended = true;
-	} else {
-		dot_name[0] = '\0';
-		strlcat(dot_name, name, sizeof(dot_name));
-	}
-	addr = kallsyms_lookup_name(dot_name);
-	if (!addr && dot_appended)
-		/* Let's try the original non-dot symbol lookup	*/
-		addr = kallsyms_lookup_name(name);
-#elif defined(CONFIG_PPC64_ELF_ABI_V2)
-	addr = kallsyms_lookup_name(name);
-	if (addr)
-		addr = ppc_function_entry((void *)addr);
-#else
-	addr = kallsyms_lookup_name(name);
-#endif
-	return addr;
-}
+static inline unsigned long __ppc_kallsyms_lookup_name(const char *name)
+{
+	unsigned long addr = kallsyms_lookup_name(name);
+
+	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V1) && !addr)
+		addr = kallsyms_lookup_name(name + 1);
+	if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2) && addr)
+		addr = ppc_function_entry((void *)addr);
+
+	return addr;
+}
+
+#ifdef CONFIG_PPC64_ELF_ABI_V1
+#define ppc_kallsyms_lookup_name(x)	__ppc_kallsyms_lookup_name("." x)
+#else
+#define ppc_kallsyms_lookup_name(x)	__ppc_kallsyms_lookup_name(x)
+#endif
 
 /*
  * Some instruction encodings commonly used in dynamic ftracing
-- 
2.53.0
```


^ permalink raw reply related

* Re: [mainline][BUG] Observed Workqueue lockups on offline CPUs.
From: Samir M @ 2026-04-30  6:19 UTC (permalink / raw)
  To: Shrikanth Hegde, Paul E . McKenney, Tejun Heo
  Cc: Boqun Feng, LKML, RCU, linuxppc-dev, Boqun Feng
In-Reply-To: <5ab8d72e-1a93-4d50-b097-0dde35d81f25@linux.ibm.com>


On 29/04/26 11:21 pm, Shrikanth Hegde wrote:
> Hi Samir.
>
> On 4/29/26 3:46 PM, Samir M wrote:
>>
>> Hi Boqun,
>>
>> Thank you for pointing me to the existing patches. I have tested both 
>> Paul's patch [1] and TJ's workqueue patch [2] on my PowerPC system 
>> (80 CPUs), and can confirm that the workqueue lockup issue is not 
>> observed.
>>
>
> Can you try only paul's patch and confirm if the issue is fixed?
>

Hi Shrikanth,

I have verified Paul’s patch alone, and it resolves the issue: no
workqueue lockups are observed.

Regards,
Samir.


>> Test Environment:
>> - System: PowerPC with 80 CPUs ( e.g. PowerPC LPARs with 80 online 
>> and 384 possible CPUs)
>> - Kernel version: Latest upstream (7.1-rc1)
>>
>> Regression Testing Results:
>> All tests completed successfully with no issues observed:
>> - Hackbench
>> - Kernel selftests
>> - LTP scheduler tests
>>
>> The workqueue lockup that was previously occurring is no longer 
>> present with the patches applied.
>>
>> References:
>> [1]: https://lore.kernel.org/rcu/ed1fa6cd-7343-4ca3-8b9d-d699ca496f83@paulmck-laptop/
>> [2]: https://lore.kernel.org/rcu/adlHKowvhn8AGXCc@slm.duckdns.org/
>>
>> Best regards,
>> Samir
>


^ permalink raw reply

* [PATCH 6/6] KVM: PPC: Document KVM_PPC_GET_COMPAT_CAPS ioctl
From: Amit Machhiwal @ 2026-04-30  5:49 UTC (permalink / raw)
  To: linuxppc-dev, Madhavan Srinivasan
  Cc: Amit Machhiwal, Vaibhav Jain, Paolo Bonzini, Jonathan Corbet,
	Shuah Khan, kvm, linux-kernel, linux-doc
In-Reply-To: <20260430054906.94431-1-amachhiw@linux.ibm.com>

Add documentation for the KVM_PPC_GET_COMPAT_CAPS ioctl to the KVM API
documentation.

The ioctl exposes host processor compatibility modes supported for
nested KVM guests on PowerPC systems.

Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
---
 Documentation/virt/kvm/api.rst | 35 ++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst
index 52bbbb553ce1..7a10c3c6cbf1 100644
--- a/Documentation/virt/kvm/api.rst
+++ b/Documentation/virt/kvm/api.rst
@@ -6555,6 +6555,41 @@ KVM_S390_KEYOP_SSKE
 
 .. _kvm_run:
 
+4.145 KVM_PPC_GET_COMPAT_CAPS
+-----------------------------
+:Capability: KVM_CAP_PPC_COMPAT_CAPS
+:Architectures: powerpc
+:Type: vm ioctl
+:Parameters: struct kvm_ppc_compat_caps (out)
+:Returns:
+	0 on successful completion,
+	-EFAULT if ``struct kvm_ppc_compat_caps`` cannot be written
+
+IBM POWER system server-based processors provide a compatibility mode feature
+where an Nth generation processor can operate in modes consistent with earlier
+generations such as (N-1) and (N-2).
+
+This ioctl provides userspace with information about the CPU compatibility modes
+supported by the current host processor for booting the nested KVM guests on
+PowerNV (KVM nested APIv1) and PowerVM (KVM nested APIv2) platforms.
+
+::
+
+  struct kvm_ppc_compat_caps {
+         __u32   flags;
+         __u64   compat_capabilities;    /* Capabilities supported by the host */
+  };
+
+The ``compat_capabilities`` bit field describes the processor compatibility
+modes supported by the host. For example, the following bits indicate support
+for specific processor modes.
+
+::
+
+ bit 1: KVM guests can run in Power9 processor mode
+ bit 2: KVM guests can run in Power10 processor mode
+ bit 3: KVM guests can run in Power11 processor mode
+
 5. The kvm_run structure
 ========================
 
-- 
2.50.1



^ permalink raw reply related

* [PATCH 5/6] KVM: PPC: Book3S HV: Add support for compat CPU capabilities for KVM on PowerNV
From: Amit Machhiwal @ 2026-04-30  5:49 UTC (permalink / raw)
  To: linuxppc-dev, Madhavan Srinivasan
  Cc: Amit Machhiwal, Vaibhav Jain, Nicholas Piggin, Michael Ellerman,
	Christophe Leroy (CS GROUP), kvm, linux-kernel
In-Reply-To: <20260430054906.94431-1-amachhiw@linux.ibm.com>

Currently, when booting a compatibility-mode KVM guest (L1) on a PowerNV
hypervisor (L0), the guest runs with the expected processor
compatibility level. However, when booting a nested KVM guest (L2)
inside the L1, QEMU derives the CPU model from the raw host PVR and
attempts to run the nested guest at that level, instead of honoring the
compatibility mode of the L1.

Extend host CPU compatibility capability reporting to support nested
virtualization on PowerNV systems (PAPR nested API v1).

For nested API v2 (PowerVM), compatibility capabilities are obtained
from the hypervisor via the H_GUEST_GET_CAPABILITIES hcall. This
information is not available on PowerNV systems.

For nested API v1, derive the compatibility capabilities from the L1
guest by reading the "cpu-version" property from the device tree, which
reflects the effective (logical) processor compatibility level. Map this
value to the corresponding compatibility capability bitmap.

Introduce a helper to translate CPU version values into compatibility
capability bits and integrate it into kvmppc_get_compat_cpu_caps().

This allows userspace to query host CPU compatibility modes on both
PowerVM and PowerNV platforms via the KVM_PPC_GET_COMPAT_CAPS ioctl.

Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
---
 arch/powerpc/kvm/book3s_hv.c | 37 +++++++++++++++++++++++++++++++++++-
 1 file changed, 36 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d602d90111d1..25d05f1ccb72 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -6516,16 +6516,51 @@ static bool kvmppc_hash_v3_possible(void)
 	return true;
 }

+static int kvmppc_map_compat_capabilities(const __be32 cpu_version,
+				      unsigned long *capabilities)
+{
+	switch (cpu_version) {
+	case PVR_ARCH_31_P11:
+		*capabilities |= H_GUEST_CAP_POWER11;
+		break;
+	case PVR_ARCH_31:
+		*capabilities |= H_GUEST_CAP_POWER10;
+		break;
+	case PVR_ARCH_300:
+		*capabilities |= H_GUEST_CAP_POWER9;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return 0;
+}

 static int kvmppc_get_compat_cpu_caps(struct kvm_ppc_compat_caps *host_caps)
 {

+	struct device_node *np;
 	unsigned long capabilities = 0;
+	const __be32 *prop = NULL;
 	long rc = -EINVAL;
+	u32 cpu_version;

 	if (kvmhv_on_pseries()) {
-		if (kvmhv_is_nestedv2())
+		if (kvmhv_is_nestedv2()) {
 			rc = plpar_guest_get_capabilities(0, &capabilities);
+		} else {
+			for_each_node_by_type(np, "cpu") {
+				prop = of_get_property(np, "cpu-version", NULL);
+				if (prop) {
+					cpu_version = be32_to_cpup(prop);
+					break;
+				}
+			}
+			if (!prop)
+				return -EINVAL;
+			rc = kvmppc_map_compat_capabilities(cpu_version,
+								&capabilities);
+		}
 		host_caps->compat_capabilities = capabilities;
 	}

-- 
2.50.1

^ permalink raw reply related

* [PATCH 4/6] KVM: PPC: Book3S HV: Implement compat CPU capability retrieval for KVM on PowerVM
From: Amit Machhiwal @ 2026-04-30  5:49 UTC (permalink / raw)
  To: linuxppc-dev, Madhavan Srinivasan
  Cc: Amit Machhiwal, Vaibhav Jain, Nicholas Piggin, Michael Ellerman,
	Christophe Leroy (CS GROUP), kvm, linux-kernel
In-Reply-To: <20260430054906.94431-1-amachhiw@linux.ibm.com>

On POWER systems, the host CPU may run in a compatibility mode (e.g., a
Power11 processor operating in Power10 compatibility mode). In such
cases, the effective CPU level exposed to guests differs from the
physical processor generation.

When running nested KVM guests, QEMU derives the host CPU type using
mfpvr(), which reflects the physical processor version. This can result
in a mismatch between the CPU model selected by QEMU and the
compatibility mode enforced by the host, leading to guest boot failures.

For example, booting a nested guest on a Power11 LPAR configured in
Power10 compatibility mode fails with:

  KVM-NESTEDv2: couldn't set guest wide elements
  [..KVM reg dump..]

This occurs because QEMU selects a CPU model corresponding to the
physical processor (via mfpvr()), while the host operates in a lower
compatibility mode. As a result, KVM rejects the requested compatibility
level during guest initialization.

Add support for retrieving host CPU compatibility capabilities for
nested guests on PowerVM (PAPR nested API v2). The hypervisor provides
the effective compatibility levels via the H_GUEST_GET_CAPABILITIES
hcall, which reflects the processor modes negotiated between the Power
hypervisor (L0) and the host partition (L1).

On pseries systems, obtain the capability bitmap using
plpar_guest_get_capabilities() and return it via struct
kvm_ppc_compat_caps. This information is then exposed to userspace
through the KVM_PPC_GET_COMPAT_CAPS ioctl.

Hook the implementation into the Book3S HV kvmppc_ops so that it can be
invoked by the generic KVM ioctl handling code.

Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
---
 arch/powerpc/kvm/book3s_hv.c | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 948c6b099a29..d602d90111d1 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -6516,6 +6516,22 @@ static bool kvmppc_hash_v3_possible(void)
 	return true;
 }

+
+static int kvmppc_get_compat_cpu_caps(struct kvm_ppc_compat_caps *host_caps)
+{
+
+	unsigned long capabilities = 0;
+	long rc = -EINVAL;
+
+	if (kvmhv_on_pseries()) {
+		if (kvmhv_is_nestedv2())
+			rc = plpar_guest_get_capabilities(0, &capabilities);
+		host_caps->compat_capabilities = capabilities;
+	}
+
+	return rc;
+}
+
 static struct kvmppc_ops kvm_ops_hv = {
 	.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
 	.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
@@ -6558,6 +6574,7 @@ static struct kvmppc_ops kvm_ops_hv = {
 	.hash_v3_possible = kvmppc_hash_v3_possible,
 	.create_vcpu_debugfs = kvmppc_arch_create_vcpu_debugfs_hv,
 	.create_vm_debugfs = kvmppc_arch_create_vm_debugfs_hv,
+	.get_compat_cpu_ver = kvmppc_get_compat_cpu_caps,
 };

 static int kvm_init_subcore_bitmap(void)
-- 
2.50.1

^ permalink raw reply related

* [PATCH 3/6] KVM: PPC: Wire up KVM_PPC_GET_COMPAT_CAPS ioctl
From: Amit Machhiwal @ 2026-04-30  5:49 UTC (permalink / raw)
  To: linuxppc-dev, Madhavan Srinivasan
  Cc: Amit Machhiwal, Vaibhav Jain, Nicholas Piggin, Michael Ellerman,
	Christophe Leroy (CS GROUP), kvm, linux-kernel
In-Reply-To: <20260430054906.94431-1-amachhiw@linux.ibm.com>

Add handling for KVM_PPC_GET_COMPAT_CAPS in kvm_arch_vm_ioctl() and
advertise support via KVM_CAP_PPC_COMPAT_CAPS.

The ioctl retrieves host CPU compatibility capabilities via a
PowerPC-specific backend implementation when available. If the
capability is not supported, the ioctl returns success with no
capabilities set, allowing userspace to fall back gracefully.

Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
---
 arch/powerpc/include/asm/kvm_ppc.h |  1 +
 arch/powerpc/kvm/powerpc.c         | 19 +++++++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
index 0953f2daa466..cadfb839e836 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -319,6 +319,7 @@ struct kvmppc_ops {
 	bool (*hash_v3_possible)(void);
 	int (*create_vm_debugfs)(struct kvm *kvm);
 	int (*create_vcpu_debugfs)(struct kvm_vcpu *vcpu, struct dentry *debugfs_dentry);
+	int (*get_compat_cpu_ver)(struct kvm_ppc_compat_caps *host_caps);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 00302399fc37..f35017d83d77 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -697,6 +697,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 			}
 		}
 		break;
+#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE)
+	case KVM_CAP_PPC_COMPAT_CAPS:
+		if (kvmhv_on_pseries())
+			r = 1;
+		break;
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 	default:
 		r = 0;
 		break;
@@ -2463,6 +2469,19 @@ int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg)
 		r = kvm->arch.kvm_ops->svm_off(kvm);
 		break;
 	}
+	case KVM_PPC_GET_COMPAT_CAPS: {
+		struct kvm_ppc_compat_caps host_caps;
+
+		memset(&host_caps, 0, sizeof(host_caps));
+		if (!kvm->arch.kvm_ops->get_compat_cpu_ver)
+			goto out;
+
+		r = kvm->arch.kvm_ops->get_compat_cpu_ver(&host_caps);
+		if (!r && copy_to_user(argp, &host_caps,
+				     sizeof(host_caps)))
+			r = -EFAULT;
+		break;
+	}
 	default: {
 		struct kvm *kvm = filp->private_data;
 		r = kvm->arch.kvm_ops->arch_vm_ioctl(filp, ioctl, arg);
-- 
2.50.1



^ permalink raw reply related

* [PATCH 2/6] KVM: PPC: Introduce KVM_CAP_PPC_COMPAT_CAPS and KVM_PPC_GET_COMPAT_CAPS
From: Amit Machhiwal @ 2026-04-30  5:49 UTC (permalink / raw)
  To: linuxppc-dev, Madhavan Srinivasan
  Cc: Amit Machhiwal, Vaibhav Jain, Paolo Bonzini, Nicholas Piggin,
	Michael Ellerman, Christophe Leroy (CS GROUP), kvm, linux-kernel
In-Reply-To: <20260430054906.94431-1-amachhiw@linux.ibm.com>

Introduce a new capability and ioctl to expose CPU compatibility modes
supported by the host processor for nested guests.

Introduce a new KVM capability, KVM_CAP_PPC_COMPAT_CAPS, and a
corresponding vm ioctl, KVM_PPC_GET_COMPAT_CAPS, to expose processor
compatibility modes supported by the host.

On IBM POWER systems, newer processor generations (N) can operate in
compatibility modes corresponding to earlier generations, like (N-1) and
(N-2). This is particularly relevant for nested virtualization, where
nested KVM guests may need to run with a specific processor compatibility
level.

The new ioctl returns a bitmap describing the compatibility modes
supported by the host in respective bit numbers. This allows userspace
to select an appropriate compatibility level when configuring nested KVM
guests.

Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
---
 arch/powerpc/include/uapi/asm/kvm.h | 6 ++++++
 include/uapi/linux/kvm.h            | 4 ++++
 2 files changed, 10 insertions(+)

diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h
index 077c5437f521..a38dff7a8aea 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -437,6 +437,12 @@ struct kvm_ppc_cpu_char {
 	__u64	behaviour_mask;		/* valid bits in behaviour */
 };
 
+/* For KVM_PPC_GET_COMPAT_CAPS */
+struct kvm_ppc_compat_caps {
+	__u32	flags;
+	__u64	compat_capabilities;	/* Capabilities supported by the host */
+};
+
 /*
  * Values for character and character_mask.
  * These are identical to the values used by H_GET_CPU_CHARACTERISTICS.
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 6c8afa2047bf..1788a0068662 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -996,6 +996,7 @@ struct kvm_enable_cap {
 #define KVM_CAP_S390_USER_OPEREXEC 246
 #define KVM_CAP_S390_KEYOP 247
 #define KVM_CAP_S390_VSIE_ESAMODE 248
+#define KVM_CAP_PPC_COMPAT_CAPS 249
 
 struct kvm_irq_routing_irqchip {
 	__u32 irqchip;
@@ -1349,6 +1350,9 @@ struct kvm_s390_keyop {
 #define KVM_GET_DEVICE_ATTR	  _IOW(KVMIO,  0xe2, struct kvm_device_attr)
 #define KVM_HAS_DEVICE_ATTR	  _IOW(KVMIO,  0xe3, struct kvm_device_attr)
 
+/* Available with KVM_CAP_PPC_COMPAT_CAPS */
+#define KVM_PPC_GET_COMPAT_CAPS	_IOR(KVMIO,  0xe4, struct kvm_ppc_compat_caps)
+
 /*
  * ioctls for vcpu fds
  */
-- 
2.50.1



^ permalink raw reply related

* [PATCH 1/6] KVM: PPC: Book3S HV: Validate arch_compat against host compatibility mode
From: Amit Machhiwal @ 2026-04-30  5:49 UTC (permalink / raw)
  To: linuxppc-dev, Madhavan Srinivasan
  Cc: Amit Machhiwal, Vaibhav Jain, Nicholas Piggin, Michael Ellerman,
	Christophe Leroy (CS GROUP), kvm, linux-kernel
In-Reply-To: <20260430054906.94431-1-amachhiw@linux.ibm.com>

On IBM POWER systems, newer processor generations can operate in
compatibility modes corresponding to earlier generations. This becomes
relevant for nested virtualization, where nested KVM guests may need to
run with a specific processor compatibility level.

Currently, when running a nested KVM guest (L2) inside a Power11 pSeries
logical partition (L1) booted in Power10 compatibility mode, the guest
fails to boot while setting 'arch_compat'. This happens because the CPU
class is derived from the hardware PVR (via mfspr()), which reflects the
physical processor generation (Power11), rather than the effective
compatibility mode (Power10).

As a result, userspace may request a Power11 arch_compat for the L2
guest. However, the L1 partition, running in Power10 compatibility, has
only negotiated support up to Power10 with the Power Hypervisor (L0).
When H_SET_STATE is invoked with a Power11 Logical PVR, the hypervisor
rejects the request, leading to a late guest boot failure:

  KVM-NESTEDv2: couldn't set guest wide elements
  [..KVM reg dump..]

This situation should be detected earlier. Rejecting unsupported
'arch_compat' values in 'kvmppc_set_arch_compat()' avoids issuing an
invalid H_SET_STATE hcall and provides a clearer failure mode.

Add a check to reject Power11 'arch_compat' requests when the host is
running in Power10 compatibility mode, returning -EINVAL early instead
of deferring the failure to the hypervisor.

Signed-off-by: Amit Machhiwal <amachhiw@linux.ibm.com>
---
 arch/powerpc/kvm/book3s_hv.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 61dbeea317f3..948c6b099a29 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -446,7 +446,13 @@ static int kvmppc_set_arch_compat(struct kvm_vcpu *vcpu, u32 arch_compat)
 			guest_pcr_bit = PCR_ARCH_300;
 			break;
 		case PVR_ARCH_31:
+			guest_pcr_bit = PCR_ARCH_31;
+			break;
 		case PVR_ARCH_31_P11:
+			if ((PVR_ARCH_31 & cur_cpu_spec->pvr_mask) ==
+				cur_cpu_spec->pvr_value) {
+				return -EINVAL;
+			}
 			guest_pcr_bit = PCR_ARCH_31;
 			break;
 		default:
-- 
2.50.1

^ permalink raw reply related

* [PATCH 0/6] KVM: PPC: Handle CPU compatibility mode for nested guests
From: Amit Machhiwal @ 2026-04-30  5:48 UTC (permalink / raw)
  To: linuxppc-dev, Madhavan Srinivasan
  Cc: Amit Machhiwal, Vaibhav Jain, Paolo Bonzini, Nicholas Piggin,
	Michael Ellerman, Christophe Leroy (CS GROUP), Jonathan Corbet,
	Shuah Khan, kvm, linux-kernel, linux-doc

On POWER systems, newer processor generations can operate in compatibility
modes corresponding to earlier generations (e.g., a Power11 system running
in Power10 compatibility mode). In such cases, the effective CPU level
exposed to guests differs from the physical processor generation.

This creates a problem for nested virtualization. When booting a nested KVM
guest (L2) inside a host KVM guest (L1) running in a compatibility mode,
userspace (e.g., QEMU) may derive the CPU model from the raw hardware PVR
and attempt to configure the nested guest accordingly. However, the L1
partition is constrained by the compatibility level negotiated with the
hypervisor (L0), and requests exceeding that level are rejected, leading to
guest boot failures such as:

  KVM-NESTEDv2: couldn't set guest wide elements

This series addresses the issue in two steps:

1. Detect and reject invalid compatibility requests early in KVM to avoid
   late failures.

2. Provide a mechanism for userspace to query the effective CPU
   compatibility modes supported by the host, so it can select an
   appropriate CPU model for nested guests.

To achieve this, the series introduces a new KVM capability and ioctl
(KVM_CAP_PPC_COMPAT_CAPS / KVM_PPC_GET_COMPAT_CAPS) that expose the
compatibility modes supported by the host.

The implementation supports both:

  - PowerVM (nested API v2), where compatibility information is obtained
    via the H_GUEST_GET_CAPABILITIES hypercall.
  - PowerNV (nested API v1), where compatibility is derived from the device
    tree ("cpu-version") representing the effective processor compatibility
    level.

This allows userspace (e.g., QEMU) to select a CPU model consistent with
the host compatibility mode, avoiding mismatches and enabling successful
nested guest boot.

Patch summary:
  [1/6] Validate arch_compat against host compatibility mode
  [2/6] Introduce KVM_CAP_PPC_COMPAT_CAPS and ioctl
  [3/6] Wire up ioctl handling
  [4/6] Implement capability retrieval for PowerVM (API v2)
  [5/6] Add PowerNV support (API v1)
  [6/6] Document the new ioctl

Tested on:
  - Power11 pSeries LPAR in Power10 compatibility mode (nested API v2)
  - Power10 PowerNV system (and QEMU TCG PowerNV 11) with nested
    virtualization (API v1) with various combinations of KVM L1/L2 guests
    in various supported compatibility modes.

With this series, nested guests boot successfully in configurations where
they previously failed due to compatibility mismatches.

Amit Machhiwal (6):
  KVM: PPC: Book3S HV: Validate arch_compat against host compatibility
    mode
  KVM: PPC: Introduce KVM_CAP_PPC_COMPAT_CAPS and
    KVM_PPC_GET_COMPAT_CAPS
  KVM: PPC: Wire up KVM_PPC_GET_COMPAT_CAPS ioctl
  KVM: PPC: Book3S HV: Implement compat CPU capability retrieval for KVM
    on PowerVM
  KVM: PPC: Book3S HV: Add support for compat CPU capabilities for KVM
    on PowerNV
  KVM: PPC: Document KVM_PPC_GET_COMPAT_CAPS ioctl

 Documentation/virt/kvm/api.rst      | 35 +++++++++++++++++
 arch/powerpc/include/asm/kvm_ppc.h  |  1 +
 arch/powerpc/include/uapi/asm/kvm.h |  6 +++
 arch/powerpc/kvm/book3s_hv.c        | 58 +++++++++++++++++++++++++++++
 arch/powerpc/kvm/powerpc.c          | 19 ++++++++++
 include/uapi/linux/kvm.h            |  4 ++
 6 files changed, 123 insertions(+)

base-commit: dca922e019dd758b4c1b4bec8f1d509efddeaab4
--
2.50.1

^ permalink raw reply

* [PATCH v2 0/5] mm: reduce mmap_lock contention and improve page fault performance
From: Barry Song (Xiaomi) @ 2026-04-30  4:04 UTC (permalink / raw)
  To: akpm, linux-mm, willy
  Cc: david, ljs, liam, vbabka, rppt, surenb, mhocko, jack, pfalcato,
	wanglian, chentao, lianux.mm, kunwu.chan, liyangouwen1, chrisl,
	kasong, shikemeng, nphamcs, bhe, youngjun.park, linux-arm-kernel,
	linux-kernel, loongarch, linuxppc-dev, linux-riscv, linux-s390,
	Barry Song (Xiaomi)

Oven observed most mmap_lock contention and priority inversion
come from page fault retries after waiting for I/O completion.
Oven subsequently raised the following idea:

There is no need to always fall back to mmap_lock when the per-VMA lock
is released only to wait for the page cache to become ready. On a page
fault retry, the per-VMA lock can still be reused.

We believe the same should also apply to anonymous folios. However, there
is a case where I/O has completed but we fail to acquire the folio lock
because a concurrent thread may be installing PTEs for the folio. This
is expected to be short-lived, so retrying the page fault is unnecessary.

This patchset handles two cases:

(1) If we need to wait for I/O completion, we still drop the per-VMA lock, as
current page fault handling already does. Holding it for too long may introduce
various priority inversion issues on mobile devices. After I/O completes, we
retry the page fault with the per-VMA lock, rather than falling back to
mmap_lock.

(2) If I/O has already completed and the folio is up to date, the wait is
likely due to a concurrent PTE installation. In this case, we keep the
per-VMA lock and avoid retrying the page fault.

With (1), the dramatically reduced mmap_lock contention leads to a
significant improvement in Douyin performance. Oven’s data is shown
below.

Douyin (the Chinese version of TikTok) warm start on a smartphone with
8GB RAM.

== mmap_lock Acquisitions And Wait Time ==

Metric                    Before (Avg)    After (Avg)    Change
------------------------------------------------------------------------
Read Lock Count           20,010          5,719          -71.42%
Read Total Wait (us)      10,695,877     408,436        -96.18%
Read Avg Wait (us)        534.00         71.00           -86.70%
Write Lock Count          838             909            +8.47%
Write Total Wait (us)     501,293        97,633          -80.52%
Write Avg Wait (us)       598.00         107.00          -82.11%


== Read Lock Waiting Time Distribution of mmap_lock ==

Range (us)                 Before (Avg)    After (Avg)    Change
------------------------------------------------------------------------
[0, 1)                     9,927           4,286          -56.82%
[1, 10)                    9,179           1,327          -85.54%
[10, 100)                  191             88             -53.93%
[100, 1000)                57              6              -89.47%
[1000, 10000)              328             9              -97.26%
[10000, 100000)            328             6              -98.17%
[100000, 1000000)          0               0              N/A
[1000000, +)               0               0              N/A

== Write Lock Waiting Time Distribution of mmap_lock ==

Range (us)                 Before (Avg)    After (Avg)    Change
------------------------------------------------------------------------
[0, 1)                     250             300            +20.00%
[1, 10)                    483             556            +15.11%
[10, 100)                  52              41             -21.15%
[100, 1000)                12              5              -58.33%
[1000, 10000)              22              4              -81.82%
[10000, 100000)            16              1              -93.75%
[100000, 1000000)          0               0              N/A
[1000000, +)               0               0              N/A

After the optimization, the number of read lock acquisitions is 
significantly reduced, and both lock waiting time and tail latency are 
dramatically improved.

Kunwu and Lian also developed a model to capture the situation described
by Matthew [1], where a memcg with limited memory may fail to make
progress. This happens because after I/O is initiated on the first page
fault, the folios may be reclaimed by the time of the retry, leaving the
workload with little or no forward progress.

The stress setup made by Kunwu and Lian is as follows:
* 256-core x86 system
* 500 threads continuously faulting on 16MB files

The model was running within a memcg with limited memory,
as shown below:

systemd-run --scope -p MemoryHigh=1G -p MemoryMax=1.2G -p MemorySwapMax=0 \
--unit=mmap-thrash-$$ ./mmap_lock & \
TEST_PID=$!

The reproducer code is shown below:

 /* Number of worker threads created by main(). */
 #define THREADS 500 
 /* Size each worker truncates its backing file to and then maps. */
 #define FILE_SIZE (16 * 1024 * 1024) /* 16MB */ 
 /* Stop flag: polled by every worker, set to 1 by main() to end the run. */
 static _Atomic int g_stop = 0; 
 /* How long main() sleeps before setting g_stop, in seconds. */
 #define RUN_SECONDS 600 
 
 /*
  * Per-thread argument passed to worker().
  *   id     - index of the thread; used in the file name and as the slot
  *            in counts[] the worker writes its result to.
  *   counts - array shared by all workers, one slot per thread.
  */
 struct worker_arg { 
         long id; 
         uint64_t *counts; 
 }; 
 
 /*
  * Worker thread body.
  *
  * Creates a private file "./test_file_<pid>_<id>.dat" of FILE_SIZE bytes,
  * then, until g_stop is set, repeatedly maps it read-only/shared, reads one
  * byte every 4096 bytes across the mapping and unmaps it again.  Each
  * completed pass counts as one round; the total is stored in
  * wa->counts[id] before the file is closed and removed.
  *
  * arg: pointer to this thread's struct worker_arg.
  * Returns NULL in all cases.  If the file cannot be created or sized the
  * thread exits early without touching counts[id].
  */
 void *worker(void *arg)
 {
         struct worker_arg *wa = (struct worker_arg *)arg;
         long id = wa->id;
         char path[64];
         uint64_t local_rounds = 0;
 
         snprintf(path, sizeof(path), "./test_file_%d_%ld.dat",
                  getpid(), id);
         int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666);
         if (fd < 0) return NULL;
         if (ftruncate(fd, FILE_SIZE) < 0) {
                 /* The file already exists on disk; do not leave it behind. */
                 close(fd);
                 unlink(path);
                 return NULL;
         }
 
         while (!atomic_load_explicit(&g_stop, memory_order_relaxed)) {
                 char *f_map = mmap(NULL, FILE_SIZE, PROT_READ,
                                    MAP_SHARED, fd, 0);
                 if (f_map != MAP_FAILED) {
                         /* Pure page cache thrashing */
                         for (int i = 0; i < FILE_SIZE; i += 4096) {
                                 /* volatile keeps the read from being optimised away */
                                 volatile unsigned char c =
                                         (unsigned char)f_map[i];
                                 (void)c;
                         }
                         munmap(f_map, FILE_SIZE);
                         local_rounds++;
                 }
         }
         wa->counts[id] = local_rounds;
         close(fd);
         unlink(path);
         return NULL;
 }
 
 /*
  * Entry point of the file-thrashing reproducer.
  *
  * Starts THREADS workers, lets them run for RUN_SECONDS, raises g_stop,
  * joins them and prints the summed round count and the rounds/sec rate.
  *
  * Returns 0 on success.  If a thread cannot be created, the error is
  * reported, the threads that did start are stopped and joined, and 1 is
  * returned; only successfully created threads are ever joined.
  */
 int main(void)
 {
         printf("Pure File Thrashing Started. PID: %d\n", getpid());
         pthread_t t[THREADS];
         uint64_t local_counts[THREADS];
         memset(local_counts, 0, sizeof(local_counts));
         struct worker_arg args[THREADS];
         long started = 0;
 
         for (long i = 0; i < THREADS; i++) {
                 args[i].id = i;
                 args[i].counts = local_counts;
                 /* pthread_create() returns the error number, it does not set errno */
                 int rc = pthread_create(&t[i], NULL, worker, &args[i]);
                 if (rc != 0) {
                         fprintf(stderr, "pthread_create: %s\n", strerror(rc));
                         break;
                 }
                 started++;
         }
 
         /* Only run the timed phase when the full thread set is up. */
         if (started == THREADS)
                 sleep(RUN_SECONDS);
         atomic_store_explicit(&g_stop, 1, memory_order_relaxed);
 
         /* t[started..] was never initialised, so it must not be joined. */
         for (long i = 0; i < started; i++) pthread_join(t[i], NULL);
         if (started != THREADS)
                 return 1;
 
         uint64_t total = 0;
         for (int i = 0; i < THREADS; i++) total += local_counts[i];
 
         printf("Total rounds     : %llu\n", (unsigned long long)total);
         printf("Throughput       : %.2f rounds/sec\n",
                (double)total / RUN_SECONDS);
         return 0;
 }

They also added temporary counters in page fault retries [2]:
- RETRY_IO_MISS   : folio not present after I/O completion
- RETRY_MMAP_DROP : retry fallback due to waiting for I/O

Their results are as follows:

| Case                | Total Rounds | Throughput | Miss/Drop(%) | RETRY_MMAP_DROP | RETRY_IO_MISS |
| ------------------- | ------------ | ---------- | ------------ | --------------- | ------------- |
| Baseline (Run 1)    | 22,711       | 37.85 /s   | 45.04        | 970,078         | 436,956       |
| Baseline (Run 2)    | 23,530       | 39.22 /s   | 44.96        | 972,043         | 437,077       |
| With Series (Run A) | 54,428       | 90.71 /s   | 1.69         | 1,204,124       | 20,398        |
| With Series (Run B) | 35,949       | 59.91 /s   | 0.03         | 327,023         | 99            |

Without this series, nearly half of the retries fail to observe completed
I/O results, leading to significant CPU and I/O waste. With the finer-
grained VMA lock, faulting threads avoid the heavily contended mmap_lock
during retries and are therefore able to complete the page fault.

With (2), there is a clear improvement in swap-in bandwidth in a model
with five threads issuing MADV_PAGEOUT-based swap-outs and five threads
performing swap-ins on a 100MB anonymous mmap VMA.

 /* Size in bytes of the anonymous mapping that main() creates (100MB). */
 #define SIZE (100 * 1024 * 1024)
 /* Stride reader_thread() uses when touching the buffer.
  * NOTE(review): hard-coded; presumably meant to match the base page size. */
 #define PAGE_SIZE 4096
 /* Number of pageout_thread() instances started by main(). */
 #define WRITER_THREADS 5
 /* Number of reader_thread() instances started by main(). */
 #define READER_THREADS 5
 /* How long main() sleeps before setting stop_flag, in seconds. */
 #define RUN_SECONDS 30
 
 /* Shared test buffer; assigned from mmap() in main(). */
 static uint8_t *buf;
 /* Count of successful madvise(MADV_PAGEOUT) calls across all writers. */
 static atomic_ulong pageout_rounds = 0;
 /* Count of completed full passes over buf across all readers. */
 static atomic_ulong swapin_rounds = 0;
 /* Set to 1 by main() to make all threads leave their loops. */
 static atomic_int stop_flag = 0;
 
 /*
  * Writer thread: until stop_flag becomes non-zero, keeps calling
  * madvise(MADV_PAGEOUT) on the whole buffer and bumps pageout_rounds once
  * for every call that returns 0.  The argument is unused; returns NULL.
  */
 static void *pageout_thread(void *arg)
 {
     (void)arg;
 
     for (;;) {
         if (atomic_load(&stop_flag))
             break;
         if (madvise(buf, SIZE, MADV_PAGEOUT) != 0)
             continue;   /* failed calls are not counted */
         atomic_fetch_add(&pageout_rounds, 1);
     }
     return NULL;
 }
 
 /*
  * Reader thread: until stop_flag becomes non-zero, walks the buffer reading
  * one byte every PAGE_SIZE bytes, then increments swapin_rounds.  The
  * argument is unused; returns NULL.
  */
 static void *reader_thread(void *arg)
 {
     /* volatile accumulator so the byte reads cannot be optimised away */
     volatile uint64_t sum = 0;
 
     (void)arg;
     while (!atomic_load(&stop_flag)) {
         size_t off = 0;
 
         while (off < SIZE) {
             sum += buf[off];
             off += PAGE_SIZE;
         }
         /* A full pass over the buffer is counted as one (approximate) swap-in round */
         atomic_fetch_add(&swapin_rounds, 1);
     }
     return NULL;
 }
 
 int main(void)
 {
     pthread_t writers[WRITER_THREADS];
     pthread_t readers[READER_THREADS];
 
     buf = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
                MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
     if (buf == MAP_FAILED) {
         exit(EXIT_FAILURE);
     }
     memset(buf, 0, SIZE);
 
     for (int i = 0; i < WRITER_THREADS; i++) {
         if (pthread_create(&writers[i], NULL, pageout_thread, NULL) != 0) {
             perror("pthread_create");
             exit(EXIT_FAILURE);
         }
     }
     for (int i = 0; i < READER_THREADS; i++) {
         if (pthread_create(&readers[i], NULL, reader_thread, NULL) != 0) {
             perror("pthread_create");
             exit(EXIT_FAILURE);
         }
     }
 
     sleep(RUN_SECONDS);
     atomic_store(&stop_flag, 1);
     for (int i = 0; i < WRITER_THREADS; i++)
         pthread_join(writers[i], NULL);
     for (int i = 0; i < READER_THREADS; i++)
         pthread_join(readers[i], NULL);
 
     printf("=== Result (30s) ===\n");
     printf("Pageout rounds: %lu\n", pageout_rounds);
     printf("Swap-in rounds (approx): %lu\n", swapin_rounds);
     munmap(buf, SIZE);
     return 0;
 }

W/o patches:
=== Result (30s) ===
Pageout rounds: 1324847
Swap-in rounds (approx): 874

W/patches:
=== Result (30s) ===
Pageout rounds: 1330550
Swap-in rounds (approx): 1017

[1] https://lore.kernel.org/linux-mm/aSip2mWX13sqPW_l@casper.infradead.org/
[2] https://github.com/lianux-mm/ioretry_test/

-v2:
  * collect tags from Pedro, Kunwu and Lian, thanks!
  * handle case (2), for uptodate folios, don't retry PF
-RFC:
  https://lore.kernel.org/linux-mm/20251127011438.6918-1-21cnbao@gmail.com/

Barry Song (Xiaomi) (4):
  mm/swapin: Retry swapin by VMA lock if the lock was released for I/O
  mm: Move folio_lock_or_retry() and drop __folio_lock_or_retry()
  mm: Don't retry page fault if folio is uptodate during swap-in
  mm/filemap: Avoid retrying page faults on uptodate folios in filemap
    faults

Oven Liyang (1):
  mm/filemap: Retry fault by VMA lock if the lock was released for I/O

 arch/arm/mm/fault.c       |  5 +++
 arch/arm64/mm/fault.c     |  5 +++
 arch/loongarch/mm/fault.c |  4 +++
 arch/powerpc/mm/fault.c   |  5 ++-
 arch/riscv/mm/fault.c     |  4 +++
 arch/s390/mm/fault.c      |  4 +++
 arch/x86/mm/fault.c       |  4 +++
 include/linux/mm_types.h  |  9 ++---
 include/linux/pagemap.h   | 17 ----------
 mm/filemap.c              | 57 ++++++-------------------------
 mm/memory.c               | 70 +++++++++++++++++++++++++++++++++++++--
 11 files changed, 114 insertions(+), 70 deletions(-)

-- 
* The work began during my collaboration with OPPO and has continued through
my current collaboration with Xiaomi. Although the OPPO collaboration has
ended, OPPO still deserves more than half of the credit for this series,
if any credit is to be assigned.

2.39.3 (Apple Git-146)


^ permalink raw reply

* [PATCH v2 1/5] mm/filemap: Retry fault by VMA lock if the lock was released for I/O
From: Barry Song (Xiaomi) @ 2026-04-30  4:04 UTC (permalink / raw)
  To: akpm, linux-mm, willy
  Cc: david, ljs, liam, vbabka, rppt, surenb, mhocko, jack, pfalcato,
	wanglian, chentao, lianux.mm, kunwu.chan, liyangouwen1, chrisl,
	kasong, shikemeng, nphamcs, bhe, youngjun.park, linux-arm-kernel,
	linux-kernel, loongarch, linuxppc-dev, linux-riscv, linux-s390,
	Barry Song
In-Reply-To: <20260430040427.4672-1-baohua@kernel.org>

From: Oven Liyang <liyangouwen1@oppo.com>

If the current page fault is using the per-VMA lock, and we only released
the lock to wait for I/O completion (e.g., using folio_lock()), then when
the fault is retried after the I/O completes, it should still qualify for
the per-VMA-lock path.

Acked-by: Pedro Falcato <pfalcato@suse.de>
Tested-by: Wang Lian <wanglian@kylinos.cn>
Tested-by: Kunwu Chan <chentao@kylinos.cn>
Reviewed-by: Wang Lian <lianux.mm@gmail.com>
Reviewed-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Oven Liyang <liyangouwen1@oppo.com>
Co-developed-by: Barry Song <baohua@kernel.org>
Signed-off-by: Barry Song <baohua@kernel.org>
---
 arch/arm/mm/fault.c       | 5 +++++
 arch/arm64/mm/fault.c     | 5 +++++
 arch/loongarch/mm/fault.c | 4 ++++
 arch/powerpc/mm/fault.c   | 5 ++++-
 arch/riscv/mm/fault.c     | 4 ++++
 arch/s390/mm/fault.c      | 4 ++++
 arch/x86/mm/fault.c       | 4 ++++
 include/linux/mm_types.h  | 9 +++++----
 mm/filemap.c              | 5 ++++-
 9 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c
index e62cc4be5adf..5971e02845f7 100644
--- a/arch/arm/mm/fault.c
+++ b/arch/arm/mm/fault.c
@@ -391,6 +391,7 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 	if (!(flags & FAULT_FLAG_USER))
 		goto lock_mmap;
 
+retry_vma:
 	vma = lock_vma_under_rcu(mm, addr);
 	if (!vma)
 		goto lock_mmap;
@@ -420,6 +421,10 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 			goto no_context;
 		return 0;
 	}
+
+	/* If the first try is only about waiting for the I/O to complete */
+	if (fault & VM_FAULT_RETRY_VMA)
+		goto retry_vma;
 lock_mmap:
 
 retry:
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 739800835920..d0362a3e11b7 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -673,6 +673,7 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 	if (!(mm_flags & FAULT_FLAG_USER))
 		goto lock_mmap;
 
+retry_vma:
 	vma = lock_vma_under_rcu(mm, addr);
 	if (!vma)
 		goto lock_mmap;
@@ -719,6 +720,10 @@ static int __kprobes do_page_fault(unsigned long far, unsigned long esr,
 			goto no_context;
 		return 0;
 	}
+
+	/* If the first try is only about waiting for the I/O to complete */
+	if (fault & VM_FAULT_RETRY_VMA)
+		goto retry_vma;
 lock_mmap:
 
 retry:
diff --git a/arch/loongarch/mm/fault.c b/arch/loongarch/mm/fault.c
index 2c93d33356e5..738f495560c0 100644
--- a/arch/loongarch/mm/fault.c
+++ b/arch/loongarch/mm/fault.c
@@ -219,6 +219,7 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
 	if (!(flags & FAULT_FLAG_USER))
 		goto lock_mmap;
 
+retry_vma:
 	vma = lock_vma_under_rcu(mm, address);
 	if (!vma)
 		goto lock_mmap;
@@ -265,6 +266,9 @@ static void __kprobes __do_page_fault(struct pt_regs *regs,
 			no_context(regs, write, address);
 		return;
 	}
+	/* If the first try is only about waiting for the I/O to complete */
+	if (fault & VM_FAULT_RETRY_VMA)
+		goto retry_vma;
 lock_mmap:
 
 retry:
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index 806c74e0d5ab..cb7ffc20c760 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
@@ -487,6 +487,7 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
 	if (!(flags & FAULT_FLAG_USER))
 		goto lock_mmap;
 
+retry_vma:
 	vma = lock_vma_under_rcu(mm, address);
 	if (!vma)
 		goto lock_mmap;
@@ -516,7 +517,9 @@ static int ___do_page_fault(struct pt_regs *regs, unsigned long address,
 
 	if (fault_signal_pending(fault, regs))
 		return user_mode(regs) ? 0 : SIGBUS;
-
+	/* If the first try is only about waiting for the I/O to complete */
+	if (fault & VM_FAULT_RETRY_VMA)
+		goto retry_vma;
 lock_mmap:
 
 	/* When running in the kernel we expect faults to occur only to
diff --git a/arch/riscv/mm/fault.c b/arch/riscv/mm/fault.c
index 04ed6f8acae4..b94cf57c2b9a 100644
--- a/arch/riscv/mm/fault.c
+++ b/arch/riscv/mm/fault.c
@@ -347,6 +347,7 @@ void handle_page_fault(struct pt_regs *regs)
 	if (!(flags & FAULT_FLAG_USER))
 		goto lock_mmap;
 
+retry_vma:
 	vma = lock_vma_under_rcu(mm, addr);
 	if (!vma)
 		goto lock_mmap;
@@ -376,6 +377,9 @@ void handle_page_fault(struct pt_regs *regs)
 			no_context(regs, addr);
 		return;
 	}
+	/* If the first try is only about waiting for the I/O to complete */
+	if (fault & VM_FAULT_RETRY_VMA)
+		goto retry_vma;
 lock_mmap:
 
 retry:
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index 191cc53caead..e0576e629f65 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -294,6 +294,7 @@ static void do_exception(struct pt_regs *regs, int access)
 		flags |= FAULT_FLAG_WRITE;
 	if (!(flags & FAULT_FLAG_USER))
 		goto lock_mmap;
+retry_vma:
 	vma = lock_vma_under_rcu(mm, address);
 	if (!vma)
 		goto lock_mmap;
@@ -318,6 +319,9 @@ static void do_exception(struct pt_regs *regs, int access)
 			handle_fault_error_nolock(regs, 0);
 		return;
 	}
+	/* If the first try is only about waiting for the I/O to complete */
+	if (fault & VM_FAULT_RETRY_VMA)
+		goto retry_vma;
 lock_mmap:
 retry:
 	vma = lock_mm_and_find_vma(mm, address, regs);
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f0e77e084482..0589fc693eea 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -1322,6 +1322,7 @@ void do_user_addr_fault(struct pt_regs *regs,
 	if (!(flags & FAULT_FLAG_USER))
 		goto lock_mmap;
 
+retry_vma:
 	vma = lock_vma_under_rcu(mm, address);
 	if (!vma)
 		goto lock_mmap;
@@ -1351,6 +1352,9 @@ void do_user_addr_fault(struct pt_regs *regs,
 						 ARCH_DEFAULT_PKEY);
 		return;
 	}
+	/* If the first try is only about waiting for the I/O to complete */
+	if (fault & VM_FAULT_RETRY_VMA)
+		goto retry_vma;
 lock_mmap:
 
 retry:
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index a308e2c23b82..5907200ea587 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -1678,10 +1678,11 @@ enum vm_fault_reason {
 	VM_FAULT_NOPAGE         = (__force vm_fault_t)0x000100,
 	VM_FAULT_LOCKED         = (__force vm_fault_t)0x000200,
 	VM_FAULT_RETRY          = (__force vm_fault_t)0x000400,
-	VM_FAULT_FALLBACK       = (__force vm_fault_t)0x000800,
-	VM_FAULT_DONE_COW       = (__force vm_fault_t)0x001000,
-	VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x002000,
-	VM_FAULT_COMPLETED      = (__force vm_fault_t)0x004000,
+	VM_FAULT_RETRY_VMA      = (__force vm_fault_t)0x000800,
+	VM_FAULT_FALLBACK       = (__force vm_fault_t)0x001000,
+	VM_FAULT_DONE_COW       = (__force vm_fault_t)0x002000,
+	VM_FAULT_NEEDDSYNC      = (__force vm_fault_t)0x004000,
+	VM_FAULT_COMPLETED      = (__force vm_fault_t)0x008000,
 	VM_FAULT_HINDEX_MASK    = (__force vm_fault_t)0x0f0000,
 };
 
diff --git a/mm/filemap.c b/mm/filemap.c
index ab34cab2416a..a045b771e8de 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3525,6 +3525,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 	struct folio *folio;
 	vm_fault_t ret = 0;
 	bool mapping_locked = false;
+	bool retry_by_vma_lock = false;
 
 	max_idx = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
 	if (unlikely(index >= max_idx))
@@ -3621,6 +3622,8 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 	 */
 	if (fpin) {
 		folio_unlock(folio);
+		if (vmf->flags & FAULT_FLAG_VMA_LOCK)
+			retry_by_vma_lock = true;
 		goto out_retry;
 	}
 	if (mapping_locked)
@@ -3671,7 +3674,7 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
 		filemap_invalidate_unlock_shared(mapping);
 	if (fpin)
 		fput(fpin);
-	return ret | VM_FAULT_RETRY;
+	return ret | VM_FAULT_RETRY | (retry_by_vma_lock ? VM_FAULT_RETRY_VMA : 0);
 }
 EXPORT_SYMBOL(filemap_fault);
 
-- 
2.39.3 (Apple Git-146)



^ permalink raw reply related

* [PATCH v2 2/5] mm/swapin: Retry swapin by VMA lock if the lock was released for I/O
From: Barry Song (Xiaomi) @ 2026-04-30  4:04 UTC (permalink / raw)
  To: akpm, linux-mm, willy
  Cc: david, ljs, liam, vbabka, rppt, surenb, mhocko, jack, pfalcato,
	wanglian, chentao, lianux.mm, kunwu.chan, liyangouwen1, chrisl,
	kasong, shikemeng, nphamcs, bhe, youngjun.park, linux-arm-kernel,
	linux-kernel, loongarch, linuxppc-dev, linux-riscv, linux-s390,
	Barry Song (Xiaomi)
In-Reply-To: <20260430040427.4672-1-baohua@kernel.org>

If the current do_swap_page() took the per-VMA lock and we dropped it only
to wait for I/O completion (e.g., using folio_wait_locked()), then when
do_swap_page() is retried after the I/O completes, it should still qualify
for the per-VMA-lock path.

Tested-by: Wang Lian <wanglian@kylinos.cn>
Tested-by: Kunwu Chan <chentao@kylinos.cn>
Reviewed-by: Wang Lian <lianux.mm@gmail.com>
Reviewed-by: Kunwu Chan <kunwu.chan@gmail.com>
Signed-off-by: Barry Song (Xiaomi) <baohua@kernel.org>
---
 mm/memory.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/mm/memory.c b/mm/memory.c
index 199214f8de08..00ee1599d637 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4791,6 +4791,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	unsigned long page_idx;
 	unsigned long address;
 	pte_t *ptep;
+	bool retry_by_vma_lock = false;
 
 	if (!pte_unmap_same(vmf))
 		goto out;
@@ -4896,8 +4897,13 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 
 	swapcache = folio;
 	ret |= folio_lock_or_retry(folio, vmf);
-	if (ret & VM_FAULT_RETRY)
+	if (ret & VM_FAULT_RETRY) {
+		if (fault_flag_allow_retry_first(vmf->flags) &&
+		    !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT) &&
+		    (vmf->flags & FAULT_FLAG_VMA_LOCK))
+			retry_by_vma_lock = true;
 		goto out_release;
+	}
 
 	page = folio_file_page(folio, swp_offset(entry));
 	/*
@@ -5182,7 +5188,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 	}
 	if (si)
 		put_swap_device(si);
-	return ret;
+	return ret | (retry_by_vma_lock ? VM_FAULT_RETRY_VMA : 0);
 }
 
 static bool pte_range_none(pte_t *pte, int nr_pages)
-- 
2.39.3 (Apple Git-146)



^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox