Netdev List
 help / color / mirror / Atom feed
* [PATCH v5 7/9] rust: configfs: use `LocalModule` for `THIS_MODULE`
From: Alvin Sun @ 2026-06-24 12:57 UTC (permalink / raw)
  To: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Alice Ryhl, Trevor Gross,
	Danilo Krummrich, Luis Chamberlain, Petr Pavlu, Daniel Gomez,
	Sami Tolvanen, Aaron Tomlin, Greg Kroah-Hartman,
	Rafael J. Wysocki, David Airlie, Simona Vetter, Daniel Almeida,
	Arnd Bergmann, Brendan Higgins, David Gow, Rae Moar, Breno Leitao,
	Jens Axboe, Dave Ertman, Leon Romanovsky, Igor Korotin,
	FUJITA Tomonori, Bjorn Helgaas, Krzysztof Wilczyński,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas
  Cc: rust-for-linux, linux-modules, driver-core, dri-devel, nova-gpu,
	linux-kselftest, kunit-dev, linux-block, linux-kernel, netdev,
	linux-pci, Alvin Sun
In-Reply-To: <20260624-fix-fops-owner-v5-0-aa1cba242f05@linux.dev>

Replace the `THIS_MODULE` static reference in the `configfs_attrs!`
macro with `this_module::<LocalModule>()`, and update
rnull to import `LocalModule` instead of `THIS_MODULE`, consistent
with the move of `THIS_MODULE` into the `ModuleMetadata` trait.

Assisted-by: opencode:glm-5.2
Reviewed-by: Andreas Hindborg <a.hindborg@kernel.org>
Signed-off-by: Alvin Sun <alvin.sun@linux.dev>
---
 drivers/block/rnull/configfs.rs | 6 ++----
 rust/kernel/configfs.rs         | 8 +++++---
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/block/rnull/configfs.rs b/drivers/block/rnull/configfs.rs
index c10a55fc58948..b2547ad1e5ddd 100644
--- a/drivers/block/rnull/configfs.rs
+++ b/drivers/block/rnull/configfs.rs
@@ -1,9 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
-use super::{
-    NullBlkDevice,
-    THIS_MODULE, //
-};
+use super::NullBlkDevice;
+use crate::LocalModule;
 use kernel::{
     block::mq::gen_disk::{
         GenDisk,
diff --git a/rust/kernel/configfs.rs b/rust/kernel/configfs.rs
index 2339c6467325d..c31d7882e216d 100644
--- a/rust/kernel/configfs.rs
+++ b/rust/kernel/configfs.rs
@@ -875,7 +875,7 @@ fn as_ptr(&self) -> *const bindings::config_item_type {
 ///                 configfs::Subsystem<Configuration>,
 ///                 Configuration
 ///                 >::new_with_child_ctor::<N,Child>(
-///             &THIS_MODULE,
+///             ::kernel::module::this_module::<crate::LocalModule>(),
 ///             &CONFIGURATION_ATTRS
 ///         );
 ///
@@ -1021,7 +1021,8 @@ macro_rules! configfs_attrs {
 
                     static [< $data:upper _TPE >] : $crate::configfs::ItemType<$container, $data>  =
                         $crate::configfs::ItemType::<$container, $data>::new::<N>(
-                            &THIS_MODULE, &[<$ data:upper _ATTRS >]
+                            $crate::module::this_module::<LocalModule>(),
+                            &[<$ data:upper _ATTRS >]
                         );
                 )?
 
@@ -1030,7 +1031,8 @@ macro_rules! configfs_attrs {
                         $crate::configfs::ItemType<$container, $data>  =
                             $crate::configfs::ItemType::<$container, $data>::
                             new_with_child_ctor::<N, $child>(
-                                &THIS_MODULE, &[<$ data:upper _ATTRS >]
+                                $crate::module::this_module::<LocalModule>(),
+                                &[<$ data:upper _ATTRS >]
                             );
                 )?
 

-- 
2.43.0



^ permalink raw reply related

* [PATCH v5 5/9] rust: drm: set fops.owner from driver module pointer
From: Alvin Sun @ 2026-06-24 12:57 UTC (permalink / raw)
  To: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Alice Ryhl, Trevor Gross,
	Danilo Krummrich, Luis Chamberlain, Petr Pavlu, Daniel Gomez,
	Sami Tolvanen, Aaron Tomlin, Greg Kroah-Hartman,
	Rafael J. Wysocki, David Airlie, Simona Vetter, Daniel Almeida,
	Arnd Bergmann, Brendan Higgins, David Gow, Rae Moar, Breno Leitao,
	Jens Axboe, Dave Ertman, Leon Romanovsky, Igor Korotin,
	FUJITA Tomonori, Bjorn Helgaas, Krzysztof Wilczyński,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas
  Cc: rust-for-linux, linux-modules, driver-core, dri-devel, nova-gpu,
	linux-kselftest, kunit-dev, linux-block, linux-kernel, netdev,
	linux-pci, Alvin Sun
In-Reply-To: <20260624-fix-fops-owner-v5-0-aa1cba242f05@linux.dev>

Change `create_fops()` to accept an owner module pointer instead of
hardcoding `null_mut()`, ensuring the kernel correctly tracks the
module owning the DRM device's file operations.

Assisted-by: opencode:glm-5.2
Reviewed-by: Andreas Hindborg <a.hindborg@kernel.org>
Reviewed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Alvin Sun <alvin.sun@linux.dev>
---
 rust/kernel/drm/device.rs  | 3 ++-
 rust/kernel/drm/gem/mod.rs | 4 ++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/rust/kernel/drm/device.rs b/rust/kernel/drm/device.rs
index 403fc35353c74..d92cacb665366 100644
--- a/rust/kernel/drm/device.rs
+++ b/rust/kernel/drm/device.rs
@@ -111,7 +111,8 @@ impl<T: drm::Driver> Device<T> {
         fops: &Self::GEM_FOPS,
     };
 
-    const GEM_FOPS: bindings::file_operations = drm::gem::create_fops();
+    const GEM_FOPS: bindings::file_operations =
+        drm::gem::create_fops(crate::module::this_module::<T::OwnerModule>().as_ptr());
 
     /// Create a new `drm::Device` for a `drm::Driver`.
     pub fn new(dev: &device::Device, data: impl PinInit<T::Data, Error>) -> Result<ARef<Self>> {
diff --git a/rust/kernel/drm/gem/mod.rs b/rust/kernel/drm/gem/mod.rs
index 01b5bd47a3332..9a203efc59116 100644
--- a/rust/kernel/drm/gem/mod.rs
+++ b/rust/kernel/drm/gem/mod.rs
@@ -357,10 +357,10 @@ impl<T: DriverObject> AllocImpl for Object<T> {
     };
 }
 
-pub(super) const fn create_fops() -> bindings::file_operations {
+pub(super) const fn create_fops(owner: *mut bindings::module) -> bindings::file_operations {
     let mut fops: bindings::file_operations = pin_init::zeroed();
 
-    fops.owner = core::ptr::null_mut();
+    fops.owner = owner;
     fops.open = Some(bindings::drm_open);
     fops.release = Some(bindings::drm_release);
     fops.unlocked_ioctl = Some(bindings::drm_ioctl);

-- 
2.43.0



^ permalink raw reply related

* [PATCH v5 8/9] rust: binder: use `LocalModule` for `THIS_MODULE`
From: Alvin Sun @ 2026-06-24 12:57 UTC (permalink / raw)
  To: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Alice Ryhl, Trevor Gross,
	Danilo Krummrich, Luis Chamberlain, Petr Pavlu, Daniel Gomez,
	Sami Tolvanen, Aaron Tomlin, Greg Kroah-Hartman,
	Rafael J. Wysocki, David Airlie, Simona Vetter, Daniel Almeida,
	Arnd Bergmann, Brendan Higgins, David Gow, Rae Moar, Breno Leitao,
	Jens Axboe, Dave Ertman, Leon Romanovsky, Igor Korotin,
	FUJITA Tomonori, Bjorn Helgaas, Krzysztof Wilczyński,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas
  Cc: rust-for-linux, linux-modules, driver-core, dri-devel, nova-gpu,
	linux-kselftest, kunit-dev, linux-block, linux-kernel, netdev,
	linux-pci, Alvin Sun
In-Reply-To: <20260624-fix-fops-owner-v5-0-aa1cba242f05@linux.dev>

Replace the `THIS_MODULE` static reference in the binder fops with
`this_module::<LocalModule>()`, consistent with the move of
`THIS_MODULE` into the `ModuleMetadata` trait.

Assisted-by: opencode:glm-5.2
Reviewed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Alvin Sun <alvin.sun@linux.dev>
---
 drivers/android/binder/rust_binder_main.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/android/binder/rust_binder_main.rs b/drivers/android/binder/rust_binder_main.rs
index dc1941cd2407b..d6ceebbd5f94e 100644
--- a/drivers/android/binder/rust_binder_main.rs
+++ b/drivers/android/binder/rust_binder_main.rs
@@ -17,6 +17,7 @@
     bindings::{self, seq_file},
     fs::File,
     list::{ListArc, ListArcSafe, ListLinksSelfPtr, TryNewListArc},
+    module::this_module,
     prelude::*,
     seq_file::SeqFile,
     seq_print,
@@ -318,7 +319,7 @@ unsafe impl<T> Sync for AssertSync<T> {}
     let zeroed_ops = unsafe { core::mem::MaybeUninit::zeroed().assume_init() };
 
     let ops = kernel::bindings::file_operations {
-        owner: THIS_MODULE.as_ptr(),
+        owner: this_module::<LocalModule>().as_ptr(),
         poll: Some(rust_binder_poll),
         unlocked_ioctl: Some(rust_binder_ioctl),
         compat_ioctl: bindings::compat_ptr_ioctl,

-- 
2.43.0



^ permalink raw reply related

* [PATCH v5 6/9] rust: miscdevice: set fops.owner from driver module pointer
From: Alvin Sun @ 2026-06-24 12:57 UTC (permalink / raw)
  To: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Alice Ryhl, Trevor Gross,
	Danilo Krummrich, Luis Chamberlain, Petr Pavlu, Daniel Gomez,
	Sami Tolvanen, Aaron Tomlin, Greg Kroah-Hartman,
	Rafael J. Wysocki, David Airlie, Simona Vetter, Daniel Almeida,
	Arnd Bergmann, Brendan Higgins, David Gow, Rae Moar, Breno Leitao,
	Jens Axboe, Dave Ertman, Leon Romanovsky, Igor Korotin,
	FUJITA Tomonori, Bjorn Helgaas, Krzysztof Wilczyński,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas
  Cc: rust-for-linux, linux-modules, driver-core, dri-devel, nova-gpu,
	linux-kselftest, kunit-dev, linux-block, linux-kernel, netdev,
	linux-pci, Alvin Sun
In-Reply-To: <20260624-fix-fops-owner-v5-0-aa1cba242f05@linux.dev>

Set the miscdevice fops owner field from the driver module pointer
via the `this_module::<T::OwnerModule>()` helper, instead of
defaulting to null.

Assisted-by: opencode:glm-5.2
Reviewed-by: Andreas Hindborg <a.hindborg@kernel.org>
Reviewed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Alvin Sun <alvin.sun@linux.dev>
---
 rust/kernel/miscdevice.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/rust/kernel/miscdevice.rs b/rust/kernel/miscdevice.rs
index 83ce50def5ac9..2a4329f98614e 100644
--- a/rust/kernel/miscdevice.rs
+++ b/rust/kernel/miscdevice.rs
@@ -24,12 +24,13 @@
         IovIterSource, //
     },
     mm::virt::VmaNew,
+    module::this_module,
     prelude::*,
     seq_file::SeqFile,
     types::{
         ForeignOwnable,
         Opaque, //
-    },
+    }, //
 };
 use core::marker::PhantomData;
 
@@ -430,6 +431,7 @@ impl<T: MiscDevice> MiscdeviceVTable<T> {
         } else {
             None
         },
+        owner: this_module::<T::OwnerModule>().as_ptr(),
         ..pin_init::zeroed()
     };
 

-- 
2.43.0



^ permalink raw reply related

* [PATCH v5 9/9] rust: macros: remove `THIS_MODULE` static from `module!`
From: Alvin Sun @ 2026-06-24 12:57 UTC (permalink / raw)
  To: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Alice Ryhl, Trevor Gross,
	Danilo Krummrich, Luis Chamberlain, Petr Pavlu, Daniel Gomez,
	Sami Tolvanen, Aaron Tomlin, Greg Kroah-Hartman,
	Rafael J. Wysocki, David Airlie, Simona Vetter, Daniel Almeida,
	Arnd Bergmann, Brendan Higgins, David Gow, Rae Moar, Breno Leitao,
	Jens Axboe, Dave Ertman, Leon Romanovsky, Igor Korotin,
	FUJITA Tomonori, Bjorn Helgaas, Krzysztof Wilczyński,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas
  Cc: rust-for-linux, linux-modules, driver-core, dri-devel, nova-gpu,
	linux-kselftest, kunit-dev, linux-block, linux-kernel, netdev,
	linux-pci, Alvin Sun
In-Reply-To: <20260624-fix-fops-owner-v5-0-aa1cba242f05@linux.dev>

All users have been migrated to the `ModuleMetadata::THIS_MODULE` const
or the `this_module::<LocalModule>()` helper. The `static THIS_MODULE`
generated by the `module!` macro is no longer referenced anywhere,
so remove it to avoid having two sources of the same `ThisModule`
pointer.

Assisted-by: opencode:glm-5.2
Reviewed-by: Andreas Hindborg <a.hindborg@kernel.org>
Reviewed-by: Gary Guo <gary@garyguo.net>
Signed-off-by: Alvin Sun <alvin.sun@linux.dev>
---
 rust/macros/module.rs | 16 ----------------
 1 file changed, 16 deletions(-)

diff --git a/rust/macros/module.rs b/rust/macros/module.rs
index aa9a618d5d19e..23b6a1b456b80 100644
--- a/rust/macros/module.rs
+++ b/rust/macros/module.rs
@@ -497,22 +497,6 @@ pub(crate) fn module(info: ModuleInfo) -> Result<TokenStream> {
         /// Used by the printing macros, e.g. [`info!`].
         const __LOG_PREFIX: &[u8] = #name_cstr.to_bytes_with_nul();
 
-        // SAFETY: `__this_module` is constructed by the kernel at load time and will not be
-        // freed until the module is unloaded.
-        #[cfg(MODULE)]
-        static THIS_MODULE: ::kernel::ThisModule = unsafe {
-            extern "C" {
-                static __this_module: ::kernel::types::Opaque<::kernel::bindings::module>;
-            };
-
-            ::kernel::ThisModule::from_ptr(__this_module.get())
-        };
-
-        #[cfg(not(MODULE))]
-        static THIS_MODULE: ::kernel::ThisModule = unsafe {
-            ::kernel::ThisModule::from_ptr(::core::ptr::null_mut())
-        };
-
         /// The `LocalModule` type is the type of the module created by `module!`,
         /// `module_pci_driver!`, `module_platform_driver!`, etc.
         type LocalModule = #type_;

-- 
2.43.0



^ permalink raw reply related

* Re: [PATCH v12 07/12] static_call: Define EXPORT_STATIC_CALL_FOR_MODULES()
From: Sean Christopherson @ 2026-06-24 12:59 UTC (permalink / raw)
  To: Pawan Gupta
  Cc: x86, Jon Kohler, Nikolay Borisov, H. Peter Anvin, Josh Poimboeuf,
	David Kaplan, Borislav Petkov, Dave Hansen, Peter Zijlstra,
	Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, KP Singh,
	Jiri Olsa, David S. Miller, David Laight, Andy Lutomirski,
	Thomas Gleixner, Ingo Molnar, David Ahern, Martin KaFai Lau,
	Eduard Zingerman, Song Liu, Yonghong Song, John Fastabend,
	Stanislav Fomichev, Hao Luo, Paolo Bonzini, Jonathan Corbet,
	Jason Baron, Alice Ryhl, Steven Rostedt, Ard Biesheuvel,
	Shuah Khan, linux-kernel, kvm, Asit Mallick, Tao Zhang, bpf,
	netdev, linux-doc
In-Reply-To: <20260622-vmscape-bhb-v12-7-76cbda0ae3e5@linux.intel.com>

[-- Attachment #1: Type: text/plain, Size: 3538 bytes --]

On Tue, Jun 23, 2026, Pawan Gupta wrote:
> There is EXPORT_STATIC_CALL_TRAMP() that hides the static key from all
> modules. But there is no equivalent of EXPORT_SYMBOL_FOR_MODULES() to
> restrict symbol visibility to only certain modules.
> 
> Add EXPORT_STATIC_CALL_FOR_MODULES(name, mods) that wraps both the key and
> the trampoline with EXPORT_SYMBOL_FOR_MODULES(), allowing only a limited
> set of modules to see and update the static key.
> 
> The immediate user is KVM, in the following commit.
> 
> checkpatch reported below warnings with this change that I believe don't
> apply in this case:
> 
>   include/linux/static_call.h:219: WARNING: Non-declarative macros with multiple statements should be enclosed in a do - while loop
>   include/linux/static_call.h:220: WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable
> 
> Suggested-by: Peter Zijlstra <peterz@infradead.org>
> Signed-off-by: Pawan Gupta <pawan.kumar.gupta@linux.intel.com>
> ---
>  include/linux/static_call.h | 8 ++++++++
>  1 file changed, 8 insertions(+)
> 
> diff --git a/include/linux/static_call.h b/include/linux/static_call.h
> index 78a77a4ae0ea..b610afd1ed55 100644
> --- a/include/linux/static_call.h
> +++ b/include/linux/static_call.h
> @@ -216,6 +216,9 @@ extern long __static_call_return0(void);
>  #define EXPORT_STATIC_CALL_GPL(name)					\
>  	EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name));			\
>  	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
> +#define EXPORT_STATIC_CALL_FOR_MODULES(name, mods)			\
> +	EXPORT_SYMBOL_FOR_MODULES(STATIC_CALL_KEY(name), mods);		\
> +	EXPORT_SYMBOL_FOR_MODULES(STATIC_CALL_TRAMP(name), mods)
>  
>  /* Leave the key unexported, so modules can't change static call targets: */
>  #define EXPORT_STATIC_CALL_TRAMP(name)					\
> @@ -276,6 +279,9 @@ extern long __static_call_return0(void);
>  #define EXPORT_STATIC_CALL_GPL(name)					\
>  	EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name));			\
>  	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
> +#define EXPORT_STATIC_CALL_FOR_MODULES(name, mods)			\
> +	EXPORT_SYMBOL_FOR_MODULES(STATIC_CALL_KEY(name), mods);		\
> +	EXPORT_SYMBOL_FOR_MODULES(STATIC_CALL_TRAMP(name), mods)
>  
>  /* Leave the key unexported, so modules can't change static call targets: */
>  #define EXPORT_STATIC_CALL_TRAMP(name)					\
> @@ -346,6 +352,8 @@ static inline int static_call_text_reserved(void *start, void *end)
>  
>  #define EXPORT_STATIC_CALL(name)	EXPORT_SYMBOL(STATIC_CALL_KEY(name))
>  #define EXPORT_STATIC_CALL_GPL(name)	EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name))
> +#define EXPORT_STATIC_CALL_FOR_MODULES(name, mods)			\
> +	EXPORT_SYMBOL_FOR_MODULES(STATIC_CALL_KEY(name), mods)
>  
>  #endif /* CONFIG_HAVE_STATIC_CALL */

Drat, I forgot about this.  Exporting static call trampolines for KVM came up in
another conversation[*].  I had already put together patches to effectively default
to exporting only the trampoline, and also to deduplicate this code so that the
CONFIG_HAVE_STATIC_CALL_INLINE=y / CONFIG_HAVE_STATIC_CALL=y / CONFIG_HAVE_STATIC_CALL=n
implementations don't need to copy+paste the same lines of code.

The attached patches touch a lot more code, and will conflict mightily with KVM
changes I want to land in 7.3 (more use of a static_call in KVM).  But if we get
them applied (to tip tree) shortly after 7.2-rc1 and provide a topic branch/tag,
then there shouldn't be too much juggling needed?

If we want to go with the more aggressive cleanup, I'll formally post the patches.

[*] https://lore.kernel.org/all/ahhoDGUz39KSGZ6o@google.com

[-- Attachment #2: 0001-static_call-Add-stub-for-ARCH_ADD_TRAMP_KEY-if-not-p.patch --]
[-- Type: text/x-diff, Size: 1244 bytes --]

From 415eb214ed200ef82244468a0682ac884b14c051 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 28 May 2026 08:09:56 -0700
Subject: [PATCH 1/4] static_call: Add stub for ARCH_ADD_TRAMP_KEY if not
 provided by arch

Add a dummy #define for ARCH_ADD_TRAMP_KEY if one is not provided by arch
code so that EXPORT_STATIC_CALL_TRAMP{,_GPL} can be used in arch-neutral
code.

No functional change intended.

Fixes: 73f44fe19d35 ("static_call: Allow module use without exposing static_call_key")
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 include/linux/static_call.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index 78a77a4ae0ea..7539c82dd35f 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -210,6 +210,10 @@ extern long __static_call_return0(void);
 
 #define static_call_cond(name)	(void)__static_call(name)
 
+#ifndef ARCH_ADD_TRAMP_KEY
+#define ARCH_ADD_TRAMP_KEY(name)
+#endif
+
 #define EXPORT_STATIC_CALL(name)					\
 	EXPORT_SYMBOL(STATIC_CALL_KEY(name));				\
 	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))

base-commit: 9d4853b044beefa21c4ee3e18c40653601a64ced
-- 
2.55.0.rc0.799.gd6f94ed593-goog


[-- Attachment #3: 0002-KVM-x86-Don-t-export-static-call-keys-to-vendor-modu.patch --]
[-- Type: text/x-diff, Size: 2823 bytes --]

From 0166d0df57a43f20a24f5b75a6c34929221c1f30 Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 28 May 2026 07:01:36 -0700
Subject: [PATCH 2/4] KVM: x86: Don't export static call keys to vendor modules

Export only the trampoline, not the full trampoline+key, of KVM's static
calls that are also used by vendor modules, to harden KVM against unwanted
modifications of the static calls, i.e. to allow vendor code to invoke the
static call, but not redirect it.

Use static_call_mod() instead of the vanilla static_call(), which is
required by the objtool magic to glue things together when exporting only
the trampoline.

No functional change intended.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Closes: https://lore.kernel.org/all/20260528091357.GB343181@noisy.programming.kicks-ass.net
Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/include/asm/kvm_host.h | 2 +-
 arch/x86/kvm/pmu.h              | 2 +-
 arch/x86/kvm/x86.c              | 6 +++---
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index eee473717c0e..55d674c647e6 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -2061,7 +2061,7 @@ extern bool __read_mostly enable_ipiv;
 extern bool __read_mostly enable_device_posted_irqs;
 extern struct kvm_x86_ops kvm_x86_ops;
 
-#define kvm_x86_call(func) static_call(kvm_x86_##func)
+#define kvm_x86_call(func) static_call_mod(kvm_x86_##func)
 
 #define KVM_X86_OP(func) \
 	DECLARE_STATIC_CALL(kvm_x86_##func, *(((struct kvm_x86_ops *)0)->func));
diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h
index a5821d7c87f9..77cdee3e4aa6 100644
--- a/arch/x86/kvm/pmu.h
+++ b/arch/x86/kvm/pmu.h
@@ -54,7 +54,7 @@ struct kvm_pmu_ops {
 	const u32 MSR_STRIDE;
 };
 
-#define kvm_pmu_call(func) static_call(kvm_x86_pmu_##func)
+#define kvm_pmu_call(func) static_call_mod(kvm_x86_pmu_##func)
 
 #define KVM_X86_PMU_OP(func) \
 	DECLARE_STATIC_CALL(kvm_x86_pmu_##func, *(((struct kvm_pmu_ops *)0)->func));
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d9d51803b7b2..792f402f493f 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -148,9 +148,9 @@ struct kvm_x86_ops kvm_x86_ops __read_mostly;
 #define KVM_X86_OP_OPTIONAL KVM_X86_OP
 #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
 #include <asm/kvm-x86-ops.h>
-EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
-EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
-EXPORT_STATIC_CALL_GPL(kvm_x86_get_cpl);
+EXPORT_STATIC_CALL_TRAMP_GPL(kvm_x86_get_cs_db_l_bits);
+EXPORT_STATIC_CALL_TRAMP_GPL(kvm_x86_cache_reg);
+EXPORT_STATIC_CALL_TRAMP_GPL(kvm_x86_get_cpl);
 
 static bool __read_mostly ignore_msrs = 0;
 module_param(ignore_msrs, bool, 0644);
-- 
2.55.0.rc0.799.gd6f94ed593-goog


[-- Attachment #4: 0003-static_call-Restrict-exporting-of-static-call-key-to.patch --]
[-- Type: text/x-diff, Size: 10765 bytes --]

From 87068920d63a87235ae9a1e62529a4d897bd6c6c Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 28 May 2026 07:03:24 -0700
Subject: [PATCH 3/4] static_call: Restrict exporting of static call *key* to
 tracepoints

Rename the export macros for static call trampolines and keys so that the
"default" EXPORT_STATIC_CALL{,_GPL}() exports only the trampoline, and full
exports of trampoline+key pairs are restricted to tracepoints (by naming
convention).  Most developers are blissfully unaware of the gory details of
static calls, and so don't understand the implications of using the
innocuous-looking "vanilla" macros.

Effectively defaulting to exporting the key is undesirable as there is no
known use case for allowing a module to change an exported static call's
target, outside of tracepoints.

Opportunistically massage the macro magic to deduplicate the
CONFIG_HAVE_STATIC_CALL_INLINE=y vs. CONFIG_HAVE_STATIC_CALL=y vs.
CONFIG_HAVE_STATIC_CALL=n implementations.

No functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/events/amd/brs.c      |  2 +-
 arch/x86/kernel/apic/init.c    |  4 +--
 arch/x86/kernel/cpu/mshyperv.c |  2 +-
 arch/x86/kernel/traps.c        |  2 +-
 arch/x86/kvm/x86.c             |  6 ++--
 arch/x86/xen/enlighten.c       |  2 +-
 include/linux/static_call.h    | 59 +++++++++++++---------------------
 include/linux/tracepoint.h     |  4 +--
 kernel/sched/core.c            |  8 ++---
 9 files changed, 38 insertions(+), 51 deletions(-)

diff --git a/arch/x86/events/amd/brs.c b/arch/x86/events/amd/brs.c
index 06f35a6b58a5..b9a246989bd4 100644
--- a/arch/x86/events/amd/brs.c
+++ b/arch/x86/events/amd/brs.c
@@ -424,7 +424,7 @@ void noinstr perf_amd_brs_lopwr_cb(bool lopwr_in)
 }
 
 DEFINE_STATIC_CALL_NULL(perf_lopwr_cb, perf_amd_brs_lopwr_cb);
-EXPORT_STATIC_CALL_TRAMP_GPL(perf_lopwr_cb);
+EXPORT_STATIC_CALL_GPL(perf_lopwr_cb);
 
 void __init amd_brs_lopwr_init(void)
 {
diff --git a/arch/x86/kernel/apic/init.c b/arch/x86/kernel/apic/init.c
index 821e2e536f19..933b8d2d3af5 100644
--- a/arch/x86/kernel/apic/init.c
+++ b/arch/x86/kernel/apic/init.c
@@ -30,8 +30,8 @@ DEFINE_APIC_CALL(wakeup_secondary_cpu);
 DEFINE_APIC_CALL(wakeup_secondary_cpu_64);
 DEFINE_APIC_CALL(write);
 
-EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_mask);
-EXPORT_STATIC_CALL_TRAMP_GPL(apic_call_send_IPI_self);
+EXPORT_STATIC_CALL_GPL(apic_call_send_IPI_mask);
+EXPORT_STATIC_CALL_GPL(apic_call_send_IPI_self);
 
 /* The container for function call overrides */
 struct apic_override __x86_apic_override __initdata;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
index b5b6a58b67b0..9adfc12be1db 100644
--- a/arch/x86/kernel/cpu/mshyperv.c
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -333,7 +333,7 @@ static void __init x86_setup_ops_for_tsc_pg_clock(void)
 
 #ifdef CONFIG_X86_64
 DEFINE_STATIC_CALL(hv_hypercall, hv_std_hypercall);
-EXPORT_STATIC_CALL_TRAMP_GPL(hv_hypercall);
+EXPORT_STATIC_CALL_GPL(hv_hypercall);
 #define hypercall_update(hc) static_call_update(hv_hypercall, hc)
 #endif
 #endif /* CONFIG_HYPERV */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 0ca3912ecb7f..df05ad454414 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -218,7 +218,7 @@ static inline unsigned long pt_regs_val(struct pt_regs *regs, int nr)
 
 #ifdef HAVE_ARCH_BUG_FORMAT_ARGS
 DEFINE_STATIC_CALL(WARN_trap, __WARN_trap);
-EXPORT_STATIC_CALL_TRAMP(WARN_trap);
+EXPORT_STATIC_CALL(WARN_trap);
 
 /*
  * Create a va_list from an exception context.
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 792f402f493f..d9d51803b7b2 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -148,9 +148,9 @@ struct kvm_x86_ops kvm_x86_ops __read_mostly;
 #define KVM_X86_OP_OPTIONAL KVM_X86_OP
 #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
 #include <asm/kvm-x86-ops.h>
-EXPORT_STATIC_CALL_TRAMP_GPL(kvm_x86_get_cs_db_l_bits);
-EXPORT_STATIC_CALL_TRAMP_GPL(kvm_x86_cache_reg);
-EXPORT_STATIC_CALL_TRAMP_GPL(kvm_x86_get_cpl);
+EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
+EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
+EXPORT_STATIC_CALL_GPL(kvm_x86_get_cpl);
 
 static bool __read_mostly ignore_msrs = 0;
 module_param(ignore_msrs, bool, 0644);
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 23b91bf9b663..ec14d2017909 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -23,7 +23,7 @@
 #include "xen-ops.h"
 
 DEFINE_STATIC_CALL(xen_hypercall, xen_hypercall_hvm);
-EXPORT_STATIC_CALL_TRAMP(xen_hypercall);
+EXPORT_STATIC_CALL(xen_hypercall);
 
 /*
  * Pointer to the xen_vcpu_info structure or
diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index 7539c82dd35f..c2c667baf8fe 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -26,7 +26,7 @@
  *   static_call_update(name, func);
  *   static_call_query(name);
  *
- *   EXPORT_STATIC_CALL{,_TRAMP}{,_GPL}()
+ *   EXPORT_STATIC_CALL{,_GPL}()
  *
  * Usage example:
  *
@@ -121,14 +121,6 @@
  *   completely eliding any function call overhead.
  *
  *   Notably argument setup is unconditional.
- *
- *
- * EXPORT_STATIC_CALL() vs EXPORT_STATIC_CALL_TRAMP():
- *
- *   The difference is that the _TRAMP variant tries to only export the
- *   trampoline with the result that a module can use static_call{,_cond}() but
- *   not static_call_update().
- *
  */
 
 #include <linux/types.h>
@@ -214,19 +206,8 @@ extern long __static_call_return0(void);
 #define ARCH_ADD_TRAMP_KEY(name)
 #endif
 
-#define EXPORT_STATIC_CALL(name)					\
-	EXPORT_SYMBOL(STATIC_CALL_KEY(name));				\
-	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
-#define EXPORT_STATIC_CALL_GPL(name)					\
-	EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name));			\
-	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
-
-/* Leave the key unexported, so modules can't change static call targets: */
-#define EXPORT_STATIC_CALL_TRAMP(name)					\
-	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name));				\
-	ARCH_ADD_TRAMP_KEY(name)
-#define EXPORT_STATIC_CALL_TRAMP_GPL(name)				\
-	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name));			\
+#define __EXPORT_STATIC_CALL(name, scope)				\
+	EXPORT_SYMBOL##scope(STATIC_CALL_TRAMP(name));			\
 	ARCH_ADD_TRAMP_KEY(name)
 
 #elif defined(CONFIG_HAVE_STATIC_CALL)
@@ -274,18 +255,8 @@ static inline int static_call_text_reserved(void *start, void *end)
 
 extern long __static_call_return0(void);
 
-#define EXPORT_STATIC_CALL(name)					\
-	EXPORT_SYMBOL(STATIC_CALL_KEY(name));				\
-	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
-#define EXPORT_STATIC_CALL_GPL(name)					\
-	EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name));			\
-	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
-
-/* Leave the key unexported, so modules can't change static call targets: */
-#define EXPORT_STATIC_CALL_TRAMP(name)					\
-	EXPORT_SYMBOL(STATIC_CALL_TRAMP(name))
-#define EXPORT_STATIC_CALL_TRAMP_GPL(name)				\
-	EXPORT_SYMBOL_GPL(STATIC_CALL_TRAMP(name))
+#define __EXPORT_STATIC_CALL(name, scope)				\
+	EXPORT_SYMBOL##scope(STATIC_CALL_TRAMP(name))
 
 #else /* Generic implementation */
 
@@ -348,9 +319,25 @@ static inline int static_call_text_reserved(void *start, void *end)
 	return 0;
 }
 
-#define EXPORT_STATIC_CALL(name)	EXPORT_SYMBOL(STATIC_CALL_KEY(name))
-#define EXPORT_STATIC_CALL_GPL(name)	EXPORT_SYMBOL_GPL(STATIC_CALL_KEY(name))
+#define __EXPORT_STATIC_CALL(name, scope)
 
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
+#define __EXPORT_TRACEPOINT_STATIC_CALL(name, scope)			\
+	EXPORT_SYMBOL##scope(STATIC_CALL_KEY(name));			\
+	__EXPORT_STATIC_CALL(name, scope)
+#define EXPORT_TRACEPOINT_STATIC_CALL(name)				\
+	__EXPORT_TRACEPOINT_STATIC_CALL(name, )
+#define EXPORT_TRACEPOINT_STATIC_CALL_GPL(name)				\
+	__EXPORT_TRACEPOINT_STATIC_CALL(name, _GPL)
+
+/*
+ * For non-tracepoint usage, leave the key unexported, so modules can't change
+ * static call targets, i.e. can only invoke the static call.
+ */
+#define EXPORT_STATIC_CALL(name)					\
+	__EXPORT_STATIC_CALL(name, )
+#define EXPORT_STATIC_CALL_GPL(name)					\
+	__EXPORT_STATIC_CALL(name, _GPL)
+
 #endif /* _LINUX_STATIC_CALL_H */
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 2d2b9f8cdda4..1b64dcaf683e 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -423,12 +423,12 @@ static inline struct tracepoint *tracepoint_ptr_deref(tracepoint_ptr_t *p)
 	TRACEPOINT_CHECK(name)						\
 	EXPORT_SYMBOL_GPL(__tracepoint_##name);				\
 	EXPORT_SYMBOL_GPL(__traceiter_##name);				\
-	EXPORT_STATIC_CALL_GPL(tp_func_##name)
+	EXPORT_TRACEPOINT_STATIC_CALL_GPL(tp_func_##name)
 #define EXPORT_TRACEPOINT_SYMBOL(name)					\
 	TRACEPOINT_CHECK(name)						\
 	EXPORT_SYMBOL(__tracepoint_##name);				\
 	EXPORT_SYMBOL(__traceiter_##name);				\
-	EXPORT_STATIC_CALL(tp_func_##name)
+	EXPORT_TRACEPOINT_STATIC_CALL(tp_func_##name)
 
 
 #else /* !TRACEPOINTS_ENABLED */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b8871449d3c6..c4d0db00d036 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7403,7 +7403,7 @@ EXPORT_SYMBOL(preempt_schedule);
 #   define preempt_schedule_dynamic_disabled	NULL
 #  endif
 DEFINE_STATIC_CALL(preempt_schedule, preempt_schedule_dynamic_enabled);
-EXPORT_STATIC_CALL_TRAMP(preempt_schedule);
+EXPORT_STATIC_CALL(preempt_schedule);
 # elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
 static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule);
 void __sched notrace dynamic_preempt_schedule(void)
@@ -7476,7 +7476,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
 #   define preempt_schedule_notrace_dynamic_disabled	NULL
 #  endif
 DEFINE_STATIC_CALL(preempt_schedule_notrace, preempt_schedule_notrace_dynamic_enabled);
-EXPORT_STATIC_CALL_TRAMP(preempt_schedule_notrace);
+EXPORT_STATIC_CALL(preempt_schedule_notrace);
 # elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
 static DEFINE_STATIC_KEY_TRUE(sk_dynamic_preempt_schedule_notrace);
 void __sched notrace dynamic_preempt_schedule_notrace(void)
@@ -7723,12 +7723,12 @@ EXPORT_SYMBOL(__cond_resched);
 #  define cond_resched_dynamic_enabled	__cond_resched
 #  define cond_resched_dynamic_disabled	((void *)&__static_call_return0)
 DEFINE_STATIC_CALL_RET0(cond_resched, __cond_resched);
-EXPORT_STATIC_CALL_TRAMP(cond_resched);
+EXPORT_STATIC_CALL(cond_resched);
 
 #  define might_resched_dynamic_enabled	__cond_resched
 #  define might_resched_dynamic_disabled ((void *)&__static_call_return0)
 DEFINE_STATIC_CALL_RET0(might_resched, __cond_resched);
-EXPORT_STATIC_CALL_TRAMP(might_resched);
+EXPORT_STATIC_CALL(might_resched);
 # elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY)
 static DEFINE_STATIC_KEY_FALSE(sk_dynamic_cond_resched);
 int __sched dynamic_cond_resched(void)
-- 
2.55.0.rc0.799.gd6f94ed593-goog


[-- Attachment #5: 0004-static_call-Add-FOR_MODULES-static-call-exports-use-.patch --]
[-- Type: text/x-diff, Size: 4948 bytes --]

From 05623900bddabdc47e6797b72a610b1d4f825e1d Mon Sep 17 00:00:00 2001
From: Sean Christopherson <seanjc@google.com>
Date: Thu, 28 May 2026 07:32:41 -0700
Subject: [PATCH 4/4] static_call: Add FOR_MODULES static call exports, use 'em
 in KVM

Add EXPORT_STATIC_CALL_FOR_MODULES(), along with KVM-specific variants,
and use the KVM-internal variants to export KVM's internal static calls
only to KVM's vendor modules (if any exist).

For all intents and purposes, no functional change intended.

Signed-off-by: Sean Christopherson <seanjc@google.com>
---
 arch/x86/kvm/pmu.c          |  2 +-
 arch/x86/kvm/x86.c          |  6 +++---
 include/linux/kvm_types.h   |  8 ++++++++
 include/linux/static_call.h | 12 +++++++-----
 4 files changed, 19 insertions(+), 9 deletions(-)

diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c
index b92dd2e58335..7e4f6e5ff436 100644
--- a/arch/x86/kvm/pmu.c
+++ b/arch/x86/kvm/pmu.c
@@ -100,7 +100,7 @@ static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;
 #define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
 #define KVM_X86_PMU_OP_OPTIONAL_RET0 KVM_X86_PMU_OP
 #include <asm/kvm-x86-pmu-ops.h>
-EXPORT_STATIC_CALL_GPL(kvm_x86_pmu_pmc_is_disabled_in_current_mode);
+EXPORT_STATIC_CALL_FOR_KVM_INTERNAL(kvm_x86_pmu_pmc_is_disabled_in_current_mode);
 
 void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
 {
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index d9d51803b7b2..6df084d827b8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -148,9 +148,9 @@ struct kvm_x86_ops kvm_x86_ops __read_mostly;
 #define KVM_X86_OP_OPTIONAL KVM_X86_OP
 #define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
 #include <asm/kvm-x86-ops.h>
-EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
-EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);
-EXPORT_STATIC_CALL_GPL(kvm_x86_get_cpl);
+EXPORT_STATIC_CALL_FOR_KVM_INTERNAL(kvm_x86_get_cs_db_l_bits);
+EXPORT_STATIC_CALL_FOR_KVM_INTERNAL(kvm_x86_cache_reg);
+EXPORT_STATIC_CALL_FOR_KVM_INTERNAL(kvm_x86_get_cpl);
 
 static bool __read_mostly ignore_msrs = 0;
 module_param(ignore_msrs, bool, 0644);
diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h
index a568d8e6f4e8..3bf9d113b001 100644
--- a/include/linux/kvm_types.h
+++ b/include/linux/kvm_types.h
@@ -13,8 +13,14 @@
 	EXPORT_SYMBOL_FOR_MODULES(symbol, __stringify(KVM_SUB_MODULES))
 #define EXPORT_SYMBOL_FOR_KVM(symbol) \
 	EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm," __stringify(KVM_SUB_MODULES))
+#define EXPORT_STATIC_CALL_FOR_KVM_INTERNAL(symbol) \
+	EXPORT_STATIC_CALL_FOR_MODULES(symbol, __stringify(KVM_SUB_MODULES))
+#define EXPORT_STATIC_CALL_FOR_KVM(symbol) \
+	EXPORT_STATIC_CALL_FOR_MODULES(symbol, "kvm," __stringify(KVM_SUB_MODULES))
 #else
 #define EXPORT_SYMBOL_FOR_KVM_INTERNAL(symbol)
+#define EXPORT_STATIC_CALL_FOR_KVM_INTERNAL(symbol)
+
 /*
  * Allow architectures to provide a custom EXPORT_SYMBOL_FOR_KVM, but only
  * if there are no submodules, e.g. to allow suppressing exports if KVM=m, but
@@ -23,8 +29,10 @@
 #ifndef EXPORT_SYMBOL_FOR_KVM
 #if IS_MODULE(CONFIG_KVM)
 #define EXPORT_SYMBOL_FOR_KVM(symbol) EXPORT_SYMBOL_FOR_MODULES(symbol, "kvm")
+#define EXPORT_STATIC_CALL_FOR_KVM(symbol) EXPORT_STATIC_CALL_FOR_MODULES(symbol, "kvm")
 #else
 #define EXPORT_SYMBOL_FOR_KVM(symbol)
+#define EXPORT_STATIC_CALL_FOR_KVM(symbol)
 #endif /* IS_MODULE(CONFIG_KVM) */
 #endif /* EXPORT_SYMBOL_FOR_KVM */
 #endif
diff --git a/include/linux/static_call.h b/include/linux/static_call.h
index c2c667baf8fe..9b38f82b35c4 100644
--- a/include/linux/static_call.h
+++ b/include/linux/static_call.h
@@ -206,8 +206,8 @@ extern long __static_call_return0(void);
 #define ARCH_ADD_TRAMP_KEY(name)
 #endif
 
-#define __EXPORT_STATIC_CALL(name, scope)				\
-	EXPORT_SYMBOL##scope(STATIC_CALL_TRAMP(name));			\
+#define __EXPORT_STATIC_CALL(name, scope, ...)				\
+	EXPORT_SYMBOL##scope(STATIC_CALL_TRAMP(name), ##__VA_ARGS__);	\
 	ARCH_ADD_TRAMP_KEY(name)
 
 #elif defined(CONFIG_HAVE_STATIC_CALL)
@@ -255,8 +255,8 @@ static inline int static_call_text_reserved(void *start, void *end)
 
 extern long __static_call_return0(void);
 
-#define __EXPORT_STATIC_CALL(name, scope)				\
-	EXPORT_SYMBOL##scope(STATIC_CALL_TRAMP(name))
+#define __EXPORT_STATIC_CALL(name, scope, ...)				\
+	EXPORT_SYMBOL##scope(STATIC_CALL_TRAMP(name), ##__VA_ARGS__)
 
 #else /* Generic implementation */
 
@@ -319,7 +319,7 @@ static inline int static_call_text_reserved(void *start, void *end)
 	return 0;
 }
 
-#define __EXPORT_STATIC_CALL(name, scope)
+#define __EXPORT_STATIC_CALL(name, scope, ...)
 
 #endif /* CONFIG_HAVE_STATIC_CALL */
 
@@ -339,5 +339,7 @@ static inline int static_call_text_reserved(void *start, void *end)
 	__EXPORT_STATIC_CALL(name, )
 #define EXPORT_STATIC_CALL_GPL(name)					\
 	__EXPORT_STATIC_CALL(name, _GPL)
+#define EXPORT_STATIC_CALL_FOR_MODULES(name, mods)				\
+	__EXPORT_STATIC_CALL(name, _FOR_MODULES, mods)
 
 #endif /* _LINUX_STATIC_CALL_H */
-- 
2.55.0.rc0.799.gd6f94ed593-goog


^ permalink raw reply related

* Re: [PATCH v3 0/7] Prepare mutable list iterators to cache cursor state
From: Kaitao Cheng @ 2026-06-24 12:58 UTC (permalink / raw)
  To: David Hildenbrand (Arm), Alexei Starovoitov
  Cc: Andrew Morton, Jens Axboe, Tejun Heo, Alexander Viro,
	Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Johannes Weiner, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Thomas Gleixner,
	Juri Lelli, Vincent Guittot, Paul Moore, Andy Shevchenko,
	Paul E. McKenney, Shakeel Butt, Christian König,
	David Howells, Simona Vetter, Randy Dunlap, Luca Ceresoli,
	Philipp Stanner, linux-block, LKML,
	open list:CONTROL GROUP (CGROUP), linux-ntfs-dev, Linux-Fsdevel,
	io-uring, audit, bpf, Network Development, dri-devel,
	linux-perf-use., linux-trace-kernel, kexec, live-patching,
	linux-modules, Linux Crypto Mailing List, Linux Power Management,
	rcu, sched-ext, linux-mm, virtualization, damon,
	clang-built-linux, chengkaitao
In-Reply-To: <8f98a3a6-f97b-4673-964f-fb09c8879e2e@kernel.org>



在 2026/6/22 19:27, David Hildenbrand (Arm) 写道:
> On 6/22/26 07:28, Alexei Starovoitov wrote:
>> On Sun, Jun 21, 2026 at 9:06 PM Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
>>>
>>> From: chengkaitao <chengkaitao@kylinos.cn>
>>>
>>> The list_for_each*_safe() helpers are used when the loop body may remove
>>> the current entry.  Their current interface, however, forces every caller
>>> to define a temporary cursor outside the macro and pass it in, even when
>>> the caller never uses that cursor directly.  For most call sites this
>>> extra cursor is just boilerplate required by the macro implementation.
>>>
>>> This is awkward because the saved next pointer is an internal detail of
>>> the iteration.  Callers that only remove or move the current entry do not
>>> need to spell it out.
>>>
>>> The _safe() suffix has also caused confusion.  Christian Koenig pointed
>>> out that the name is easy to read as a thread-safe variant, especially
>>> for beginners, even though it only means that the iterator keeps enough
>>> state to tolerate removal of the current entry.  He suggested _mutable()
>>> as a clearer description of what the loop permits.
>>>
>>> Add *_mutable() iterator variants for list, hlist and llist.  The new
>>> helpers are variadic and support both forms.  In the common case, the
>>> caller omits the temporary cursor and the macro creates a unique internal
>>> cursor with typeof(pos) and __UNIQUE_ID().  If a loop really needs an
>>> explicit temporary cursor, the caller can still pass it and the helper
>>> keeps the existing *_safe() behaviour.
>>>
>>> For example, a call site may use the shorter form:
>>>
>>>   list_for_each_entry_mutable(pos, head, member)
>>>
>>> or keep the explicit temporary cursor form:
>>>
>>>   list_for_each_entry_mutable(pos, tmp, head, member)
>>>
>>> The existing *_safe() helpers remain available for compatibility.  This
>>> series only converts users in mm, block, kernel, init and io_uring.  If
>>> this approach looks acceptable, the remaining users can be converted in
>>> follow-up series.
>>>
>>> Changes in v3 (Christian König, Andy Shevchenko):
>>> - Convert safe list walks to mutable iterators
>>>
>>> Changes in v2 (Muchun Song, Andy Shevchenko):
>>> - Drop the list_for_each_entry_mutable*() helpers from v1 and make the
>>>   cursor change directly in the existing list_for_each_entry*() helpers.
>>> - Open-code special list walks that rely on updating the loop cursor in
>>>   the body, preserving their existing traversal semantics.
>>>
>>> Link to v2:
>>> https://lore.kernel.org/all/20260609061347.93688-1-kaitao.cheng@linux.dev/
>>>
>>> Link to v1:
>>> https://lore.kernel.org/all/20260529082149.76764-1-kaitao.cheng@linux.dev/
>>>
>>> Kaitao Cheng (7):
>>>   list: Add mutable iterator variants
>>>   llist: Add mutable iterator variants
>>>   mm: Use mutable list iterators
>>>   block: Use mutable list iterators
>>>   kernel: Use mutable list iterators
>>>   initramfs: Use mutable list iterator
>>>   io_uring: Use mutable list iterators
>>>
>>>  block/bfq-iosched.c                 |  17 +-
>>>  block/blk-cgroup.c                  |  12 +-
>>>  block/blk-flush.c                   |   4 +-
>>>  block/blk-iocost.c                  |  18 +-
>>>  block/blk-mq.c                      |   8 +-
>>>  block/blk-throttle.c                |   4 +-
>>>  block/kyber-iosched.c               |   4 +-
>>>  block/partitions/ldm.c              |   8 +-
>>>  block/sed-opal.c                    |   4 +-
>>>  include/linux/list.h                | 269 ++++++++++++++++++++++++----
>>>  include/linux/llist.h               |  81 +++++++--
>>>  init/initramfs.c                    |   5 +-
>>>  io_uring/cancel.c                   |   6 +-
>>>  io_uring/poll.c                     |   3 +-
>>>  io_uring/rw.c                       |   4 +-
>>>  io_uring/timeout.c                  |   8 +-
>>>  io_uring/uring_cmd.c                |   3 +-
>>>  kernel/audit_tree.c                 |   4 +-
>>>  kernel/audit_watch.c                |  16 +-
>>>  kernel/auditfilter.c                |   4 +-
>>>  kernel/auditsc.c                    |   4 +-
>>>  kernel/bpf/arena.c                  |  10 +-
>>>  kernel/bpf/arraymap.c               |   8 +-
>>>  kernel/bpf/bpf_local_storage.c      |   3 +-
>>>  kernel/bpf/bpf_lru_list.c           |  25 ++-
>>>  kernel/bpf/btf.c                    |  18 +-
>>>  kernel/bpf/cgroup.c                 |   7 +-
>>>  kernel/bpf/cpumap.c                 |   4 +-
>>>  kernel/bpf/devmap.c                 |  10 +-
>>>  kernel/bpf/helpers.c                |   8 +-
>>>  kernel/bpf/local_storage.c          |   4 +-
>>>  kernel/bpf/memalloc.c               |  16 +-
>>>  kernel/bpf/offload.c                |   8 +-
>>>  kernel/bpf/states.c                 |   4 +-
>>>  kernel/bpf/stream.c                 |   4 +-
>>>  kernel/bpf/verifier.c               |   6 +-
>>>  kernel/cgroup/cgroup-v1.c           |   4 +-
>>>  kernel/cgroup/cgroup.c              |  54 +++---
>>>  kernel/cgroup/dmem.c                |  12 +-
>>>  kernel/cgroup/rdma.c                |   8 +-
>>>  kernel/events/core.c                |  44 +++--
>>>  kernel/events/uprobes.c             |  12 +-
>>>  kernel/exit.c                       |   8 +-
>>>  kernel/fail_function.c              |   4 +-
>>>  kernel/gcov/clang.c                 |   4 +-
>>>  kernel/irq_work.c                   |   4 +-
>>>  kernel/kexec_core.c                 |   4 +-
>>>  kernel/kprobes.c                    |  16 +-
>>>  kernel/livepatch/core.c             |   4 +-
>>>  kernel/livepatch/core.h             |   4 +-
>>>  kernel/liveupdate/kho_block.c       |   4 +-
>>>  kernel/liveupdate/luo_flb.c         |   4 +-
>>>  kernel/locking/rwsem.c              |   2 +-
>>>  kernel/locking/test-ww_mutex.c      |   2 +-
>>>  kernel/module/main.c                |  11 +-
>>>  kernel/padata.c                     |   4 +-
>>>  kernel/power/snapshot.c             |   8 +-
>>>  kernel/power/wakelock.c             |   4 +-
>>>  kernel/printk/printk.c              |  11 +-
>>>  kernel/ptrace.c                     |   4 +-
>>>  kernel/rcu/rcutorture.c             |   3 +-
>>>  kernel/rcu/tasks.h                  |   9 +-
>>>  kernel/rcu/tree.c                   |   6 +-
>>>  kernel/resource.c                   |   4 +-
>>>  kernel/sched/core.c                 |   4 +-
>>>  kernel/sched/ext.c                  |  22 +--
>>>  kernel/sched/fair.c                 |  28 +--
>>>  kernel/sched/topology.c             |   4 +-
>>>  kernel/sched/wait.c                 |   4 +-
>>>  kernel/seccomp.c                    |   4 +-
>>>  kernel/signal.c                     |  11 +-
>>>  kernel/smp.c                        |   4 +-
>>>  kernel/taskstats.c                  |   8 +-
>>>  kernel/time/clockevents.c           |   6 +-
>>>  kernel/time/clocksource.c           |   4 +-
>>>  kernel/time/posix-cpu-timers.c      |   4 +-
>>>  kernel/time/posix-timers.c          |   3 +-
>>>  kernel/torture.c                    |   3 +-
>>>  kernel/trace/bpf_trace.c            |   4 +-
>>>  kernel/trace/ftrace.c               |  49 +++--
>>>  kernel/trace/ring_buffer.c          |  25 ++-
>>>  kernel/trace/trace.c                |  12 +-
>>>  kernel/trace/trace_dynevent.c       |   6 +-
>>>  kernel/trace/trace_dynevent.h       |   5 +-
>>>  kernel/trace/trace_events.c         |  35 ++--
>>>  kernel/trace/trace_events_filter.c  |   4 +-
>>>  kernel/trace/trace_events_hist.c    |   8 +-
>>>  kernel/trace/trace_events_trigger.c |  17 +-
>>>  kernel/trace/trace_events_user.c    |  16 +-
>>>  kernel/trace/trace_stat.c           |   4 +-
>>>  kernel/user-return-notifier.c       |   3 +-
>>>  kernel/workqueue.c                  |  16 +-
>>>  mm/backing-dev.c                    |   8 +-
>>>  mm/balloon.c                        |   8 +-
>>>  mm/cma.c                            |   4 +-
>>>  mm/compaction.c                     |   4 +-
>>>  mm/damon/core.c                     |   4 +-
>>>  mm/damon/sysfs-schemes.c            |   4 +-
>>>  mm/dmapool.c                        |   4 +-
>>>  mm/huge_memory.c                    |   8 +-
>>>  mm/hugetlb.c                        |  56 +++---
>>>  mm/hugetlb_vmemmap.c                |  16 +-
>>>  mm/khugepaged.c                     |  14 +-
>>>  mm/kmemleak.c                       |   7 +-
>>>  mm/ksm.c                            |  25 +--
>>>  mm/list_lru.c                       |   4 +-
>>>  mm/memcontrol-v1.c                  |   8 +-
>>>  mm/memory-failure.c                 |  12 +-
>>>  mm/memory-tiers.c                   |   4 +-
>>>  mm/migrate.c                        |  23 ++-
>>>  mm/mmu_notifier.c                   |   9 +-
>>>  mm/page_alloc.c                     |   8 +-
>>>  mm/page_reporting.c                 |   2 +-
>>>  mm/percpu.c                         |  11 +-
>>>  mm/pgtable-generic.c                |   4 +-
>>>  mm/rmap.c                           |  10 +-
>>>  mm/shmem.c                          |   9 +-
>>>  mm/slab_common.c                    |  14 +-
>>>  mm/slub.c                           |  33 ++--
>>>  mm/swapfile.c                       |   4 +-
>>>  mm/userfaultfd.c                    |  12 +-
>>>  mm/vmalloc.c                        |  24 +--
>>>  mm/vmscan.c                         |   7 +-
>>>  mm/zsmalloc.c                       |   4 +-
>>>  124 files changed, 875 insertions(+), 681 deletions(-)
>>
>> Not sure what you were thinking, but this diff stat
>> is not landable.
> 
> Agreed. If we decide we want this, I guess we should target per-subsystem
> conversions.
> 
> If this goes through the MM tree, I would even appreciate doing this on a per-MM
> component granularity.
> 
> (unless we have some magic "Linus converts all of them" script, which I doubt we
> will have)

I strongly agree with the point above.

> Is there a way forward to replace list_for_each_*_safe entirely, possibly just
> reusing the old name but simplifying the parameters?

David Laight, Christian König, and Jani Nikula do not agree with using
clever macro syntax to support both calling forms at the same time,
so for now it is not possible to keep the original macro name and only
simplify the parameter. I may revert to the v1 version and ask everyone
for their opinions again.

-- 
Thanks
Kaitao Cheng


^ permalink raw reply

* Re: [PATCH v3 0/7] Prepare mutable list iterators to cache cursor state
From: Kaitao Cheng @ 2026-06-24 13:05 UTC (permalink / raw)
  To: Jani Nikula, Andrew Morton, David Hildenbrand, Jens Axboe,
	Tejun Heo, Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt,
	Christian König
  Cc: David Howells, Simona Vetter, Randy Dunlap, Luca Ceresoli,
	Philipp Stanner, linux-block, linux-kernel, cgroups,
	linux-ntfs-dev, linux-fsdevel, io-uring, audit, bpf, netdev,
	dri-devel, linux-perf-users, linux-trace-kernel, kexec,
	live-patching, linux-modules, linux-crypto, linux-pm, rcu,
	sched-ext, linux-mm, virtualization, damon, llvm, chengkaitao
In-Reply-To: <88f34c7fa5a3d1700cc8005818751d6aa31f09df@intel.com>



在 2026/6/22 16:37, Jani Nikula 写道:
> On Mon, 22 Jun 2026, Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
>> Add *_mutable() iterator variants for list, hlist and llist.  The new
>> helpers are variadic and support both forms.  In the common case, the
>> caller omits the temporary cursor and the macro creates a unique internal
>> cursor with typeof(pos) and __UNIQUE_ID().  If a loop really needs an
>> explicit temporary cursor, the caller can still pass it and the helper
>> keeps the existing *_safe() behaviour.
>>
>> For example, a call site may use the shorter form:
>>
>>   list_for_each_entry_mutable(pos, head, member)
>>
>> or keep the explicit temporary cursor form:
>>
>>   list_for_each_entry_mutable(pos, tmp, head, member)
> 
> I'm unconvinced it's a good idea to allow two forms with macro trickery,
> *especially* when it's not the last argument you can omit. I think it's
> a footgun.
> 
> IMO stick with the first form only, and there'll always be the _safe
> variant that can be used when the temp pointer is needed.

Could we go back to the v1 version? What do you think of that
implementation approach?

https://lore.kernel.org/all/20260529082149.76764-1-kaitao.cheng@linux.dev/

-- 
Thanks
Kaitao Cheng


^ permalink raw reply

* Re: [PATCH net-next v1] ethernet: 3c509: Improve style of pnp_device_id array terminator
From: Maciej W. Rozycki @ 2026-06-24 13:10 UTC (permalink / raw)
  To: Uwe Kleine-König (The Capable Hub)
  Cc: Andrew Lunn, David S. Miller, Eric Dumazet, Jakub Kicinski,
	Paolo Abeni, netdev, linux-kernel
In-Reply-To: <a0cd057e6a24b9d355b5e4bdfcdb812cdd1e4652.1781082923.git.u.kleine-koenig@baylibre.com>

On Wed, 10 Jun 2026, Uwe Kleine-König (The Capable Hub) wrote:

> To match what device-id array terminators look like for other device
> types, drop `.id = ""` from it and let the compiler take care of zeroing
> the entry.
> 
> There are no changes in the compiled drivers, only the source looks
> nicer.
> 
> Signed-off-by: Uwe Kleine-König (The Capable Hub) <u.kleine-koenig@baylibre.com>
> ---
> Hello,
> 
> I'm currently working on changing various *_device_id definitions.
> This patch is irrelevant for this quest and a pure style update for
> consistency reasons without further dependencies on it. I just stumbled
> over this while working on that quest.
> 
> So if you don't like this patch, I won't insist.

 Absolutely fine with me, though I can see it's been taken already in my 
absence.  So for the record only:

Acked-by: Maciej W. Rozycki <macro@orcam.me.uk>

 Thanks,

  Maciej

^ permalink raw reply

* Re: [PATCH v3 1/7] list: Add mutable iterator variants
From: Kaitao Cheng @ 2026-06-24 13:14 UTC (permalink / raw)
  To: David Laight
  Cc: Andrew Morton, David Hildenbrand, Jens Axboe, Tejun Heo,
	Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt,
	Christian König, David Howells, Simona Vetter, Randy Dunlap,
	Luca Ceresoli, Philipp Stanner, linux-block, linux-kernel,
	cgroups, linux-ntfs-dev, linux-fsdevel, io-uring, audit, bpf,
	netdev, dri-devel, linux-perf-users, linux-trace-kernel, kexec,
	live-patching, linux-modules, linux-crypto, linux-pm, rcu,
	sched-ext, linux-mm, virtualization, damon, llvm, Kaitao Cheng
In-Reply-To: <20260622094242.64531b9a@pumpkin>



在 2026/6/22 16:42, David Laight 写道:
> On Mon, 22 Jun 2026 12:05:31 +0800
> Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
> 
>> From: Kaitao Cheng <chengkaitao@kylinos.cn>
>>
>> The list_for_each*_safe() helpers are used when the loop body may
>> remove the current entry.  Their API exposes the temporary cursor at
>> every call site, even though most users only need it for the iterator
>> implementation and never reference it in the loop body.
>>
>> Add *_mutable() variants for list and hlist iteration.  The new helpers
>> support both forms: callers may keep passing an explicit temporary cursor
>> when they need to inspect or reset it, or omit it and let the helper use
>> a unique internal cursor.
> 
> I'm not really sure 'mutable' means anything either.
> It is possible to make it valid for the loop body (or even other threads)
> to delete arbitrary list items - but that needs significant extra overheads.
> 
> It might be worth doing something that doesn't need the extra variable,
> but there is little point doing all the churn just to rename things.
> 
>>
>> This makes call sites that only mutate the list through the current entry
>> less noisy, while keeping the existing *_safe() helpers available for
>> compatibility.
>>
>> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
>> ---
>>  include/linux/list.h | 269 +++++++++++++++++++++++++++++++++++++------
>>  1 file changed, 231 insertions(+), 38 deletions(-)
>>
>> diff --git a/include/linux/list.h b/include/linux/list.h
>> index 09d979976b3b..1081def7cea9 100644
>> --- a/include/linux/list.h
>> +++ b/include/linux/list.h
>> @@ -7,6 +7,7 @@
>>  #include <linux/stddef.h>
>>  #include <linux/poison.h>
>>  #include <linux/const.h>
>> +#include <linux/args.h>
>>  
>>  #include <asm/barrier.h>
>>  
>> @@ -763,28 +764,72 @@ static inline void list_splice_tail_init(struct list_head *list,
>>  #define list_for_each_prev(pos, head) \
>>  	for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)
>>  
>> -/**
>> - * list_for_each_safe - iterate over a list safe against removal of list entry
>> - * @pos:	the &struct list_head to use as a loop cursor.
>> - * @n:		another &struct list_head to use as temporary storage
>> - * @head:	the head for your list.
>> +/*
>> + * list_for_each_safe is an old interface, use list_for_each_mutable instead.
>>   */
>>  #define list_for_each_safe(pos, n, head) \
>>  	for (pos = (head)->next, n = pos->next; \
>>  	     !list_is_head(pos, (head)); \
>>  	     pos = n, n = pos->next)
>>  
>> +#define __list_for_each_mutable_internal(pos, tmp, head)		\
>> +	for (typeof(pos) tmp = (pos = (head)->next)->next;		\
> 
> Use auto
> 
>> +	     !list_is_head(pos, (head));				\
>> +	     pos = tmp, tmp = pos->next)
>> +
>> +#define __list_for_each_mutable1(pos, head)				\
>> +	__list_for_each_mutable_internal(pos, __UNIQUE_ID(next), head)
>> +
>> +#define __list_for_each_mutable2(pos, next, head)			\
>> +	list_for_each_safe(pos, next, head)
>> +
>>  /**
>> - * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
>> + * list_for_each_mutable - iterate over a list safe against entry removal
>>   * @pos:	the &struct list_head to use as a loop cursor.
>> - * @n:		another &struct list_head to use as temporary storage
>> - * @head:	the head for your list.
>> + * @...:	either (head) or (next, head)
>> + *
>> + * next:	another &struct list_head to use as optional temporary storage.
>> + *		The temporary cursor is internal unless explicitly supplied by
>> + *		the caller.
>> + * head:	the head for your list.
>> + */
>> +#define list_for_each_mutable(pos, ...)					\
>> +	CONCATENATE(__list_for_each_mutable, COUNT_ARGS(__VA_ARGS__))	\
>> +		(pos, __VA_ARGS__)
> 
> The variable argument count logic really just slows down compilation.
> Maybe there aren't enough copies of this code to make that significant.
> But just because you can do it doesn't mean it is a good idea.
> I'm also not sure it really adds anything to the readability.
> 
> And, if you are going to make the middle argument optional there is
> no need to change the macro name.

Christian König and Jani Nikula also disagree with the variadic-argument
implementation approach. If we abandon that method, it means we will
inevitably need to add some new macros. If mutable is not a good name,
suggestions for better alternatives would be welcome; coming up with a
suitable name is indeed rather tricky.

-- 
Thanks
Kaitao Cheng


^ permalink raw reply

* Re: [PATCH v5 0/9] Fix missing fops.owner in Rust DRM/misc abstractions
From: Danilo Krummrich @ 2026-06-24 13:22 UTC (permalink / raw)
  To: Alvin Sun
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Alice Ryhl, Trevor Gross,
	Luis Chamberlain, Petr Pavlu, Daniel Gomez, Sami Tolvanen,
	Aaron Tomlin, Greg Kroah-Hartman, Rafael J. Wysocki, David Airlie,
	Simona Vetter, Daniel Almeida, Arnd Bergmann, Brendan Higgins,
	David Gow, Rae Moar, Breno Leitao, Jens Axboe, Dave Ertman,
	Leon Romanovsky, Igor Korotin, FUJITA Tomonori, Bjorn Helgaas,
	Krzysztof Wilczyński, Arve Hjønnevåg, Todd Kjos,
	Christian Brauner, Carlos Llamas, rust-for-linux, linux-modules,
	driver-core, dri-devel, nova-gpu, linux-kselftest, kunit-dev,
	linux-block, linux-kernel, netdev, linux-pci
In-Reply-To: <20260624-fix-fops-owner-v5-0-aa1cba242f05@linux.dev>

On Wed Jun 24, 2026 at 2:57 PM CEST, Alvin Sun wrote:
> The series moves `THIS_MODULE` into the `ModuleMetadata` trait as a const, threads it
> through `#[vtable]` to set `fops.owner` in DRM/miscdevice, and updates configfs
> and rnull to use `this_module::<LocalModule>()`.

Acked-by: Danilo Krummrich <dakr@kernel.org>

^ permalink raw reply

* Re: [PATCH v5 0/9] Fix missing fops.owner in Rust DRM/misc abstractions
From: Petr Pavlu @ 2026-06-24 13:23 UTC (permalink / raw)
  To: Alvin Sun
  Cc: Miguel Ojeda, Boqun Feng, Gary Guo, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Alice Ryhl, Trevor Gross,
	Danilo Krummrich, Luis Chamberlain, Daniel Gomez, Sami Tolvanen,
	Aaron Tomlin, Greg Kroah-Hartman, Rafael J. Wysocki, David Airlie,
	Simona Vetter, Daniel Almeida, Arnd Bergmann, Brendan Higgins,
	David Gow, Rae Moar, Breno Leitao, Jens Axboe, Dave Ertman,
	Leon Romanovsky, Igor Korotin, FUJITA Tomonori, Bjorn Helgaas,
	Krzysztof Wilczyński, Arve Hjønnevåg, Todd Kjos,
	Christian Brauner, Carlos Llamas, rust-for-linux, linux-modules,
	driver-core, dri-devel, nova-gpu, linux-kselftest, kunit-dev,
	linux-block, linux-kernel, netdev, linux-pci
In-Reply-To: <20260624-fix-fops-owner-v5-0-aa1cba242f05@linux.dev>

On 6/24/26 2:57 PM, Alvin Sun wrote:
> During tyr debugfs development, a kernel NULL pointer dereference was
> encountered after `rmmod tyr` while gnome-shell still held /dev/card1 open:
> 
> ```
>   [158827.868132] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000
>   [158827.868918] Mem abort info:
>   [158827.869177]   ESR = 0x0000000086000004
>   [158827.869519]   EC = 0x21: IABT (current EL), IL = 32 bits
>   [158827.870000]   SET = 0, FnV = 0
>   [158827.870281]   EA = 0, S1PTW = 0
>   [158827.870571]   FSC = 0x04: level 0 translation fault
>   [158827.871043] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000108dec000
>   [158827.871623] [0000000000000000] pgd=0000000000000000, p4d=0000000000000000
>   [158827.872242] Internal error: Oops: 0000000086000004 [#1]  SMP
>   [158827.872246] Modules linked in: tyr sunrpc snd_soc_simple_card rk805_pwrkey snd_soc_simple_card_utils rtw88_8822bu display_connector rtw88_usb rtw88_8822b snd_soc_rockchip_i2s_tdm snd_soc_hdmi_codec
>   rtw88_core]
>   [158827.872337] CPU: 4 UID: 1000 PID: 11276 Comm: gnome-s:disk$0 Tainted: G                 N  7.1.0-rc1+ #331 PREEMPT
>   [158827.880534] Tainted: [N]=TEST
>   [158827.880535] Hardware name: FriendlyElec NanoPi R6C/NanoPi R6C, BIOS v1.1 04/09/2025
>   [158827.880538] pstate: 60400009 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
>   [158827.880542] pc : 0x0
>   [158827.880547] lr : _RNvNtCs257m05FHVbX_3tyr2vm8pt_unmap+0x8c/0x12c [tyr]
>   [158827.880578] sp : ffff800083c236b0
>   [158827.880579] x29: ffff800083c236d0 x28: ffff00013f8a0000 x27: 0000000000000000
>   [158827.880585] x26: 000000000000007c x25: ffff000108e6ed80 x24: 0000000000401000
>   [158827.880590] x23: 0000000000000000 x22: 0000000040000000 x21: 0000000000001000
>   [158827.880595] x20: ffff00010f778138 x19: 0000000000400000 x18: 00000000ffffffff
>   [158827.880600] x17: 000000040044ffff x16: 045000f2b5503510 x15: 0720072007200720
>   [158827.880606] x14: 0720072007200720 x13: 0000000000401000 x12: 0000000000400000
>   [158827.880611] x11: ffff800083c239d0 x10: ffff000141e4fd88 x9 : 0000000000000000
>   [158827.880615] x8 : 0000000000000000 x7 : 0000000000000000 x6 : 0000000000400000
>   [158827.880620] x5 : ffff00013f8a0000 x4 : 0000000000000000 x3 : 0000000000000001
>   [158827.880625] x2 : 0000000000001000 x1 : 0000000000400000 x0 : ffff00010f778138
>   [158827.880630] Call trace:
>   [158827.880632]  0x0 (P)
>   [158827.880635]  _RNvXs6_NtCs257m05FHVbX_3tyr2vmNtB5_9GpuVmDataNtNtNtCsgmSOfgXi5CZ_6kernel3drm5gpuvm11DriverGpuVm13sm_step_unmap+0x3c/0x120 [tyr]
>   [158827.891166]  _RNvMs4_NtNtNtCsgmSOfgXi5CZ_6kernel3drm5gpuvm6sm_opsINtB7_5GpuVmNtNtCs257m05FHVbX_3tyr2vm9GpuVmDataE13sm_step_unmapB13_+0x18/0x34 [tyr]
>   [158827.891187]  op_unmap_cb+0x78/0xb0
>   [158827.891196]  __drm_gpuvm_sm_unmap+0x18c/0x1b4
>   [158827.891204]  drm_gpuvm_sm_unmap+0x38/0x4c
>   [158827.891209]  _RNvMs5_NtCs257m05FHVbX_3tyr2vmNtB5_2Vm7exec_op+0x1cc/0x254 [tyr]
>   [158827.894085]  _RNvMs5_NtCs257m05FHVbX_3tyr2vmNtB5_2Vm11unmap_range+0x124/0x188 [tyr]
>   [158827.894105]  _RINvNtCs5hGKnPbRUFW_4core3ptr13drop_in_placeNtNtCs257m05FHVbX_3tyr3gem8KernelBoEBK_+0x44/0xd8 [tyr]
>   [158827.894125]  _RINvNtCs5hGKnPbRUFW_4core3ptr13drop_in_placeINtNtNtCsgmSOfgXi5CZ_6kernel5alloc4kvec3VecNtNtCs257m05FHVbX_3tyr2fw7SectionNtNtBL_9allocator7KmallocEEB1r_+0x3c/0x100 [tyr]
>   [158827.894147]  _RINvNtCs5hGKnPbRUFW_4core3ptr13drop_in_placeINtNtNtCsgmSOfgXi5CZ_6kernel4sync3arc3ArcNtNtCs257m05FHVbX_3tyr2fw8FirmwareEEB1p_+0x94/0x190 [tyr]
>   [158827.894167]  _RNvMs4_NtNtCsgmSOfgXi5CZ_6kernel3drm6deviceINtB5_6DeviceNtNtCs257m05FHVbX_3tyr6driver12TyrDrmDriverE7releaseBW_+0x30/0x98 [tyr]
>   [158827.899550]  drm_dev_put.part.0+0x88/0xc0
>   [158827.899557]  drm_minor_release+0x18/0x28
>   [158827.899562]  drm_release+0x144/0x170
>   [158827.899567]  __fput+0xe4/0x30c
>   [158827.899573]  ____fput+0x14/0x20
>   [158827.899579]  task_work_run+0x7c/0xe8
>   [158827.899586]  do_exit+0x2a8/0xac4
>   [158827.899590]  do_group_exit+0x34/0x90
>   [158827.899594]  get_signal+0xaac/0xabc
>   [158827.899599]  arch_do_signal_or_restart+0x90/0x3e8
>   [158827.899606]  exit_to_user_mode_loop+0x140/0x1d0
>   [158827.899613]  el0_svc+0x2f4/0x2f8
>   [158827.899620]  el0t_64_sync_handler+0xa0/0xe4
>   [158827.899627]  el0t_64_sync+0x198/0x19c
>   [158827.899632] ---[ end trace 0000000000000000 ]---
> ```
> 
> The root cause: `fops.owner` was `NULL` in Rust DRM drivers, so the kernel
> never blocked module unloading while file descriptors were open. This leads to
> use-after-free when drm_release (or other fops) is called on freed module code.
> 
> The series moves `THIS_MODULE` into the `ModuleMetadata` as a const, threads it
> through `#[vtable]` to set `fops.owner` in DRM/miscdevice, and updates configfs
> and rnull to use `this_module::<LocalModule>()`.

I cannot comment that much on all the details of the Rust
implementation, but the series looks reasonable to me from the modules
perspective. I'm glad this issue will be finally addressed.

I would only suggest adding the new file rust/kernel/module.rs in
patch #1 under the MODULE SUPPORT entry in the MAINTAINERS file,
similarly to the other module-related Rust code, so that the module
maintainers are emailed when changes to this file are proposed. I think
you can change the existing 'F: rust/kernel/module_param.rs' to
'F: rust/kernel/module*.rs'.

-- 
Thanks,
Petr

^ permalink raw reply

* Re: [PATCH v3 1/7] list: Add mutable iterator variants
From: Christian König @ 2026-06-24 13:23 UTC (permalink / raw)
  To: Kaitao Cheng, David Laight
  Cc: Andrew Morton, David Hildenbrand, Jens Axboe, Tejun Heo,
	Alexander Viro, Christian Brauner, Alexei Starovoitov,
	Daniel Borkmann, Andrii Nakryiko, Johannes Weiner, Peter Zijlstra,
	Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim,
	Thomas Gleixner, Juri Lelli, Vincent Guittot, Paul Moore,
	Andy Shevchenko, Paul E. McKenney, Shakeel Butt, David Howells,
	Simona Vetter, Randy Dunlap, Luca Ceresoli, Philipp Stanner,
	linux-block, linux-kernel, cgroups, linux-ntfs-dev, linux-fsdevel,
	io-uring, audit, bpf, netdev, dri-devel, linux-perf-users,
	linux-trace-kernel, kexec, live-patching, linux-modules,
	linux-crypto, linux-pm, rcu, sched-ext, linux-mm, virtualization,
	damon, llvm, Kaitao Cheng
In-Reply-To: <351a6b67-b394-4c58-aee2-88b6c8089ad5@linux.dev>

On 6/24/26 15:14, Kaitao Cheng wrote:
> 
> 
> 在 2026/6/22 16:42, David Laight 写道:
>> On Mon, 22 Jun 2026 12:05:31 +0800
>> Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
>>
>>> From: Kaitao Cheng <chengkaitao@kylinos.cn>
>>>
>>> The list_for_each*_safe() helpers are used when the loop body may
>>> remove the current entry.  Their API exposes the temporary cursor at
>>> every call site, even though most users only need it for the iterator
>>> implementation and never reference it in the loop body.
>>>
>>> Add *_mutable() variants for list and hlist iteration.  The new helpers
>>> support both forms: callers may keep passing an explicit temporary cursor
>>> when they need to inspect or reset it, or omit it and let the helper use
>>> a unique internal cursor.
>>
>> I'm not really sure 'mutable' means anything either.
>> It is possible to make it valid for the loop body (or even other threads)
>> to delete arbitrary list items - but that needs significant extra overheads.
>>
>> It might be worth doing something that doesn't need the extra variable,
>> but there is little point doing all the churn just to rename things.
>>
>>>
>>> This makes call sites that only mutate the list through the current entry
>>> less noisy, while keeping the existing *_safe() helpers available for
>>> compatibility.
>>>
>>> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
>>> ---
>>>  include/linux/list.h | 269 +++++++++++++++++++++++++++++++++++++------
>>>  1 file changed, 231 insertions(+), 38 deletions(-)
>>>
>>> diff --git a/include/linux/list.h b/include/linux/list.h
>>> index 09d979976b3b..1081def7cea9 100644
>>> --- a/include/linux/list.h
>>> +++ b/include/linux/list.h
>>> @@ -7,6 +7,7 @@
>>>  #include <linux/stddef.h>
>>>  #include <linux/poison.h>
>>>  #include <linux/const.h>
>>> +#include <linux/args.h>
>>>  
>>>  #include <asm/barrier.h>
>>>  
>>> @@ -763,28 +764,72 @@ static inline void list_splice_tail_init(struct list_head *list,
>>>  #define list_for_each_prev(pos, head) \
>>>  	for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)
>>>  
>>> -/**
>>> - * list_for_each_safe - iterate over a list safe against removal of list entry
>>> - * @pos:	the &struct list_head to use as a loop cursor.
>>> - * @n:		another &struct list_head to use as temporary storage
>>> - * @head:	the head for your list.
>>> +/*
>>> + * list_for_each_safe is an old interface, use list_for_each_mutable instead.
>>>   */
>>>  #define list_for_each_safe(pos, n, head) \
>>>  	for (pos = (head)->next, n = pos->next; \
>>>  	     !list_is_head(pos, (head)); \
>>>  	     pos = n, n = pos->next)
>>>  
>>> +#define __list_for_each_mutable_internal(pos, tmp, head)		\
>>> +	for (typeof(pos) tmp = (pos = (head)->next)->next;		\
>>
>> Use auto
>>
>>> +	     !list_is_head(pos, (head));				\
>>> +	     pos = tmp, tmp = pos->next)
>>> +
>>> +#define __list_for_each_mutable1(pos, head)				\
>>> +	__list_for_each_mutable_internal(pos, __UNIQUE_ID(next), head)
>>> +
>>> +#define __list_for_each_mutable2(pos, next, head)			\
>>> +	list_for_each_safe(pos, next, head)
>>> +
>>>  /**
>>> - * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
>>> + * list_for_each_mutable - iterate over a list safe against entry removal
>>>   * @pos:	the &struct list_head to use as a loop cursor.
>>> - * @n:		another &struct list_head to use as temporary storage
>>> - * @head:	the head for your list.
>>> + * @...:	either (head) or (next, head)
>>> + *
>>> + * next:	another &struct list_head to use as optional temporary storage.
>>> + *		The temporary cursor is internal unless explicitly supplied by
>>> + *		the caller.
>>> + * head:	the head for your list.
>>> + */
>>> +#define list_for_each_mutable(pos, ...)					\
>>> +	CONCATENATE(__list_for_each_mutable, COUNT_ARGS(__VA_ARGS__))	\
>>> +		(pos, __VA_ARGS__)
>>
>> The variable argument count logic really just slows down compilation.
>> Maybe there aren't enough copies of this code to make that significant.
>> But just because you can do it doesn't mean it is a good idea.
>> I'm also not sure it really adds anything to the readability.
>>
>> And, if you are going to make the middle argument optional there is
>> no need to change the macro name.
> 
> Christian König and Jani Nikula also disagree with the variadic-argument
> implementation approach. If we abandon that method, it means we will
> inevitably need to add some new macros. If mutable is not a good name,
> suggestions for better alternatives would be welcome; coming up with a
> suitable name is indeed rather tricky.

I don't think you need to add a new macro for the specific use case where people want to modify the next element of the iteration.

If I remember your numbers correctly, that is really a corner case, and continuing to use the existing *_safe() macros for that sounds perfectly fine to me.

Regards,
Christian.

^ permalink raw reply

* Re: [PATCH bpf 1/2] bpf, sockmap: Don't leak UDP socks on lookup-bind-release
From: Jakub Sitnicki @ 2026-06-24 13:36 UTC (permalink / raw)
  To: Michal Luczaj, Willem de Bruijn
  Cc: John Fastabend, Jiayuan Chen, David S. Miller, Eric Dumazet,
	Jakub Kicinski, Paolo Abeni, Simon Horman, Alexei Starovoitov,
	Cong Wang, Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman,
	Kumar Kartikeya Dwivedi, Martin KaFai Lau, Song Liu,
	Yonghong Song, Jiri Olsa, Emil Tsalapatis, Shuah Khan, netdev,
	bpf, linux-kernel, linux-kselftest
In-Reply-To: <20260623-sockmap-lookup-udp-leak-v1-1-05804f9308e4@rbox.co>

On Tue, Jun 23, 2026 at 08:03 PM +02, Michal Luczaj wrote:
> UDP sockets get SOCK_RCU_FREE set when (auto-)bound. This means
> sk_is_refcounted(unbound) = true, while sk_is_refcounted(bound) = false.
>
> Because sockmap accepts unbound UDP sockets, a BPF program can increment a
> socket's refcount via lookup. If the socket is subsequently bound, the
> transition from unbound to bound causes bpf_sk_release() to skip the
> decrement of the refcount, causing a memory leak.
>
> unreferenced object 0xffff88810bc2eb40 (size 1984):
>   comm "test_progs", pid 2451, jiffies 4295320596
>   hex dump (first 32 bytes):
>     7f 00 00 01 7f 00 00 01 d2 04 1b b7 04 d2 00 00  ................
>     02 00 01 40 00 00 00 00 00 00 00 00 00 00 00 00  ...@............
>   backtrace (crc bdee079d):
>     kmem_cache_alloc_noprof+0x557/0x660
>     sk_prot_alloc+0x69/0x240
>     sk_alloc+0x30/0x460
>     inet_create+0x2ce/0xf80
>     __sock_create+0x25b/0x5c0
>     __sys_socket+0x119/0x1d0
>     __x64_sys_socket+0x72/0xd0
>     do_syscall_64+0xa1/0x5f0
>     entry_SYSCALL_64_after_hwframe+0x76/0x7e
>
> Maintain balanced refcounts across sk lookup/release: (re-)set
> SOCK_RCU_FREE on proto update to treat the socket (whether bound or
> unbound) as not requiring a refcount increment on (a RCU protected) lookup.
>
> Fixes: 0c48eefae712 ("sock_map: Lift socket state restriction for datagram sockets")
> Signed-off-by: Michal Luczaj <mhal@rbox.co>
> ---
> Note: this issue is related to commit 67312adc96b5 ("bpf: reject unhashed
> sockets in bpf_sk_assign").
> ---
>  net/ipv4/udp_bpf.c | 3 +++
>  1 file changed, 3 insertions(+)
>
> diff --git a/net/ipv4/udp_bpf.c b/net/ipv4/udp_bpf.c
> index ad57c4c9eaab..970327b59582 100644
> --- a/net/ipv4/udp_bpf.c
> +++ b/net/ipv4/udp_bpf.c
> @@ -173,6 +173,9 @@ int udp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
>  	if (sk->sk_family == AF_INET6)
>  		udp_bpf_check_v6_needs_rebuild(psock->sk_proto);
>  
> +	/* Treat all sockets as non-refcounted, regardless of binding state. */
> +	sock_set_flag(sk, SOCK_RCU_FREE);
> +
>  	sock_replace_proto(sk, &udp_bpf_prots[family]);
>  	return 0;
>  }

There is a side effect that an unhashed (unbound) UDP socket can now be
selected in sk_lookup with bpf_sk_assign. Though perhaps that's for the
better because TC bpf_sk_assign doesn't reject non-refcounted UDP
sockets either, so we would have both socket dispatch sites behave the
same way.

Also, with this patch, if we insert & remove an unhashed UDP socket
into/from a sockmap, we end up with an unhashed non-refcounted UDP
socket. Not entirely sure if that is actually a problem or not.

Willem, what is your take on having unhashed non-refcounted UDP sockets?

^ permalink raw reply

* Re: [BUG] KFENCE: use-after-free read in udp_tunnel_nic_device_sync_work
From: Sam Sun @ 2026-06-24 13:42 UTC (permalink / raw)
  To: Eric Dumazet
  Cc: David S. Miller, Jakub Kicinski, Paolo Abeni, netdev,
	linux-kernel, syzkaller
In-Reply-To: <CANn89iKD=4_A+7K2erw5AZPnzvBrQqTKXJyfKN5ZfNpAYZ+y3A@mail.gmail.com>

On Wed, Jun 24, 2026 at 6:01 PM Eric Dumazet <edumazet@google.com> wrote:
>
> On Wed, Jun 24, 2026 at 2:01 AM Yue Sun <samsun1006219@gmail.com> wrote:
> >
> > Hello,
> >
> > I hit a reproducible use-after-free in the UDP tunnel NIC offload work item.
> > The original local crash was reported by KFENCE as:
> >
> >   KFENCE: use-after-free read in udp_tunnel_nic_device_sync_work
> >
> > On current mainline, the C reproducer below triggers the same lifetime bug,
> > reported by KASAN before KFENCE samples the object:
> >
> >   BUG: KASAN: slab-use-after-free in __mutex_lock
> >   Workqueue: udp_tunnel_nic udp_tunnel_nic_device_sync_work
> >
> > Tested kernel:
> >
> >   840ef6c78e6a ("Merge tag 'nfs-for-7.2-1' of git://git.linux-nfs.org/projects/anna/linux-nfs")
> >   Linux 7.1.0-11240-g840ef6c78e6a #31 SMP PREEMPT_DYNAMIC
> >
>
>
> Thanks for the report.
>
> Can you test the following patch?
>
> diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
> index 9944ed923ddfd10f9adf6ad788c0740daeaf2adb..c5f8d2f9d325de8f4d2247ddaa52e33378851857
> 100644
> --- a/net/ipv4/udp_tunnel_nic.c
> +++ b/net/ipv4/udp_tunnel_nic.c
> @@ -304,8 +304,8 @@ udp_tunnel_nic_device_sync(struct net_device *dev,
> struct udp_tunnel_nic *utn)
>         if (!utn->need_sync)
>                 return;
>
> -       queue_work(udp_tunnel_nic_workqueue, &utn->work);
>         utn->work_pending = 1;
> +       queue_work(udp_tunnel_nic_workqueue, &utn->work);
>  }
>
>  static bool
> @@ -866,6 +866,11 @@ udp_tunnel_nic_unregister(struct net_device *dev,
> struct udp_tunnel_nic *utn)
>
>         udp_tunnel_nic_lock(dev);
>
> +       if (utn->work_pending) {
> +               udp_tunnel_nic_unlock(dev);
> +               return;
> +       }
> +
>         /* For a shared table remove this dev from the list of sharing devices
>          * and if there are other devices just detach.
>          */
> @@ -901,12 +906,6 @@ udp_tunnel_nic_unregister(struct net_device *dev,
> struct udp_tunnel_nic *utn)
>         udp_tunnel_nic_flush(dev, utn);
>         udp_tunnel_nic_unlock(dev);
>
> -       /* Wait for the work to be done using the state, netdev core will
> -        * retry unregister until we give up our reference on this device.
> -        */
> -       if (utn->work_pending)
> -               return;
> -
>         udp_tunnel_nic_free(utn);
>  release_dev:
>         dev->udp_tunnel_nic = NULL;

I tested the patch, but unfortunately the C reproducer still triggers the
same use-after-free for me.

Tested on top of:

  840ef6c78e6a ("Merge tag 'nfs-for-7.2-1' of
git://git.linux-nfs.org/projects/anna/linux-nfs")

I booted the kernel with KASAN/KFENCE enabled and:

  panic_on_warn=1 panic_on_oops=1 kfence.sample_interval=1

Then I ran the same C reproducer:

  timeout -k 10 360 /root/repro

The VM panicked after about 236 seconds:

[ 236.471119][ T58] BUG: KASAN: slab-use-after-free in
__mutex_lock+0x16d0/0x1d80
[ 236.473404][ T58] Read of size 8 at addr ff11000076a63ea8 by task
kworker/u16:3/58
[ 236.476455][ T58] Hardware name: QEMU Standard PC (i440FX + PIIX,
1996), BIOS 1.15.0-1 04/01/2014
[ 236.476478][ T58] Workqueue: udp_tunnel_nic udp_tunnel_nic_device_sync_work
[ 236.476787][ T58] __mutex_lock+0x16d0/0x1d80
[ 236.477020][ T58] udp_tunnel_nic_device_sync_work+0x32/0x9c0
[ 236.477068][ T58] process_one_work+0x9de/0x1bf0

The allocation/free stacks are still the same shape:
```
Allocated by task 11563:
__kmalloc_noprof
udp_tunnel_nic_netdevice_event+0x12d8/0x1e80
register_netdevice
nsim_create
nsim_dev_reload_up
devlink_reload

Freed by task 11609:
kfree
udp_tunnel_nic_netdevice_event+0xc26/0x1e80
unregister_netdevice_many_notify
nsim_destroy
nsim_dev_reload_down
devlink_reload

Last potentially related work creation:
queue_work_on
__udp_tunnel_nic_del_port+0x2af/0x320
udp_tunnel_notify_del_rx_port
__geneve_sock_release.part.0
geneve_stop

Second to last potentially related work creation:
queue_work_on
__udp_tunnel_nic_add_port+0x6ec/0xd70
udp_tunnel_notify_add_rx_port
geneve_open
```

My read of the patch is that it closes the small window where queue_work()
can publish the work before utn->work_pending is set, and it also prevents
udp_tunnel_nic_unregister() from flushing/freeing the object when
work_pending is already set.

However, the test above suggests that work_pending still does not fully
protect the lifetime of struct udp_tunnel_nic. The crashing work was still
queued through udp_tunnel_nic_device_sync() at line 308, so the patched path
was exercised. One suspicious point is that udp_tunnel_nic_device_sync_work()
clears utn->work_pending at the beginning of the worker, while the same work
item can still interact with replay/add/del-port state. The reproducer can
still end up with udp_tunnel_nic_unregister() freeing utn while a
udp_tunnel_nic_device_sync_work item later runs and dereferences the freed
utn->lock.

So this patch does not seem to be sufficient for this reproducer.

Thanks,
Yue

^ permalink raw reply

* [PATCH net-next v2] selftests: tls: size splice_short pipe by page size
From: Nirmoy Das @ 2026-06-24 13:44 UTC (permalink / raw)
  To: Jakub Kicinski, Sabrina Dubroca, John Fastabend
  Cc: Simon Horman, netdev, linux-kernel, Nirmoy Das

splice_short grows its pipe with (MAX_FRAGS + 1) * 0x1000 so it can
queue one short vmsplice() buffer for each fragment before draining the
pipe. That assumes 4K pipe buffers.

On 64K-page kernels the request is rounded to 262144 bytes, which
provides only four pipe buffers. The fifth one-byte vmsplice() blocks in
pipe_wait_writable and the test times out before it reaches the TLS path.

Request enough bytes for the same number of pipe buffers using the
runtime page size, and assert that the kernel granted at least that much.
If an unprivileged run cannot raise the pipe above the system
pipe-max-size limit, skip the test because it cannot exercise the
intended path.

Fixes: 3667e9b442b9 ("selftests: tls: add test for short splice due to full skmsg")
Assisted-by: Codex:gpt-5
Signed-off-by: Nirmoy Das <nirmoyd@nvidia.com>
Reviewed-by: Simon Horman <horms@kernel.org>
---
v2: Remove redundant ASSERT_GE(ret, 0) (Simon Horman).

 tools/testing/selftests/net/tls.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
index 30a236b8e9f73..d805a7dfbdd51 100644
--- a/tools/testing/selftests/net/tls.c
+++ b/tools/testing/selftests/net/tls.c
@@ -997,6 +997,8 @@ TEST_F(tls, splice_short)
 	char sendbuf[0x100];
 	char sendchar = 'S';
 	int pipefds[2];
+	int pipe_sz;
+	int ret;
 	int i;
 
 	sendchar_iov.iov_base = &sendchar;
@@ -1005,7 +1007,11 @@ TEST_F(tls, splice_short)
 	memset(sendbuf, 's', sizeof(sendbuf));
 
 	ASSERT_GE(pipe2(pipefds, O_NONBLOCK), 0);
-	ASSERT_GE(fcntl(pipefds[0], F_SETPIPE_SZ, (MAX_FRAGS + 1) * 0x1000), 0);
+	pipe_sz = (MAX_FRAGS + 1) * getpagesize();
+	ret = fcntl(pipefds[0], F_SETPIPE_SZ, pipe_sz);
+	if (ret < 0 && errno == EPERM)
+		SKIP(return, "insufficient pipe capacity");
+	ASSERT_GE(ret, pipe_sz);
 
 	for (i = 0; i < MAX_FRAGS; i++)
 		ASSERT_GE(vmsplice(pipefds[1], &sendchar_iov, 1, 0), 0);

base-commit: 47186409c092cd7dd70350999186c700233e854d
-- 
2.43.0


^ permalink raw reply related

* Re: [PATCH net-next] selftests: tls: size splice_short pipe by page size
From: Nirmoy Das @ 2026-06-24 13:46 UTC (permalink / raw)
  To: Simon Horman
  Cc: Jakub Kicinski, Sabrina Dubroca, John Fastabend, netdev,
	linux-kernel
In-Reply-To: <20260624125132.GN827683@horms.kernel.org>


On 24.06.26 15:51, Simon Horman wrote:
> On Mon, Jun 22, 2026 at 01:28:47PM -0700, Nirmoy Das wrote:
>> splice_short grows its pipe with (MAX_FRAGS + 1) * 0x1000 so it can
>> queue one short vmsplice() buffer for each fragment before draining the
>> pipe. That assumes 4K pipe buffers.
>>
>> On 64K-page kernels the request is rounded to 262144 bytes, which
>> provides only four pipe buffers. The fifth one-byte vmsplice() blocks in
>> pipe_wait_writable and the test times out before it reaches the TLS path.
>>
>> Request enough bytes for the same number of pipe buffers using the
>> runtime page size, and assert that the kernel granted at least that much.
>> If an unprivileged run cannot raise the pipe above the system
>> pipe-max-size limit, skip the test because it cannot exercise the
>> intended path.
>>
>> Fixes: 3667e9b442b9 ("selftests: tls: add test for short splice due to full skmsg")
>> Assisted-by: Codex:gpt-5
>> Signed-off-by: Nirmoy Das <nirmoyd@nvidia.com>
> The nit below notwithstanding, this looks good to me.
>
> Reviewed-by: Simon Horman <horms@kernel.org>
>
>> ---
>>   tools/testing/selftests/net/tls.c | 9 ++++++++-
>>   1 file changed, 8 insertions(+), 1 deletion(-)
>>
>> diff --git a/tools/testing/selftests/net/tls.c b/tools/testing/selftests/net/tls.c
>> index 30a236b8e9f73..e3bf4ade0f770 100644
>> --- a/tools/testing/selftests/net/tls.c
>> +++ b/tools/testing/selftests/net/tls.c
>> @@ -997,6 +997,8 @@ TEST_F(tls, splice_short)
>>   	char sendbuf[0x100];
>>   	char sendchar = 'S';
>>   	int pipefds[2];
>> +	int pipe_sz;
>> +	int ret;
>>   	int i;
>>   
>>   	sendchar_iov.iov_base = &sendchar;
>> @@ -1005,7 +1007,12 @@ TEST_F(tls, splice_short)
>>   	memset(sendbuf, 's', sizeof(sendbuf));
>>   
>>   	ASSERT_GE(pipe2(pipefds, O_NONBLOCK), 0);
>> -	ASSERT_GE(fcntl(pipefds[0], F_SETPIPE_SZ, (MAX_FRAGS + 1) * 0x1000), 0);
>> +	pipe_sz = (MAX_FRAGS + 1) * getpagesize();
>> +	ret = fcntl(pipefds[0], F_SETPIPE_SZ, pipe_sz);
>> +	if (ret < 0 && errno == EPERM)
>> +		SKIP(return, "insufficient pipe capacity");
>> +	ASSERT_GE(ret, 0);
> nit: the line above seems redundant to me given the line below.


Thanks Simon. Sent v2 with the nit addressed.


>
>> +	ASSERT_GE(ret, pipe_sz);
>>   
>>   	for (i = 0; i < MAX_FRAGS; i++)
>>   		ASSERT_GE(vmsplice(pipefds[1], &sendchar_iov, 1, 0), 0);
>> -- 
>> 2.43.0
>>

^ permalink raw reply

* [PATCH net v2] tipc: fix out-of-bounds read in broadcast Gap ACK blocks
From: Samuel Page @ 2026-06-24 13:56 UTC (permalink / raw)
  To: Jon Maloy
  Cc: David S . Miller, Eric Dumazet, Jakub Kicinski, Paolo Abeni,
	Simon Horman, Tung Quang Nguyen, netdev, tipc-discussion,
	linux-kernel, Samuel Page

A broadcast PROTOCOL/STATE_MSG can carry a Gap ACK blocks record in its
data area. tipc_get_gap_ack_blks() only verifies that the record's len
field is self-consistent with its ugack_cnt/bgack_cnt counts
(sz == struct_size(p, gacks, ugack_cnt + bgack_cnt)); it does not check
that the record actually fits in the message data area, msg_data_sz().

The unicast caller tipc_link_proto_rcv() bounds it ("if (glen > dlen)
break;"), but the broadcast caller tipc_bcast_sync_rcv() discards the
returned size, so tipc_link_advance_transmq() copies the record off the
receive skb with an attacker-controlled count:

	this_ga = kmemdup(ga, struct_size(ga, gacks, ga->bgack_cnt),
			  GFP_ATOMIC);

A TIPC neighbour that negotiated TIPC_GAP_ACK_BLOCK triggers it with one
ordinary broadcast STATE_MSG (msg_bc_ack_invalid() clear), sized so its
data area is short, carrying a Gap ACK record with len = 0x400,
bgack_cnt = 0xff and ugack_cnt = 0. len then equals
struct_size(p, gacks, 255), so the consistency check passes and ga is
non-NULL; kmemdup() reads struct_size(ga, gacks, 255) = 1024 bytes out
of the much smaller skb:

  BUG: KASAN: slab-out-of-bounds in kmemdup_noprof+0x48/0x60
  Read of size 1024 at addr ffff0000c7030d38 by task poc864/69
  Call trace:
   kmemdup_noprof+0x48/0x60
   tipc_link_advance_transmq+0x86c/0xb80
   tipc_link_bc_ack_rcv+0x19c/0x1e0
   tipc_bcast_sync_rcv+0x1c4/0x2c4
   tipc_rcv+0x85c/0x1340
   tipc_l2_rcv_msg+0xac/0x104
  The buggy address belongs to the object at ffff0000c7030d00
   which belongs to the cache skbuff_small_head of size 704
  The buggy address is located 56 bytes inside of
   allocated 704-byte region [ffff0000c7030d00, ffff0000c7030fc0)

The copied-out bytes are subsequently consumed as gap/ack values, but
the read is already out of bounds at the kmemdup() regardless of how
they are used.

The unicast STATE path drops such a message: "if (glen > dlen) break;"
skips the rest of STATE_MSG handling and the skb is freed. Make the
broadcast path drop it too. tipc_bcast_sync_rcv() now bounds the record
against msg_data_sz() and, when it does not fit, reports it back through
tipc_node_bc_sync_rcv() to tipc_rcv() so the skb is discarded rather than
processed. ga is not cleared on this path: ga == NULL already means
"legacy peer without Selective ACK", a distinct legitimate state.

Fixes: d7626b5acff9 ("tipc: introduce Gap ACK blocks for broadcast link")
Cc: stable@vger.kernel.org
Assisted-by: Bynario AI
Signed-off-by: Samuel Page <sam@bynar.io>
---
v2, per review of v1 [1]:
 - v1 cleared 'ga' on an oversized Gap ACK record, which let the malformed
   STATE message be processed as a legacy (no Selective ACK) one rather than
   dropped.  v2 drops it instead, matching the unicast STATE path:
   tipc_bcast_sync_rcv() reports the bad record through a bool output
   parameter, propagated by tipc_node_bc_sync_rcv() to tipc_rcv(), which
   discards the skb.
 - v1 touched only net/tipc/bcast.c; v2 also touches net/tipc/{bcast.h,node.c}.

[1] https://lore.kernel.org/netdev/20260623134137.3641275-1-sam@bynar.io/

For reference, an earlier thread proposed validating inside
tipc_get_gap_ack_blks():
  https://lore.kernel.org/netdev/1316452e465e9a96fce44ec15130a14f3872149f.1775809727.git.caoruide123@gmail.com/

 net/tipc/bcast.c | 22 ++++++++++++++--------
 net/tipc/bcast.h |  2 +-
 net/tipc/node.c  | 13 ++++++++++---
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 76a1585d3f6b..08637c3c9db0 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -497,11 +497,12 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l,
  */
 int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
 			struct tipc_msg *hdr,
-			struct sk_buff_head *retrq)
+			struct sk_buff_head *retrq, bool *valid)
 {
 	struct sk_buff_head *inputq = &tipc_bc_base(net)->inputq;
 	struct tipc_gap_ack_blks *ga;
 	struct sk_buff_head xmitq;
+	u16 glen;
 	int rc = 0;
 
 	__skb_queue_head_init(&xmitq);
@@ -510,13 +511,18 @@ int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
 	if (msg_type(hdr) != STATE_MSG) {
 		tipc_link_bc_init_rcv(l, hdr);
 	} else if (!msg_bc_ack_invalid(hdr)) {
-		tipc_get_gap_ack_blks(&ga, l, hdr, false);
-		if (!sysctl_tipc_bc_retruni)
-			retrq = &xmitq;
-		rc = tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr),
-					  msg_bc_gap(hdr), ga, &xmitq,
-					  retrq);
-		rc |= tipc_link_bc_sync_rcv(l, hdr, &xmitq);
+		glen = tipc_get_gap_ack_blks(&ga, l, hdr, false);
+		if (glen > msg_data_sz(hdr)) {
+			/* Malformed Gap ACK blocks; caller drops the msg */
+			*valid = false;
+		} else {
+			if (!sysctl_tipc_bc_retruni)
+				retrq = &xmitq;
+			rc = tipc_link_bc_ack_rcv(l, msg_bcast_ack(hdr),
+						  msg_bc_gap(hdr), ga, &xmitq,
+						  retrq);
+			rc |= tipc_link_bc_sync_rcv(l, hdr, &xmitq);
+		}
 	}
 	tipc_bcast_unlock(net);
 
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 2d9352dc7b0e..55d17b5413e1 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -97,7 +97,7 @@ void tipc_bcast_ack_rcv(struct net *net, struct tipc_link *l,
 			struct tipc_msg *hdr);
 int tipc_bcast_sync_rcv(struct net *net, struct tipc_link *l,
 			struct tipc_msg *hdr,
-			struct sk_buff_head *retrq);
+			struct sk_buff_head *retrq, bool *valid);
 int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg,
 			struct tipc_link *bcl);
 int tipc_nl_bc_link_set(struct net *net, struct nlattr *attrs[]);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 97aa970a0d83..2887f94ee28f 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -1831,12 +1831,13 @@ static void tipc_node_mcast_rcv(struct tipc_node *n)
 }
 
 static void tipc_node_bc_sync_rcv(struct tipc_node *n, struct tipc_msg *hdr,
-				  int bearer_id, struct sk_buff_head *xmitq)
+				  int bearer_id, struct sk_buff_head *xmitq,
+				  bool *valid)
 {
 	struct tipc_link *ucl;
 	int rc;
 
-	rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq);
+	rc = tipc_bcast_sync_rcv(n->net, n->bc_entry.link, hdr, xmitq, valid);
 
 	if (rc & TIPC_LINK_DOWN_EVT) {
 		tipc_node_reset_links(n);
@@ -2140,12 +2141,18 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
 
 	/* Ensure broadcast reception is in synch with peer's send state */
 	if (unlikely(usr == LINK_PROTOCOL)) {
+		bool valid = true;
+
 		if (unlikely(skb_linearize(skb))) {
 			tipc_node_put(n);
 			goto discard;
 		}
 		hdr = buf_msg(skb);
-		tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq);
+		tipc_node_bc_sync_rcv(n, hdr, bearer_id, &xmitq, &valid);
+		if (!valid) {
+			tipc_node_put(n);
+			goto discard;
+		}
 	} else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack)) {
 		tipc_bcast_ack_rcv(net, n->bc_entry.link, hdr);
 	}

base-commit: a986fde914d88af47eb78fd29c5d1af7952c3500
-- 
2.54.0


^ permalink raw reply related

* Re: [BUG] KFENCE: use-after-free read in udp_tunnel_nic_device_sync_work
From: Eric Dumazet @ 2026-06-24 13:59 UTC (permalink / raw)
  To: Sam Sun
  Cc: David S. Miller, Jakub Kicinski, Paolo Abeni, netdev,
	linux-kernel, syzkaller
In-Reply-To: <CAEkJfYN4Uep_WEzZyaDGkW5p4rW+oVRqq2DgBt2xiYp9ARt0GA@mail.gmail.com>

On Wed, Jun 24, 2026 at 6:42 AM Sam Sun <samsun1006219@gmail.com> wrote:
>
> On Wed, Jun 24, 2026 at 6:01 PM Eric Dumazet <edumazet@google.com> wrote:
> >
> > On Wed, Jun 24, 2026 at 2:01 AM Yue Sun <samsun1006219@gmail.com> wrote:
> > >
> > > Hello,
> > >
> > > I hit a reproducible use-after-free in the UDP tunnel NIC offload work item.
> > > The original local crash was reported by KFENCE as:
> > >
> > >   KFENCE: use-after-free read in udp_tunnel_nic_device_sync_work
> > >
> > > On current mainline, the C reproducer below triggers the same lifetime bug,
> > > reported by KASAN before KFENCE samples the object:
> > >
> > >   BUG: KASAN: slab-use-after-free in __mutex_lock
> > >   Workqueue: udp_tunnel_nic udp_tunnel_nic_device_sync_work
> > >
> > > Tested kernel:
> > >
> > >   840ef6c78e6a ("Merge tag 'nfs-for-7.2-1' of git://git.linux-nfs.org/projects/anna/linux-nfs")
> > >   Linux 7.1.0-11240-g840ef6c78e6a #31 SMP PREEMPT_DYNAMIC
> > >
> >
> >
> > Thanks for the report.
> >
> > Can you test the following patch?
> >
> > diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
> > index 9944ed923ddfd10f9adf6ad788c0740daeaf2adb..c5f8d2f9d325de8f4d2247ddaa52e33378851857
> > 100644
> > --- a/net/ipv4/udp_tunnel_nic.c
> > +++ b/net/ipv4/udp_tunnel_nic.c
> > @@ -304,8 +304,8 @@ udp_tunnel_nic_device_sync(struct net_device *dev,
> > struct udp_tunnel_nic *utn)
> >         if (!utn->need_sync)
> >                 return;
> >
> > -       queue_work(udp_tunnel_nic_workqueue, &utn->work);
> >         utn->work_pending = 1;
> > +       queue_work(udp_tunnel_nic_workqueue, &utn->work);
> >  }
> >
> >  static bool
> > @@ -866,6 +866,11 @@ udp_tunnel_nic_unregister(struct net_device *dev,
> > struct udp_tunnel_nic *utn)
> >
> >         udp_tunnel_nic_lock(dev);
> >
> > +       if (utn->work_pending) {
> > +               udp_tunnel_nic_unlock(dev);
> > +               return;
> > +       }
> > +
> >         /* For a shared table remove this dev from the list of sharing devices
> >          * and if there are other devices just detach.
> >          */
> > @@ -901,12 +906,6 @@ udp_tunnel_nic_unregister(struct net_device *dev,
> > struct udp_tunnel_nic *utn)
> >         udp_tunnel_nic_flush(dev, utn);
> >         udp_tunnel_nic_unlock(dev);
> >
> > -       /* Wait for the work to be done using the state, netdev core will
> > -        * retry unregister until we give up our reference on this device.
> > -        */
> > -       if (utn->work_pending)
> > -               return;
> > -
> >         udp_tunnel_nic_free(utn);
> >  release_dev:
> >         dev->udp_tunnel_nic = NULL;
>
> I tested the patch, but unfortunately the C reproducer still triggers the
> same use-after-free for me.
>
> Tested on top of:
>
>   840ef6c78e6a ("Merge tag 'nfs-for-7.2-1' of
> git://git.linux-nfs.org/projects/anna/linux-nfs")
>
> I booted the kernel with KASAN/KFENCE enabled and:
>
>   panic_on_warn=1 panic_on_oops=1 kfence.sample_interval=1
>
> Then I ran the same C reproducer:
>
>   timeout -k 10 360 /root/repro
>
> The VM panicked after about 236 seconds:
>
> [ 236.471119][ T58] BUG: KASAN: slab-use-after-free in
> __mutex_lock+0x16d0/0x1d80
> [ 236.473404][ T58] Read of size 8 at addr ff11000076a63ea8 by task
> kworker/u16:3/58
> [ 236.476455][ T58] Hardware name: QEMU Standard PC (i440FX + PIIX,
> 1996), BIOS 1.15.0-1 04/01/2014
> [ 236.476478][ T58] Workqueue: udp_tunnel_nic udp_tunnel_nic_device_sync_work
> [ 236.476787][ T58] __mutex_lock+0x16d0/0x1d80
> [ 236.477020][ T58] udp_tunnel_nic_device_sync_work+0x32/0x9c0
> [ 236.477068][ T58] process_one_work+0x9de/0x1bf0
>
> The allocation/free stacks are still the same shape:
> ```
> Allocated by task 11563:
> __kmalloc_noprof
> udp_tunnel_nic_netdevice_event+0x12d8/0x1e80
> register_netdevice
> nsim_create
> nsim_dev_reload_up
> devlink_reload
>
> Freed by task 11609:
> kfree
> udp_tunnel_nic_netdevice_event+0xc26/0x1e80
> unregister_netdevice_many_notify
> nsim_destroy
> nsim_dev_reload_down
> devlink_reload
>
> Last potentially related work creation:
> queue_work_on
> __udp_tunnel_nic_del_port+0x2af/0x320
> udp_tunnel_notify_del_rx_port
> __geneve_sock_release.part.0
> geneve_stop
>
> Second to last potentially related work creation:
> queue_work_on
> __udp_tunnel_nic_add_port+0x6ec/0xd70
> udp_tunnel_notify_add_rx_port
> geneve_open
> ```
>
> My read of the patch is that it closes the small window where queue_work()
> can publish the work before utn->work_pending is set, and it also prevents
> udp_tunnel_nic_unregister() from flushing/freeing the object when
> work_pending is already set.
>
> However, the test above suggests that work_pending still does not fully
> protect the lifetime of struct udp_tunnel_nic. The crashing work was still
> queued through udp_tunnel_nic_device_sync() at line 308, so the patched path
> was exercised. One suspicious point is that udp_tunnel_nic_device_sync_work()
> clears utn->work_pending at the beginning of the worker, while the same work
> item can still interact with replay/add/del-port state. The reproducer can
> still end up with udp_tunnel_nic_unregister() freeing utn while a
> udp_tunnel_nic_device_sync_work item later runs and dereferences the freed
> utn->lock.
>
> So this patch does not seem to be sufficient for this reproducer.
>

Oh well.

u8 need_sync:1;
u8 need_replay:1;
u8 work_pending:1;

These bitfields are not safe, obviously :/

Time to convert them to atomic bit operations.

^ permalink raw reply

* Re: [PATCH v5 0/9] Fix missing fops.owner in Rust DRM/misc abstractions
From: Miguel Ojeda @ 2026-06-24 14:02 UTC (permalink / raw)
  To: Petr Pavlu
  Cc: Alvin Sun, Miguel Ojeda, Boqun Feng, Gary Guo,
	Björn Roy Baron, Benno Lossin, Andreas Hindborg, Alice Ryhl,
	Trevor Gross, Danilo Krummrich, Luis Chamberlain, Daniel Gomez,
	Sami Tolvanen, Aaron Tomlin, Greg Kroah-Hartman,
	Rafael J. Wysocki, David Airlie, Simona Vetter, Daniel Almeida,
	Arnd Bergmann, Brendan Higgins, David Gow, Rae Moar, Breno Leitao,
	Jens Axboe, Dave Ertman, Leon Romanovsky, Igor Korotin,
	FUJITA Tomonori, Bjorn Helgaas, Krzysztof Wilczyński,
	Arve Hjønnevåg, Todd Kjos, Christian Brauner,
	Carlos Llamas, rust-for-linux, linux-modules, driver-core,
	dri-devel, nova-gpu, linux-kselftest, kunit-dev, linux-block,
	linux-kernel, netdev, linux-pci
In-Reply-To: <8ea21b29-9baf-4926-a16f-7d21c5a1a1b8@suse.com>

On Wed, Jun 24, 2026 at 3:23 PM Petr Pavlu <petr.pavlu@suse.com> wrote:
>
> I would only suggest adding the new file rust/kernel/module.rs in
> patch #1 under the MODULE SUPPORT entry in the MAINTAINERS file,
> similarly to the other module-related Rust code, so that the module
> maintainers are emailed when changes to this file are proposed. I think
> you can change the existing 'F: rust/kernel/module_param.rs' to
> 'F: rust/kernel/module*.rs'.

That would be great, yes -- thanks!

Cheers,
Miguel

^ permalink raw reply

* Re: [BUG] KFENCE: use-after-free read in udp_tunnel_nic_device_sync_work
From: Eric Dumazet @ 2026-06-24 14:10 UTC (permalink / raw)
  To: Sam Sun
  Cc: David S. Miller, Jakub Kicinski, Paolo Abeni, netdev,
	linux-kernel, syzkaller
In-Reply-To: <CANn89iJ5wKuvKy=Ed-JkeyzKYaEoA_8S=mEMBicMjEPxLCY0Vw@mail.gmail.com>

On Wed, Jun 24, 2026 at 6:59 AM Eric Dumazet <edumazet@google.com> wrote:

> Oh well.
>
> u8 need_sync:1;
> u8 need_replay:1;
> u8 work_pending:1;
>
> These bitfields are not safe, obviously :/
>
> Time to convert them to atomic bit operations.

Can you try:

diff --git a/net/ipv4/udp_tunnel_nic.c b/net/ipv4/udp_tunnel_nic.c
index 9944ed923ddfd10f9adf6ad788c0740daeaf2adb..939d6f656bb71814718bc3bf84be665adad27e4b
100644
--- a/net/ipv4/udp_tunnel_nic.c
+++ b/net/ipv4/udp_tunnel_nic.c
@@ -30,9 +30,7 @@ struct udp_tunnel_nic_table_entry {
  * @work:      async work for talking to hardware from process context
  * @dev:       netdev pointer
  * @lock:      protects all fields
- * @need_sync: at least one port start changed
- * @need_replay: space was freed, we need a replay of all ports
- * @work_pending: @work is currently scheduled
+ * @flags:     sync, replay, pending flags
  * @n_tables:  number of tables under @entries
  * @missed:    bitmap of tables which overflown
  * @entries:   table of tables of ports currently offloaded
@@ -44,9 +42,10 @@ struct udp_tunnel_nic {

        struct mutex lock;

-       u8 need_sync:1;
-       u8 need_replay:1;
-       u8 work_pending:1;
+       unsigned long flags;
+#define UDP_TUNNEL_NIC_NEED_SYNC       0
+#define UDP_TUNNEL_NIC_NEED_REPLAY     1
+#define UDP_TUNNEL_NIC_WORK_PENDING    2

        unsigned int n_tables;
        unsigned long missed;
@@ -116,7 +115,7 @@ udp_tunnel_nic_entry_queue(struct udp_tunnel_nic *utn,
                           unsigned int flag)
 {
        entry->flags |= flag;
-       utn->need_sync = 1;
+       set_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags);
 }

 static void
@@ -283,7 +282,7 @@ udp_tunnel_nic_device_sync_by_table(struct net_device *dev,
 static void
 __udp_tunnel_nic_device_sync(struct net_device *dev, struct
udp_tunnel_nic *utn)
 {
-       if (!utn->need_sync)
+       if (!test_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags))
                return;

        if (dev->udp_tunnel_nic_info->sync_table)
@@ -291,21 +290,24 @@ __udp_tunnel_nic_device_sync(struct net_device
*dev, struct udp_tunnel_nic *utn)
        else
                udp_tunnel_nic_device_sync_by_port(dev, utn);

-       utn->need_sync = 0;
+       clear_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags);
        /* Can't replay directly here, in case we come from the tunnel driver's
         * notification - trying to replay may deadlock inside tunnel driver.
         */
-       utn->need_replay = udp_tunnel_nic_should_replay(dev, utn);
+       if (udp_tunnel_nic_should_replay(dev, utn))
+               set_bit(UDP_TUNNEL_NIC_NEED_REPLAY, &utn->flags);
+       else
+               clear_bit(UDP_TUNNEL_NIC_NEED_REPLAY, &utn->flags);
 }

 static void
 udp_tunnel_nic_device_sync(struct net_device *dev, struct udp_tunnel_nic *utn)
 {
-       if (!utn->need_sync)
+       if (!test_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags))
                return;

+       set_bit(UDP_TUNNEL_NIC_WORK_PENDING, &utn->flags);
        queue_work(udp_tunnel_nic_workqueue, &utn->work);
-       utn->work_pending = 1;
 }

 static bool
@@ -348,7 +350,7 @@ udp_tunnel_nic_has_collision(struct net_device
*dev, struct udp_tunnel_nic *utn,
                        if (!udp_tunnel_nic_entry_is_free(entry) &&
                            entry->port == ti->port &&
                            entry->type != ti->type) {
-                               __set_bit(i, &utn->missed);
+                               set_bit(i, &utn->missed);
                                return true;
                        }
                }
@@ -483,7 +485,7 @@ udp_tunnel_nic_add_new(struct net_device *dev,
struct udp_tunnel_nic *utn,
                 * are no devices currently which have multiple tables accepting
                 * the same tunnel type, and false positives are okay.
                 */
-               __set_bit(i, &utn->missed);
+               set_bit(i, &utn->missed);
        }

        return false;
@@ -552,7 +554,7 @@ static void __udp_tunnel_nic_reset_ntf(struct
net_device *dev)

        mutex_lock(&utn->lock);

-       utn->need_sync = false;
+       clear_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags);
        for (i = 0; i < utn->n_tables; i++)
                for (j = 0; j < info->tables[i].n_entries; j++) {
                        struct udp_tunnel_nic_table_entry *entry;
@@ -696,8 +698,8 @@ udp_tunnel_nic_flush(struct net_device *dev,
struct udp_tunnel_nic *utn)
        for (i = 0; i < utn->n_tables; i++)
                memset(utn->entries[i], 0, array_size(info->tables[i].n_entries,
                                                      sizeof(**utn->entries)));
-       WARN_ON(utn->need_sync);
-       utn->need_replay = 0;
+       WARN_ON(test_bit(UDP_TUNNEL_NIC_NEED_SYNC, &utn->flags));
+       clear_bit(UDP_TUNNEL_NIC_NEED_REPLAY, &utn->flags);
 }

 static void
@@ -713,8 +715,8 @@ udp_tunnel_nic_replay(struct net_device *dev,
struct udp_tunnel_nic *utn)
        for (i = 0; i < utn->n_tables; i++)
                for (j = 0; j < info->tables[i].n_entries; j++)
                        udp_tunnel_nic_entry_freeze_used(&utn->entries[i][j]);
-       utn->missed = 0;
-       utn->need_replay = 0;
+       bitmap_zero(&utn->missed, UDP_TUNNEL_NIC_MAX_TABLES);
+       clear_bit(UDP_TUNNEL_NIC_NEED_REPLAY, &utn->flags);

        if (!info->shared) {
                udp_tunnel_get_rx_info(dev);
@@ -736,10 +738,10 @@ static void
udp_tunnel_nic_device_sync_work(struct work_struct *work)
        rtnl_lock();
        mutex_lock(&utn->lock);

-       utn->work_pending = 0;
+       clear_bit(UDP_TUNNEL_NIC_WORK_PENDING, &utn->flags);
        __udp_tunnel_nic_device_sync(utn->dev, utn);

-       if (utn->need_replay)
+       if (test_bit(UDP_TUNNEL_NIC_NEED_REPLAY, &utn->flags))
                udp_tunnel_nic_replay(utn->dev, utn);

        mutex_unlock(&utn->lock);
@@ -866,6 +868,11 @@ udp_tunnel_nic_unregister(struct net_device *dev,
struct udp_tunnel_nic *utn)

        udp_tunnel_nic_lock(dev);

+       if (test_bit(UDP_TUNNEL_NIC_WORK_PENDING, &utn->flags)) {
+               udp_tunnel_nic_unlock(dev);
+               return;
+       }
+
        /* For a shared table remove this dev from the list of sharing devices
         * and if there are other devices just detach.
         */
@@ -901,12 +908,6 @@ udp_tunnel_nic_unregister(struct net_device *dev,
struct udp_tunnel_nic *utn)
        udp_tunnel_nic_flush(dev, utn);
        udp_tunnel_nic_unlock(dev);

-       /* Wait for the work to be done using the state, netdev core will
-        * retry unregister until we give up our reference on this device.
-        */
-       if (utn->work_pending)
-               return;
-
        udp_tunnel_nic_free(utn);
 release_dev:
        dev->udp_tunnel_nic = NULL;

^ permalink raw reply

* [PATCH V2 net 1/4] net: hns3: unify copper port ksettings configuration path
From: Jijie Shao @ 2026-06-24 14:13 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
  Cc: shenjian15, liuyonglong, chenhao418, huangdonghua3, yangshuaisong,
	netdev, linux-kernel, shaojijie
In-Reply-To: <20260624141319.271439-1-shaojijie@huawei.com>

From: Shuaisong Yang <yangshuaisong@h-partners.com>

Refactor hns3_set_link_ksettings() and hclge_set_phy_link_ksettings()
to unify the configuration path for copper ports.

Previously, netdevs with a native kernel PHY attached bypassed the main
MAC parameter caching logic and returned early via
phy_ethtool_ksettings_set(). This prevented the driver from updating
hdev->hw.mac.req_xxx variables for kernel PHY setups, leaving them
out-of-sync during reset recovery.

Clean this up by routing all copper port configurations through
ops->set_phy_link_ksettings(), and perform driver-level or kernel-level
PHY arbitration inside hclge_set_phy_link_ksettings() via
hnae3_dev_phy_imp_supported(). This ensures that the user's intended link
profiles (req_speed, req_duplex, req_autoneg) are uniformly recorded
across all copper and fiber deployment topologies, laying the groundwork
for stable reset recovery.

For copper ports where neither IMP firmware nor a kernel PHY is available
(e.g. PHY_INEXISTENT), hclge_set_phy_link_ksettings() returns -ENODEV.
In hns3_set_link_ksettings(), this is caught so the configuration falls
through to the existing MAC-level path (check_ksettings_param ->
cfg_mac_speed_dup_h), preserving compatibility with PHY-less copper
deployments.

Signed-off-by: Shuaisong Yang <yangshuaisong@h-partners.com>
Signed-off-by: Jijie Shao <shaojijie@huawei.com>
---
Changes in V2:
- Add NULL phydev guard in hclge_set_phy_link_ksettings() to prevent
  kernel panic when firmware reports PHY_INEXISTENT on a copper port.
- For PHY_INEXISTENT copper ports, return -ENODEV from
  hclge_set_phy_link_ksettings() and catch it in
  hns3_set_link_ksettings() to fall through to the existing MAC-level
  path, preserving compatibility with PHY-less copper deployments.
- Preserve the 1000BASE-T forced-mode (SPEED_1000 + AUTONEG_DISABLE)
  rejection in the kernel PHY path, closing a validation gap.
---
 .../ethernet/hisilicon/hns3/hns3_ethtool.c    | 31 +++++++++----------
 .../hisilicon/hns3/hns3pf/hclge_main.c        | 28 +++++++++++++++--
 2 files changed, 40 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 9cb7ce9fd311..64bee0e78db3 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -811,12 +811,11 @@ static int hns3_get_link_ksettings(struct net_device *netdev,
 }
 
 static int hns3_check_ksettings_param(const struct net_device *netdev,
-				      const struct ethtool_link_ksettings *cmd)
+				      const struct ethtool_link_ksettings *cmd,
+				      u8 media_type)
 {
 	struct hnae3_handle *handle = hns3_get_handle(netdev);
 	const struct hnae3_ae_ops *ops = hns3_get_ops(handle);
-	u8 module_type = HNAE3_MODULE_TYPE_UNKNOWN;
-	u8 media_type = HNAE3_MEDIA_TYPE_UNKNOWN;
 	u32 lane_num;
 	u8 autoneg;
 	u32 speed;
@@ -836,9 +835,6 @@ static int hns3_check_ksettings_param(const struct net_device *netdev,
 			return 0;
 	}
 
-	if (ops->get_media_type)
-		ops->get_media_type(handle, &media_type, &module_type);
-
 	if (cmd->base.duplex == DUPLEX_HALF &&
 	    media_type != HNAE3_MEDIA_TYPE_COPPER) {
 		netdev_err(netdev,
@@ -863,6 +859,8 @@ static int hns3_set_link_ksettings(struct net_device *netdev,
 	struct hnae3_handle *handle = hns3_get_handle(netdev);
 	struct hnae3_ae_dev *ae_dev = hns3_get_ae_dev(handle);
 	const struct hnae3_ae_ops *ops = hns3_get_ops(handle);
+	u8 module_type = HNAE3_MODULE_TYPE_UNKNOWN;
+	u8 media_type = HNAE3_MEDIA_TYPE_UNKNOWN;
 	int ret;
 
 	/* Chip don't support this mode. */
@@ -878,22 +876,23 @@ static int hns3_set_link_ksettings(struct net_device *netdev,
 		  cmd->base.autoneg, cmd->base.speed, cmd->base.duplex,
 		  cmd->lanes);
 
-	/* Only support ksettings_set for netdev with phy attached for now */
-	if (netdev->phydev) {
-		if (cmd->base.speed == SPEED_1000 &&
-		    cmd->base.autoneg == AUTONEG_DISABLE)
-			return -EINVAL;
+	if (!ops->get_media_type)
+		return -EOPNOTSUPP;
+	ops->get_media_type(handle, &media_type, &module_type);
 
-		return phy_ethtool_ksettings_set(netdev->phydev, cmd);
-	} else if (test_bit(HNAE3_DEV_SUPPORT_PHY_IMP_B, ae_dev->caps) &&
-		   ops->set_phy_link_ksettings) {
-		return ops->set_phy_link_ksettings(handle, cmd);
+	if (media_type == HNAE3_MEDIA_TYPE_COPPER) {
+		if (!ops->set_phy_link_ksettings)
+			return -EOPNOTSUPP;
+		ret = ops->set_phy_link_ksettings(handle, cmd);
+		if (ret != -ENODEV)
+			return ret;
+		/* PHY_INEXISTENT, use MAC-level configuration */
 	}
 
 	if (ae_dev->dev_version < HNAE3_DEVICE_VERSION_V2)
 		return -EOPNOTSUPP;
 
-	ret = hns3_check_ksettings_param(netdev, cmd);
+	ret = hns3_check_ksettings_param(netdev, cmd, media_type);
 	if (ret)
 		return ret;
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 2f1984930da2..9fe6bc02d71e 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3285,8 +3285,8 @@ static int hclge_get_phy_link_ksettings(struct hnae3_handle *handle,
 }
 
 static int
-hclge_set_phy_link_ksettings(struct hnae3_handle *handle,
-			     const struct ethtool_link_ksettings *cmd)
+hclge_ethtool_ksettings_set(struct hnae3_handle *handle,
+			    const struct ethtool_link_ksettings *cmd)
 {
 	struct hclge_desc desc[HCLGE_PHY_LINK_SETTING_BD_NUM];
 	struct hclge_vport *vport = hclge_get_vport(handle);
@@ -3327,10 +3327,32 @@ hclge_set_phy_link_ksettings(struct hnae3_handle *handle,
 		return ret;
 	}
 
+	linkmode_copy(hdev->hw.mac.advertising, cmd->link_modes.advertising);
+	return 0;
+}
+
+static int
+hclge_set_phy_link_ksettings(struct hnae3_handle *handle,
+			     const struct ethtool_link_ksettings *cmd)
+{
+	struct hclge_vport *vport = hclge_get_vport(handle);
+	struct hclge_dev *hdev = vport->back;
+	int ret = -ENODEV;
+
+	if (hnae3_dev_phy_imp_supported(hdev)) {
+		ret = hclge_ethtool_ksettings_set(handle, cmd);
+	} else if (handle->netdev->phydev) {
+		if (cmd->base.speed == SPEED_1000 &&
+		    cmd->base.autoneg == AUTONEG_DISABLE)
+			return -EINVAL;
+		ret = phy_ethtool_ksettings_set(handle->netdev->phydev, cmd);
+	}
+	if (ret)
+		return ret;
+
 	hdev->hw.mac.req_autoneg = cmd->base.autoneg;
 	hdev->hw.mac.req_speed = cmd->base.speed;
 	hdev->hw.mac.req_duplex = cmd->base.duplex;
-	linkmode_copy(hdev->hw.mac.advertising, cmd->link_modes.advertising);
 
 	return 0;
 }
-- 
2.33.0


^ permalink raw reply related

* [PATCH V2 net 2/4] net: hns3: refactor MAC autoneg and speed configuration
From: Jijie Shao @ 2026-06-24 14:13 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
  Cc: shenjian15, liuyonglong, chenhao418, huangdonghua3, yangshuaisong,
	netdev, linux-kernel, shaojijie
In-Reply-To: <20260624141319.271439-1-shaojijie@huawei.com>

From: Shuaisong Yang <yangshuaisong@h-partners.com>

Extract the MAC autoneg and speed/duplex/lane configuration logic out
of hclge_mac_init() and encapsulate it into a new dedicated helper
function hclge_set_autoneg_speed_dup().

In the init path (hclge_init_ae_dev), this helper is now called after
hclge_update_port_info() so that firmware-reported autoneg values are
already populated before applying the link configuration.

Introduce a separate req_lane_num field in struct hclge_mac to isolate
the user-requested lane count from mac.lane_num, which firmware may
overwrite via hclge_get_sfp_info() with stale values from a prior link
lifecycle (e.g., lane_num=4 from 100G). During probe, req_lane_num is
initialized to 0, which instructs firmware to auto-select the correct
lane count for the current speed, rather than reusing the firmware-
reported mac.lane_num that may be inconsistent with the target speed.
This prevents probe failures from mismatched (speed, lane_num) pairs.

In the reset path (hclge_reset_ae_dev), it runs immediately after
hclge_mac_init(), using the previously cached req_* values to restore
the link without re-querying firmware.

Signed-off-by: Shuaisong Yang <yangshuaisong@h-partners.com>
Signed-off-by: Jijie Shao <shaojijie@huawei.com>
---
Changes in V2:
- Squashed the former patch 5 ("fix init failure caused by lane_num
  contamination") into this patch. The req_lane_num separation is
  introduced here to avoid a bisect-time regression where an
  intermediate commit could fail probe with an inconsistent
  (speed, lane_num) pair.
- Rewrote the commit message to accurately describe the init/reset
  path asymmetry and the req_lane_num rationale.
---
 .../hisilicon/hns3/hns3pf/hclge_main.c        | 55 ++++++++++++++-----
 .../hisilicon/hns3/hns3pf/hclge_main.h        |  1 +
 2 files changed, 42 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 9fe6bc02d71e..fb12ba77228c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1504,6 +1504,11 @@ static int hclge_configure(struct hclge_dev *hdev)
 	hdev->hw.mac.req_autoneg = AUTONEG_ENABLE;
 	hdev->hw.mac.req_duplex = DUPLEX_FULL;
 
+	/* When lane_num is 0, the firmware will automatically
+	 * select the appropriate lane_num based on the speed.
+	 */
+	hdev->hw.mac.req_lane_num = 0;
+
 	hclge_parse_link_mode(hdev, cfg.speed_ability);
 
 	hdev->hw.mac.max_speed = hclge_get_max_speed(cfg.speed_ability);
@@ -2579,6 +2584,7 @@ static int hclge_cfg_mac_speed_dup_h(struct hnae3_handle *handle, int speed,
 	if (ret)
 		return ret;
 
+	hdev->hw.mac.req_lane_num = lane_num;
 	hdev->hw.mac.req_speed = (u32)speed;
 	hdev->hw.mac.req_duplex = duplex;
 
@@ -2884,20 +2890,6 @@ static int hclge_mac_init(struct hclge_dev *hdev)
 	if (!test_bit(HCLGE_STATE_RST_HANDLING, &hdev->state))
 		hdev->hw.mac.duplex = HCLGE_MAC_FULL;
 
-	if (hdev->hw.mac.support_autoneg) {
-		ret = hclge_set_autoneg_en(hdev, hdev->hw.mac.autoneg);
-		if (ret)
-			return ret;
-	}
-
-	if (!hdev->hw.mac.autoneg) {
-		ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.req_speed,
-						 hdev->hw.mac.req_duplex,
-						 hdev->hw.mac.lane_num);
-		if (ret)
-			return ret;
-	}
-
 	mac->link = 0;
 
 	if (mac->user_fec_mode & BIT(HNAE3_FEC_USER_DEF)) {
@@ -9316,6 +9308,27 @@ static int hclge_set_wol(struct hnae3_handle *handle,
 	return ret;
 }
 
+static int hclge_set_autoneg_speed_dup(struct hclge_dev *hdev)
+{
+	int ret;
+
+	if (hdev->hw.mac.support_autoneg) {
+		ret = hclge_set_autoneg_en(hdev, hdev->hw.mac.autoneg);
+		if (ret)
+			return ret;
+	}
+
+	if (!hdev->hw.mac.autoneg) {
+		ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.req_speed,
+						 hdev->hw.mac.req_duplex,
+						 hdev->hw.mac.req_lane_num);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
 static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 {
 	struct pci_dev *pdev = ae_dev->pdev;
@@ -9477,6 +9490,13 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 	if (ret)
 		goto err_ptp_uninit;
 
+	ret = hclge_set_autoneg_speed_dup(hdev);
+	if (ret) {
+		dev_err(&pdev->dev,
+			"failed to set autoneg speed duplex, ret = %d\n", ret);
+		goto err_ptp_uninit;
+	}
+
 	INIT_KFIFO(hdev->mac_tnl_log);
 
 	hclge_dcb_ops_set(hdev);
@@ -9807,6 +9827,13 @@ static int hclge_reset_ae_dev(struct hnae3_ae_dev *ae_dev)
 		return ret;
 	}
 
+	ret = hclge_set_autoneg_speed_dup(hdev);
+	if (ret) {
+		dev_err(&pdev->dev,
+			"failed to set autoneg speed duplex, ret = %d\n", ret);
+		return ret;
+	}
+
 	ret = hclge_tp_port_init(hdev);
 	if (ret) {
 		dev_err(&pdev->dev, "failed to init tp port, ret = %d\n",
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index 87adeb64e6ea..7419481422c3 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
@@ -287,6 +287,7 @@ struct hclge_mac {
 	u8 support_autoneg;
 	u8 speed_type;	/* 0: sfp speed, 1: active speed */
 	u8 lane_num;
+	u8 req_lane_num;
 	u32 speed;
 	u32 req_speed;
 	u32 max_speed;
-- 
2.33.0


^ permalink raw reply related

* [PATCH V2 net 0/4] net: hns3: fix configuration deadlocks and refactor link setup
From: Jijie Shao @ 2026-06-24 14:13 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
  Cc: shenjian15, liuyonglong, chenhao418, huangdonghua3, yangshuaisong,
	netdev, linux-kernel, shaojijie

This patch series addresses a sequence of link configuration deadlocks
and parameter contamination issues in the hns3 network driver, which
typically occur during hardware resets or driver initialization under
specific user-configured scenarios.

The bugs stem from asynchronous discrepancies between the MAC state
machine and cached user requests during sudden hardware resets, leading
to invalid parameter combinations or frozen registers.

Changes in V2:
- Squashed the former patch 5 ("fix init failure caused by lane_num
  contamination") into patch 2, introducing the req_lane_num separation
  directly where the helper is created. This avoids a bisect-time
  regression where an intermediate commit could fail probe with an
  inconsistent (speed, lane_num) pair.
- Added a NULL phydev guard in patch 1 (hclge_set_phy_link_ksettings)
  to prevent a kernel panic when firmware reports PHY_INEXISTENT on a
  copper port. The previous netdev->phydev check was lost during the
  ethtool refactor.
- In patch 1, for copper ports where neither IMP firmware nor a kernel
  PHY is available (e.g. PHY_INEXISTENT), hclge_set_phy_link_ksettings()
  now returns -ENODEV, and hns3_set_link_ksettings() catches this error
  to proceed to the existing MAC-level path (check_ksettings_param
  -> cfg_mac_speed_dup_h), preserving compatibility with PHY-less copper
  deployments.
- Preserved the 1000BASE-T forced-mode rejection in the kernel PHY
  path inside the new hclge_set_phy_link_ksettings() wrapper, closing
  a gap identified in community review.
- Fixed a link-loss regression in patch 4 where fiber ports in forced
  mode would be configured with the static default_speed instead of the
  firmware-probed SFP speed, by synchronizing req_speed from mac.speed
  when req_autoneg is overridden to AUTONEG_DISABLE.
- Rewrote the commit message of patch 2 to accurately describe the
  init/reset path asymmetry and the req_lane_num rationale.

The series is organized as follows:
- Patch 1 refactors the ethtool link settings entry path to unify copper
  port handling (both native kernel PHY_LIB and firmware-controlled PHY)
  and ensures req_xxx configurations are uniformly saved across all modes.
  For PHY_INEXISTENT copper ports, -ENODEV is returned to allow fallthrough
  to MAC-level configuration.
- Patch 2 refactors the MAC initialization by extracting the autoneg and
  speed configuration logic out of hclge_mac_init() into a dedicated
  helper function, and introduces req_lane_num to isolate the user-
  requested lane count from firmware-overwritten mac.lane_num.
- Patch 3 fixes a permanent link-down deadlock after a reset by ensuring
  that the driver caches and uses the user's intended autoneg/speed
  settings (req_xxx) rather than unsynchronized runtime states or
  SPEED_UNKNOWN tokens.
- Patch 4 fixes a link loss issue on optical ports during initialization
  by differentiating autoneg default values between copper and fiber
  media types, and synchronizing req_speed with the firmware-probed
  SFP speed when forced mode is detected.

Shuaisong Yang (4):
  net: hns3: unify copper port ksettings configuration path
  net: hns3: refactor MAC autoneg and speed configuration
  net: hns3: fix permanent link down deadlock after reset
  net: hns3: differentiate autoneg default values between copper and
    fiber

 .../ethernet/hisilicon/hns3/hns3_ethtool.c    |  31 +++--
 .../hisilicon/hns3/hns3pf/hclge_main.c        | 108 ++++++++++++++----
 .../hisilicon/hns3/hns3pf/hclge_main.h        |   1 +
 3 files changed, 102 insertions(+), 38 deletions(-)


base-commit: d87363b0edfc7504ff2b144fe4cdd8154f90f42e
--
2.33.0

^ permalink raw reply

* [PATCH V2 net 3/4] net: hns3: fix permanent link down deadlock after reset
From: Jijie Shao @ 2026-06-24 14:13 UTC (permalink / raw)
  To: davem, edumazet, kuba, pabeni, andrew+netdev, horms
  Cc: shenjian15, liuyonglong, chenhao418, huangdonghua3, yangshuaisong,
	netdev, linux-kernel, shaojijie
In-Reply-To: <20260624141319.271439-1-shaojijie@huawei.com>

From: Shuaisong Yang <yangshuaisong@h-partners.com>

Fix a critical race condition deadlock where the network interface
remains permanently Link Down after a hardware reset under specific
ethtool sequences.

This issue exclusively manifests in firmware-controlled PHY topologies
where the driver relies on the IMP firmware to arbitrate link parameters.
Standard devices driven by the kernel's native PHY_LIB are unaffected.

The deadlock occurs via the following path:
1. User disables autoneg and forces an unmatched speed, forcing link
   down: `ethtool -s ethx autoneg off speed 10 duplex full`
2. User re-enables autoneg: `ethtool -s ethx autoneg on`. The netdev
   stack passes cmd->base.speed as SPEED_UNKNOWN (0xffffffff).
3. Driver saves req_autoneg=1, but before the interface can link up,
   a hardware reset is triggered.
4. During reset recovery, MAC init reads the un-synchronized runtime
   state mac.autoneg (which is still 0/OFF), misinterprets it as
   forced mode, and pushes the cached SPEED_UNKNOWN into the hardware
   registers, causing the MAC firmware state machine to freeze.
   Meanwhile, PHY init reads req_autoneg=1 and enables PHY autoneg.

Since the MAC is frozen with 0xffffffff and PHY is running autoneg,
they mismatch permanently.

Fix this by:
1. Intercepting SPEED_UNKNOWN/DUPLEX_UNKNOWN in
   hclge_set_phy_link_ksettings() and hclge_cfg_mac_speed_dup_h() to
   prevent it from corrupting the driver's cached valid configuration.
2. Saving req_autoneg in hclge_set_autoneg().
3. Aligning the state judgment in hclge_set_autoneg_speed_dup() to use
   req_autoneg instead of the un-synchronized runtime mac.autoneg,
   ensuring both MAC and PHY consistently enter the autoneg branch to
   eliminate configuration discrepancies during reset recovery.

Fixes: 05eb60e9648c ("net: hns3: using user configure after hardware reset")
Signed-off-by: Shuaisong Yang <yangshuaisong@h-partners.com>
Signed-off-by: Jijie Shao <shaojijie@huawei.com>
---
 .../hisilicon/hns3/hns3pf/hclge_main.c        | 22 +++++++++++++------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index fb12ba77228c..d176100d3e4c 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2585,8 +2585,10 @@ static int hclge_cfg_mac_speed_dup_h(struct hnae3_handle *handle, int speed,
 		return ret;
 
 	hdev->hw.mac.req_lane_num = lane_num;
-	hdev->hw.mac.req_speed = (u32)speed;
-	hdev->hw.mac.req_duplex = duplex;
+	if (speed != SPEED_UNKNOWN)
+		hdev->hw.mac.req_speed = (u32)speed;
+	if (duplex != DUPLEX_UNKNOWN)
+		hdev->hw.mac.req_duplex = duplex;
 
 	return 0;
 }
@@ -2617,6 +2619,7 @@ static int hclge_set_autoneg(struct hnae3_handle *handle, bool enable)
 {
 	struct hclge_vport *vport = hclge_get_vport(handle);
 	struct hclge_dev *hdev = vport->back;
+	int ret;
 
 	if (!hdev->hw.mac.support_autoneg) {
 		if (enable) {
@@ -2628,7 +2631,10 @@ static int hclge_set_autoneg(struct hnae3_handle *handle, bool enable)
 		}
 	}
 
-	return hclge_set_autoneg_en(hdev, enable);
+	ret = hclge_set_autoneg_en(hdev, enable);
+	if (!ret)
+		hdev->hw.mac.req_autoneg = enable;
+	return ret;
 }
 
 static int hclge_get_autoneg(struct hnae3_handle *handle)
@@ -3343,8 +3349,10 @@ hclge_set_phy_link_ksettings(struct hnae3_handle *handle,
 		return ret;
 
 	hdev->hw.mac.req_autoneg = cmd->base.autoneg;
-	hdev->hw.mac.req_speed = cmd->base.speed;
-	hdev->hw.mac.req_duplex = cmd->base.duplex;
+	if (cmd->base.speed != SPEED_UNKNOWN)
+		hdev->hw.mac.req_speed = cmd->base.speed;
+	if (cmd->base.duplex != DUPLEX_UNKNOWN)
+		hdev->hw.mac.req_duplex = cmd->base.duplex;
 
 	return 0;
 }
@@ -9313,12 +9321,12 @@ static int hclge_set_autoneg_speed_dup(struct hclge_dev *hdev)
 	int ret;
 
 	if (hdev->hw.mac.support_autoneg) {
-		ret = hclge_set_autoneg_en(hdev, hdev->hw.mac.autoneg);
+		ret = hclge_set_autoneg_en(hdev, hdev->hw.mac.req_autoneg);
 		if (ret)
 			return ret;
 	}
 
-	if (!hdev->hw.mac.autoneg) {
+	if (!hdev->hw.mac.req_autoneg) {
 		ret = hclge_cfg_mac_speed_dup_hw(hdev, hdev->hw.mac.req_speed,
 						 hdev->hw.mac.req_duplex,
 						 hdev->hw.mac.req_lane_num);
-- 
2.33.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox