Linux block layer

Linux block layer
 help / color / mirror / Atom feed

* Re: [PATCH v3 4/6] rust: drm: set fops.owner from driver module pointer
From: Gary Guo @ 2026-06-22 10:48 UTC (permalink / raw)
  To: Alvin Sun, Miguel Ojeda, Boqun Feng, Gary Guo,
	Björn Roy Baron, Benno Lossin, Andreas Hindborg, Alice Ryhl,
	Trevor Gross, Danilo Krummrich, Luis Chamberlain, Petr Pavlu,
	Daniel Gomez, Sami Tolvanen, Aaron Tomlin, Greg Kroah-Hartman,
	Rafael J. Wysocki, David Airlie, Simona Vetter, Daniel Almeida,
	Arnd Bergmann, Brendan Higgins, David Gow, Rae Moar, Breno Leitao,
	Jens Axboe
  Cc: rust-for-linux, linux-modules, driver-core, dri-devel, nova-gpu,
	linux-kselftest, kunit-dev, linux-block, linux-kernel
In-Reply-To: <20260622-fix-fops-owner-v3-4-49d45cb37032@linux.dev>

On Mon Jun 22, 2026 at 3:44 AM BST, Alvin Sun wrote:
> Change `create_fops()` to accept an owner module pointer instead of
> hardcoding `null_mut()`, ensuring the kernel correctly tracks the
> module owning the DRM device's file operations.
> 
> Signed-off-by: Alvin Sun <alvin.sun@linux.dev>

Reviewed-by: Gary Guo <gary@garyguo.net>

How are the patch logistics going to be handled? This series should probably be
routed via the rust tree? Perhaps as fixes?

Best,
Gary

> ---
>  rust/kernel/drm/device.rs  | 3 ++-
>  rust/kernel/drm/gem/mod.rs | 4 ++--
>  2 files changed, 4 insertions(+), 3 deletions(-)


^ permalink raw reply

* Re: [PATCH v3 1/6] rust: module: add `THIS_MODULE` const to `ModuleMetadata` trait
From: Gary Guo @ 2026-06-22 10:50 UTC (permalink / raw)
  To: Alvin Sun, Miguel Ojeda, Boqun Feng, Gary Guo,
	Björn Roy Baron, Benno Lossin, Andreas Hindborg, Alice Ryhl,
	Trevor Gross, Danilo Krummrich, Luis Chamberlain, Petr Pavlu,
	Daniel Gomez, Sami Tolvanen, Aaron Tomlin, Greg Kroah-Hartman,
	Rafael J. Wysocki, David Airlie, Simona Vetter, Daniel Almeida,
	Arnd Bergmann, Brendan Higgins, David Gow, Rae Moar, Breno Leitao,
	Jens Axboe
  Cc: rust-for-linux, linux-modules, driver-core, dri-devel, nova-gpu,
	linux-kselftest, kunit-dev, linux-block, linux-kernel
In-Reply-To: <20260622-fix-fops-owner-v3-1-49d45cb37032@linux.dev>

On Mon Jun 22, 2026 at 3:44 AM BST, Alvin Sun wrote:
> Since `const_refs_to_static` has been stable as of the MSRV bump, a
> `ThisModule` pointer can now be used in const contexts.
>
> Add a `THIS_MODULE` const to the `ModuleMetadata` trait so that modules
> can provide their `ThisModule` pointer in const contexts such as static
> `file_operations`.
>
> Move the `THIS_MODULE` static from the `module!` macro into the
> `ModuleMetadata` impl, add a `this_module()` helper, and update `__init`
> to use it.
>
> Signed-off-by: Alvin Sun <alvin.sun@linux.dev>
> ---
>  rust/kernel/lib.rs    |  8 ++++++++
>  rust/macros/module.rs | 34 +++++++++++++++++-----------------
>  2 files changed, 25 insertions(+), 17 deletions(-)
>
> diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
> index b72b2fbe046d6..50f5a7b5f028e 100644
> --- a/rust/kernel/lib.rs
> +++ b/rust/kernel/lib.rs
> @@ -184,6 +184,14 @@ fn init(module: &'static ThisModule) -> impl pin_init::PinInit<Self, error::Erro
>  pub trait ModuleMetadata {
>      /// The name of the module as specified in the `module!` macro.
>      const NAME: &'static crate::str::CStr;
> +
> +    /// The module's `THIS_MODULE` pointer.
> +    const THIS_MODULE: ThisModule;
> +}
> +
> +/// Returns a reference to the `THIS_MODULE` of the given module type.
> +pub const fn this_module<M: ModuleMetadata>() -> &'static ThisModule {
> +    &M::THIS_MODULE
>  }

Also, FWIW I think this should not be put in the crate root. Perhaps create a
modules.rs?

Best,
Gary

>  
>  /// Equivalent to `THIS_MODULE` in the C API.
> diff --git a/rust/macros/module.rs b/rust/macros/module.rs
> index 06c18e2075083..b9fdee2f2af47 100644
> --- a/rust/macros/module.rs
> +++ b/rust/macros/module.rs
> @@ -497,28 +497,28 @@ pub(crate) fn module(info: ModuleInfo) -> Result<TokenStream> {
>          /// Used by the printing macros, e.g. [`info!`].
>          const __LOG_PREFIX: &[u8] = #name_cstr.to_bytes_with_nul();
>  
> -        // SAFETY: `__this_module` is constructed by the kernel at load time and will not be
> -        // freed until the module is unloaded.
> -        #[cfg(MODULE)]
> -        static THIS_MODULE: ::kernel::ThisModule = unsafe {
> -            extern "C" {
> -                static __this_module: ::kernel::types::Opaque<::kernel::bindings::module>;
> -            };
> -
> -            ::kernel::ThisModule::from_ptr(__this_module.get())
> -        };
> -
> -        #[cfg(not(MODULE))]
> -        static THIS_MODULE: ::kernel::ThisModule = unsafe {
> -            ::kernel::ThisModule::from_ptr(::core::ptr::null_mut())
> -        };
> -
>          /// The `LocalModule` type is the type of the module created by `module!`,
>          /// `module_pci_driver!`, `module_platform_driver!`, etc.
>          type LocalModule = #type_;
>  
>          impl ::kernel::ModuleMetadata for #type_ {
>              const NAME: &'static ::kernel::str::CStr = #name_cstr;
> +
> +            #[cfg(MODULE)]
> +            const THIS_MODULE: ::kernel::ThisModule = {
> +                extern "C" {
> +                    static __this_module: ::kernel::types::Opaque<::kernel::bindings::module>;
> +                }
> +
> +                // SAFETY: `__this_module` is constructed by the kernel at load time
> +                // and lives until the module is unloaded.
> +                unsafe { ::kernel::ThisModule::from_ptr(__this_module.get()) }
> +            };
> +
> +            #[cfg(not(MODULE))]
> +            const THIS_MODULE: ::kernel::ThisModule = unsafe {
> +                ::kernel::ThisModule::from_ptr(::core::ptr::null_mut())
> +            };
>          }
>  
>          // Double nested modules, since then nobody can access the public items inside.
> @@ -616,7 +616,7 @@ pub extern "C" fn #ident_exit() {
>                  /// This function must only be called once.
>                  unsafe fn __init() -> ::kernel::ffi::c_int {
>                      let initer = <super::super::LocalModule as ::kernel::InPlaceModule>::init(
> -                        &super::super::THIS_MODULE
> +                        ::kernel::this_module::<super::super::LocalModule>()
>                      );
>                      // SAFETY: No data race, since `__MOD` can only be accessed by this module
>                      // and there only `__init` and `__exit` access it. These functions are only



^ permalink raw reply

* Re: [PATCH v3 0/7] Prepare mutable list iterators to cache cursor state
From: David Hildenbrand (Arm) @ 2026-06-22 11:27 UTC (permalink / raw)
  To: Alexei Starovoitov, Kaitao Cheng
  Cc: Andrew Morton, Jens Axboe, Tejun Heo, Alexander Viro,
	Christian Brauner, Alexei Starovoitov, Daniel Borkmann,
	Andrii Nakryiko, Johannes Weiner, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Thomas Gleixner,
	Juri Lelli, Vincent Guittot, Paul Moore, Andy Shevchenko,
	Paul E. McKenney, Shakeel Butt, Christian König,
	David Howells, Simona Vetter, Randy Dunlap, Luca Ceresoli,
	Philipp Stanner, linux-block, LKML,
	open list:CONTROL GROUP (CGROUP), linux-ntfs-dev, Linux-Fsdevel,
	io-uring, audit, bpf, Network Development, dri-devel,
	linux-perf-users, linux-trace-kernel, kexec, live-patching,
	linux-modules, Linux Crypto Mailing List, Linux Power Management,
	rcu, sched-ext, linux-mm, virtualization, damon,
	clang-built-linux, chengkaitao
In-Reply-To: <CAADnVQJmPWFT01b7DuLdtafv=8FyB84GYHNZ8zSTck+9Aw0JpA@mail.gmail.com>

On 6/22/26 07:28, Alexei Starovoitov wrote:
> On Sun, Jun 21, 2026 at 9:06 PM Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
>>
>> From: chengkaitao <chengkaitao@kylinos.cn>
>>
>> The list_for_each*_safe() helpers are used when the loop body may remove
>> the current entry.  Their current interface, however, forces every caller
>> to define a temporary cursor outside the macro and pass it in, even when
>> the caller never uses that cursor directly.  For most call sites this
>> extra cursor is just boilerplate required by the macro implementation.
>>
>> This is awkward because the saved next pointer is an internal detail of
>> the iteration.  Callers that only remove or move the current entry do not
>> need to spell it out.
>>
>> The _safe() suffix has also caused confusion.  Christian Koenig pointed
>> out that the name is easy to read as a thread-safe variant, especially
>> for beginners, even though it only means that the iterator keeps enough
>> state to tolerate removal of the current entry.  He suggested _mutable()
>> as a clearer description of what the loop permits.
>>
>> Add *_mutable() iterator variants for list, hlist and llist.  The new
>> helpers are variadic and support both forms.  In the common case, the
>> caller omits the temporary cursor and the macro creates a unique internal
>> cursor with typeof(pos) and __UNIQUE_ID().  If a loop really needs an
>> explicit temporary cursor, the caller can still pass it and the helper
>> keeps the existing *_safe() behaviour.
>>
>> For example, a call site may use the shorter form:
>>
>>   list_for_each_entry_mutable(pos, head, member)
>>
>> or keep the explicit temporary cursor form:
>>
>>   list_for_each_entry_mutable(pos, tmp, head, member)
>>
>> The existing *_safe() helpers remain available for compatibility.  This
>> series only converts users in mm, block, kernel, init and io_uring.  If
>> this approach looks acceptable, the remaining users can be converted in
>> follow-up series.
>>
>> Changes in v3 (Christian König, Andy Shevchenko):
>> - Convert safe list walks to mutable iterators
>>
>> Changes in v2 (Muchun Song, Andy Shevchenko):
>> - Drop the list_for_each_entry_mutable*() helpers from v1 and make the
>>   cursor change directly in the existing list_for_each_entry*() helpers.
>> - Open-code special list walks that rely on updating the loop cursor in
>>   the body, preserving their existing traversal semantics.
>>
>> Link to v2:
>> https://lore.kernel.org/all/20260609061347.93688-1-kaitao.cheng@linux.dev/
>>
>> Link to v1:
>> https://lore.kernel.org/all/20260529082149.76764-1-kaitao.cheng@linux.dev/
>>
>> Kaitao Cheng (7):
>>   list: Add mutable iterator variants
>>   llist: Add mutable iterator variants
>>   mm: Use mutable list iterators
>>   block: Use mutable list iterators
>>   kernel: Use mutable list iterators
>>   initramfs: Use mutable list iterator
>>   io_uring: Use mutable list iterators
>>
>>  block/bfq-iosched.c                 |  17 +-
>>  block/blk-cgroup.c                  |  12 +-
>>  block/blk-flush.c                   |   4 +-
>>  block/blk-iocost.c                  |  18 +-
>>  block/blk-mq.c                      |   8 +-
>>  block/blk-throttle.c                |   4 +-
>>  block/kyber-iosched.c               |   4 +-
>>  block/partitions/ldm.c              |   8 +-
>>  block/sed-opal.c                    |   4 +-
>>  include/linux/list.h                | 269 ++++++++++++++++++++++++----
>>  include/linux/llist.h               |  81 +++++++--
>>  init/initramfs.c                    |   5 +-
>>  io_uring/cancel.c                   |   6 +-
>>  io_uring/poll.c                     |   3 +-
>>  io_uring/rw.c                       |   4 +-
>>  io_uring/timeout.c                  |   8 +-
>>  io_uring/uring_cmd.c                |   3 +-
>>  kernel/audit_tree.c                 |   4 +-
>>  kernel/audit_watch.c                |  16 +-
>>  kernel/auditfilter.c                |   4 +-
>>  kernel/auditsc.c                    |   4 +-
>>  kernel/bpf/arena.c                  |  10 +-
>>  kernel/bpf/arraymap.c               |   8 +-
>>  kernel/bpf/bpf_local_storage.c      |   3 +-
>>  kernel/bpf/bpf_lru_list.c           |  25 ++-
>>  kernel/bpf/btf.c                    |  18 +-
>>  kernel/bpf/cgroup.c                 |   7 +-
>>  kernel/bpf/cpumap.c                 |   4 +-
>>  kernel/bpf/devmap.c                 |  10 +-
>>  kernel/bpf/helpers.c                |   8 +-
>>  kernel/bpf/local_storage.c          |   4 +-
>>  kernel/bpf/memalloc.c               |  16 +-
>>  kernel/bpf/offload.c                |   8 +-
>>  kernel/bpf/states.c                 |   4 +-
>>  kernel/bpf/stream.c                 |   4 +-
>>  kernel/bpf/verifier.c               |   6 +-
>>  kernel/cgroup/cgroup-v1.c           |   4 +-
>>  kernel/cgroup/cgroup.c              |  54 +++---
>>  kernel/cgroup/dmem.c                |  12 +-
>>  kernel/cgroup/rdma.c                |   8 +-
>>  kernel/events/core.c                |  44 +++--
>>  kernel/events/uprobes.c             |  12 +-
>>  kernel/exit.c                       |   8 +-
>>  kernel/fail_function.c              |   4 +-
>>  kernel/gcov/clang.c                 |   4 +-
>>  kernel/irq_work.c                   |   4 +-
>>  kernel/kexec_core.c                 |   4 +-
>>  kernel/kprobes.c                    |  16 +-
>>  kernel/livepatch/core.c             |   4 +-
>>  kernel/livepatch/core.h             |   4 +-
>>  kernel/liveupdate/kho_block.c       |   4 +-
>>  kernel/liveupdate/luo_flb.c         |   4 +-
>>  kernel/locking/rwsem.c              |   2 +-
>>  kernel/locking/test-ww_mutex.c      |   2 +-
>>  kernel/module/main.c                |  11 +-
>>  kernel/padata.c                     |   4 +-
>>  kernel/power/snapshot.c             |   8 +-
>>  kernel/power/wakelock.c             |   4 +-
>>  kernel/printk/printk.c              |  11 +-
>>  kernel/ptrace.c                     |   4 +-
>>  kernel/rcu/rcutorture.c             |   3 +-
>>  kernel/rcu/tasks.h                  |   9 +-
>>  kernel/rcu/tree.c                   |   6 +-
>>  kernel/resource.c                   |   4 +-
>>  kernel/sched/core.c                 |   4 +-
>>  kernel/sched/ext.c                  |  22 +--
>>  kernel/sched/fair.c                 |  28 +--
>>  kernel/sched/topology.c             |   4 +-
>>  kernel/sched/wait.c                 |   4 +-
>>  kernel/seccomp.c                    |   4 +-
>>  kernel/signal.c                     |  11 +-
>>  kernel/smp.c                        |   4 +-
>>  kernel/taskstats.c                  |   8 +-
>>  kernel/time/clockevents.c           |   6 +-
>>  kernel/time/clocksource.c           |   4 +-
>>  kernel/time/posix-cpu-timers.c      |   4 +-
>>  kernel/time/posix-timers.c          |   3 +-
>>  kernel/torture.c                    |   3 +-
>>  kernel/trace/bpf_trace.c            |   4 +-
>>  kernel/trace/ftrace.c               |  49 +++--
>>  kernel/trace/ring_buffer.c          |  25 ++-
>>  kernel/trace/trace.c                |  12 +-
>>  kernel/trace/trace_dynevent.c       |   6 +-
>>  kernel/trace/trace_dynevent.h       |   5 +-
>>  kernel/trace/trace_events.c         |  35 ++--
>>  kernel/trace/trace_events_filter.c  |   4 +-
>>  kernel/trace/trace_events_hist.c    |   8 +-
>>  kernel/trace/trace_events_trigger.c |  17 +-
>>  kernel/trace/trace_events_user.c    |  16 +-
>>  kernel/trace/trace_stat.c           |   4 +-
>>  kernel/user-return-notifier.c       |   3 +-
>>  kernel/workqueue.c                  |  16 +-
>>  mm/backing-dev.c                    |   8 +-
>>  mm/balloon.c                        |   8 +-
>>  mm/cma.c                            |   4 +-
>>  mm/compaction.c                     |   4 +-
>>  mm/damon/core.c                     |   4 +-
>>  mm/damon/sysfs-schemes.c            |   4 +-
>>  mm/dmapool.c                        |   4 +-
>>  mm/huge_memory.c                    |   8 +-
>>  mm/hugetlb.c                        |  56 +++---
>>  mm/hugetlb_vmemmap.c                |  16 +-
>>  mm/khugepaged.c                     |  14 +-
>>  mm/kmemleak.c                       |   7 +-
>>  mm/ksm.c                            |  25 +--
>>  mm/list_lru.c                       |   4 +-
>>  mm/memcontrol-v1.c                  |   8 +-
>>  mm/memory-failure.c                 |  12 +-
>>  mm/memory-tiers.c                   |   4 +-
>>  mm/migrate.c                        |  23 ++-
>>  mm/mmu_notifier.c                   |   9 +-
>>  mm/page_alloc.c                     |   8 +-
>>  mm/page_reporting.c                 |   2 +-
>>  mm/percpu.c                         |  11 +-
>>  mm/pgtable-generic.c                |   4 +-
>>  mm/rmap.c                           |  10 +-
>>  mm/shmem.c                          |   9 +-
>>  mm/slab_common.c                    |  14 +-
>>  mm/slub.c                           |  33 ++--
>>  mm/swapfile.c                       |   4 +-
>>  mm/userfaultfd.c                    |  12 +-
>>  mm/vmalloc.c                        |  24 +--
>>  mm/vmscan.c                         |   7 +-
>>  mm/zsmalloc.c                       |   4 +-
>>  124 files changed, 875 insertions(+), 681 deletions(-)
> 
> Not sure what you were thinking, but this diff stat
> is not landable.

Agreed. If we decide we want this, I guess we should target per-subsystem
conversions.

If this goes through the MM tree, I would even appreciate doing this on a per-MM
component granularity.

(unless we have some magic "Linus converts all of them" script, which I doubt we
will have)

Is there a way forward to replace list_for_each_*_safe entirely, possibly just
reusing the old name but simply dropping the parameter?

-- 
Cheers,

David

^ permalink raw reply

* Re: [PATCH blktests] Fix _get_page_size()
From: Shin'ichiro Kawasaki @ 2026-06-22 11:38 UTC (permalink / raw)
  To: Bart Van Assche; +Cc: Jeff Moyer, linux-block, osandov, kch
In-Reply-To: <089e0281-4df8-4358-91ce-1f5cc0f0ec4b@acm.org>

On Jun 20, 2026 / 09:11, Bart Van Assche wrote:
> On 6/20/26 6:51 AM, Shin'ichiro Kawasaki wrote:
> > On Jun 20, 2026 / 05:55, Bart Van Assche wrote:
> > > On 6/20/26 3:26 AM, Shin'ichiro Kawasaki wrote:
> > > > This is a rather fundamental change, so I would like to ask opinions from
> > > > other blktests users, especially Omar and Chaitanya. What do you think about
> > > > the idea to add getconf to the requirement list?
> > > 
> > > CONFIG_PAGE_SHIFT was introduced in the Linux kernel in February 2024
> > > (commit ba89f9c8ccba ("arch: consolidate existing CONFIG_PAGE_SIZE_*KB
> > > definitions")). Older kernels had CONFIG_PAGE_SIZE_4KB,
> > > CONFIG_PAGE_SIZE_16KB, etc. This means that it is possible to derive the
> > > kernel page size from the kernel configuration file for all upstream and
> > > distro kernels, isn't it?
> > 
> > I checked that the commit is in the tag v6.9. My Debian bookworm system has
> > kernel v6.1, so the config file at /boot does not have CONFIG_PAGE_SHIFT, as
> > expected. But it does not have CONFIG_PAGE_SIZE_* either... I'm still afraid
> > that the kernel config file approach is not reliable.
> 
> Right, for older kernels CONFIG_PAGE_SIZE_*KB is only available for some
> but not for all supported architectures.
> 
> It is not clear to me where the desire to avoid the dependency on
> getconf comes from. As far as I know it is available on all Linux
> distros. Since it is typically included in the C library package it
> should not introduce a new dependency.

I think fewer dependencies are better in general, and I wanted to confirm that
this is fine for everybody. If nobody objects, I will create a
patch to add getconf to the requirement list.

^ permalink raw reply

* Re: [PATCH v3 1/6] rust: module: add `THIS_MODULE` const to `ModuleMetadata` trait
From: Alvin Sun @ 2026-06-22 12:42 UTC (permalink / raw)
  To: Gary Guo, Miguel Ojeda, Boqun Feng, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Alice Ryhl, Trevor Gross,
	Danilo Krummrich, Luis Chamberlain, Petr Pavlu, Daniel Gomez,
	Sami Tolvanen, Aaron Tomlin, Greg Kroah-Hartman,
	Rafael J. Wysocki, David Airlie, Simona Vetter, Daniel Almeida,
	Arnd Bergmann, Brendan Higgins, David Gow, Rae Moar, Breno Leitao,
	Jens Axboe
  Cc: rust-for-linux, linux-modules, driver-core, dri-devel, nova-gpu,
	linux-kselftest, kunit-dev, linux-block, linux-kernel
In-Reply-To: <DJFIKIXLML05.3KYOXUGZYJRDZ@garyguo.net>


On 6/22/26 18:42, Gary Guo wrote:
> On Mon Jun 22, 2026 at 3:44 AM BST, Alvin Sun wrote:
>> Since `const_refs_to_static` has been stable as of the MSRV bump, a
>> `ThisModule` pointer can now be used in const contexts.
>>
>> Add a `THIS_MODULE` const to the `ModuleMetadata` trait so that modules
>> can provide their `ThisModule` pointer in const contexts such as static
>> `file_operations`.
>>
>> Move the `THIS_MODULE` static from the `module!` macro into the
>> `ModuleMetadata` impl, add a `this_module()` helper, and update `__init`
>> to use it.
> Doesn't this break existing users of THIS_MODULE?

You are right, I missed binder. I will add a patch to update binder to use
`this_module::<LocalModule>()` in the next version.

While looking into this, I also noticed `gen_disk.rs` has a `// TODO: Set to
THIS_MODULE` with `owner` still set to `null_mut()`.

Best regards,
Alvin Sun

>
> Binder, rnull and configfs macros are using it.
>
> Best,
> Gary
>
>> Signed-off-by: Alvin Sun <alvin.sun@linux.dev>
>> ---
>>   rust/kernel/lib.rs    |  8 ++++++++
>>   rust/macros/module.rs | 34 +++++++++++++++++-----------------
>>   2 files changed, 25 insertions(+), 17 deletions(-)
>>
>> diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
>> index b72b2fbe046d6..50f5a7b5f028e 100644
>> --- a/rust/kernel/lib.rs
>> +++ b/rust/kernel/lib.rs
>> @@ -184,6 +184,14 @@ fn init(module: &'static ThisModule) -> impl pin_init::PinInit<Self, error::Erro
>>   pub trait ModuleMetadata {
>>       /// The name of the module as specified in the `module!` macro.
>>       const NAME: &'static crate::str::CStr;
>> +
>> +    /// The module's `THIS_MODULE` pointer.
>> +    const THIS_MODULE: ThisModule;
>> +}
>> +
>> +/// Returns a reference to the `THIS_MODULE` of the given module type.
>> +pub const fn this_module<M: ModuleMetadata>() -> &'static ThisModule {
>> +    &M::THIS_MODULE
>>   }
>>   
>>   /// Equivalent to `THIS_MODULE` in the C API.
>> diff --git a/rust/macros/module.rs b/rust/macros/module.rs
>> index 06c18e2075083..b9fdee2f2af47 100644
>> --- a/rust/macros/module.rs
>> +++ b/rust/macros/module.rs
>> @@ -497,28 +497,28 @@ pub(crate) fn module(info: ModuleInfo) -> Result<TokenStream> {
>>           /// Used by the printing macros, e.g. [`info!`].
>>           const __LOG_PREFIX: &[u8] = #name_cstr.to_bytes_with_nul();
>>   
>> -        // SAFETY: `__this_module` is constructed by the kernel at load time and will not be
>> -        // freed until the module is unloaded.
>> -        #[cfg(MODULE)]
>> -        static THIS_MODULE: ::kernel::ThisModule = unsafe {
>> -            extern "C" {
>> -                static __this_module: ::kernel::types::Opaque<::kernel::bindings::module>;
>> -            };
>> -
>> -            ::kernel::ThisModule::from_ptr(__this_module.get())
>> -        };
>> -
>> -        #[cfg(not(MODULE))]
>> -        static THIS_MODULE: ::kernel::ThisModule = unsafe {
>> -            ::kernel::ThisModule::from_ptr(::core::ptr::null_mut())
>> -        };
>> -
>>           /// The `LocalModule` type is the type of the module created by `module!`,
>>           /// `module_pci_driver!`, `module_platform_driver!`, etc.
>>           type LocalModule = #type_;
>>   
>>           impl ::kernel::ModuleMetadata for #type_ {
>>               const NAME: &'static ::kernel::str::CStr = #name_cstr;
>> +
>> +            #[cfg(MODULE)]
>> +            const THIS_MODULE: ::kernel::ThisModule = {
>> +                extern "C" {
>> +                    static __this_module: ::kernel::types::Opaque<::kernel::bindings::module>;
>> +                }
>> +
>> +                // SAFETY: `__this_module` is constructed by the kernel at load time
>> +                // and lives until the module is unloaded.
>> +                unsafe { ::kernel::ThisModule::from_ptr(__this_module.get()) }
>> +            };
>> +
>> +            #[cfg(not(MODULE))]
>> +            const THIS_MODULE: ::kernel::ThisModule = unsafe {
>> +                ::kernel::ThisModule::from_ptr(::core::ptr::null_mut())
>> +            };
>>           }
>>   
>>           // Double nested modules, since then nobody can access the public items inside.
>> @@ -616,7 +616,7 @@ pub extern "C" fn #ident_exit() {
>>                   /// This function must only be called once.
>>                   unsafe fn __init() -> ::kernel::ffi::c_int {
>>                       let initer = <super::super::LocalModule as ::kernel::InPlaceModule>::init(
>> -                        &super::super::THIS_MODULE
>> +                        ::kernel::this_module::<super::super::LocalModule>()
>>                       );
>>                       // SAFETY: No data race, since `__MOD` can only be accessed by this module
>>                       // and there only `__init` and `__exit` access it. These functions are only
>

^ permalink raw reply

* Re: [PATCH v3 1/6] rust: module: add `THIS_MODULE` const to `ModuleMetadata` trait
From: Alvin Sun @ 2026-06-22 12:52 UTC (permalink / raw)
  To: Gary Guo, Miguel Ojeda, Boqun Feng, Björn Roy Baron,
	Benno Lossin, Andreas Hindborg, Alice Ryhl, Trevor Gross,
	Danilo Krummrich, Luis Chamberlain, Petr Pavlu, Daniel Gomez,
	Sami Tolvanen, Aaron Tomlin, Greg Kroah-Hartman,
	Rafael J. Wysocki, David Airlie, Simona Vetter, Daniel Almeida,
	Arnd Bergmann, Brendan Higgins, David Gow, Rae Moar, Breno Leitao,
	Jens Axboe
  Cc: rust-for-linux, linux-modules, driver-core, dri-devel, nova-gpu,
	linux-kselftest, kunit-dev, linux-block, linux-kernel
In-Reply-To: <DJFIQPLOVO4T.1K8T0VZM30LDA@garyguo.net>


On 6/22/26 18:50, Gary Guo wrote:
> On Mon Jun 22, 2026 at 3:44 AM BST, Alvin Sun wrote:
>> Since `const_refs_to_static` has been stable as of the MSRV bump, a
>> `ThisModule` pointer can now be used in const contexts.
>>
>> Add a `THIS_MODULE` const to the `ModuleMetadata` trait so that modules
>> can provide their `ThisModule` pointer in const contexts such as static
>> `file_operations`.
>>
>> Move the `THIS_MODULE` static from the `module!` macro into the
>> `ModuleMetadata` impl, add a `this_module()` helper, and update `__init`
>> to use it.
>>
>> Signed-off-by: Alvin Sun <alvin.sun@linux.dev>
>> ---
>>   rust/kernel/lib.rs    |  8 ++++++++
>>   rust/macros/module.rs | 34 +++++++++++++++++-----------------
>>   2 files changed, 25 insertions(+), 17 deletions(-)
>>
>> diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
>> index b72b2fbe046d6..50f5a7b5f028e 100644
>> --- a/rust/kernel/lib.rs
>> +++ b/rust/kernel/lib.rs
>> @@ -184,6 +184,14 @@ fn init(module: &'static ThisModule) -> impl pin_init::PinInit<Self, error::Erro
>>   pub trait ModuleMetadata {
>>       /// The name of the module as specified in the `module!` macro.
>>       const NAME: &'static crate::str::CStr;
>> +
>> +    /// The module's `THIS_MODULE` pointer.
>> +    const THIS_MODULE: ThisModule;
>> +}
>> +
>> +/// Returns a reference to the `THIS_MODULE` of the given module type.
>> +pub const fn this_module<M: ModuleMetadata>() -> &'static ThisModule {
>> +    &M::THIS_MODULE
>>   }
> Also, FWIW I think this should not be put in the crate root. Perhaps create a
> modules.rs?

Makes sense. I'll create a new `module.rs` and move the module-related items
(`ModuleMetadata`, `ThisModule`, `this_module()`) there, then re-export from
`lib.rs`.

Best regards,
Alvin Sun

>
> Best,
> Gary
>
>>   
>>   /// Equivalent to `THIS_MODULE` in the C API.
>> diff --git a/rust/macros/module.rs b/rust/macros/module.rs
>> index 06c18e2075083..b9fdee2f2af47 100644
>> --- a/rust/macros/module.rs
>> +++ b/rust/macros/module.rs
>> @@ -497,28 +497,28 @@ pub(crate) fn module(info: ModuleInfo) -> Result<TokenStream> {
>>           /// Used by the printing macros, e.g. [`info!`].
>>           const __LOG_PREFIX: &[u8] = #name_cstr.to_bytes_with_nul();
>>   
>> -        // SAFETY: `__this_module` is constructed by the kernel at load time and will not be
>> -        // freed until the module is unloaded.
>> -        #[cfg(MODULE)]
>> -        static THIS_MODULE: ::kernel::ThisModule = unsafe {
>> -            extern "C" {
>> -                static __this_module: ::kernel::types::Opaque<::kernel::bindings::module>;
>> -            };
>> -
>> -            ::kernel::ThisModule::from_ptr(__this_module.get())
>> -        };
>> -
>> -        #[cfg(not(MODULE))]
>> -        static THIS_MODULE: ::kernel::ThisModule = unsafe {
>> -            ::kernel::ThisModule::from_ptr(::core::ptr::null_mut())
>> -        };
>> -
>>           /// The `LocalModule` type is the type of the module created by `module!`,
>>           /// `module_pci_driver!`, `module_platform_driver!`, etc.
>>           type LocalModule = #type_;
>>   
>>           impl ::kernel::ModuleMetadata for #type_ {
>>               const NAME: &'static ::kernel::str::CStr = #name_cstr;
>> +
>> +            #[cfg(MODULE)]
>> +            const THIS_MODULE: ::kernel::ThisModule = {
>> +                extern "C" {
>> +                    static __this_module: ::kernel::types::Opaque<::kernel::bindings::module>;
>> +                }
>> +
>> +                // SAFETY: `__this_module` is constructed by the kernel at load time
>> +                // and lives until the module is unloaded.
>> +                unsafe { ::kernel::ThisModule::from_ptr(__this_module.get()) }
>> +            };
>> +
>> +            #[cfg(not(MODULE))]
>> +            const THIS_MODULE: ::kernel::ThisModule = unsafe {
>> +                ::kernel::ThisModule::from_ptr(::core::ptr::null_mut())
>> +            };
>>           }
>>   
>>           // Double nested modules, since then nobody can access the public items inside.
>> @@ -616,7 +616,7 @@ pub extern "C" fn #ident_exit() {
>>                   /// This function must only be called once.
>>                   unsafe fn __init() -> ::kernel::ffi::c_int {
>>                       let initer = <super::super::LocalModule as ::kernel::InPlaceModule>::init(
>> -                        &super::super::THIS_MODULE
>> +                        ::kernel::this_module::<super::super::LocalModule>()
>>                       );
>>                       // SAFETY: No data race, since `__MOD` can only be accessed by this module
>>                       // and there only `__init` and `__exit` access it. These functions are only
>

^ permalink raw reply

* Re: [PATCH v3 1/6] rust: module: add `THIS_MODULE` const to `ModuleMetadata` trait
From: Gary Guo @ 2026-06-22 13:06 UTC (permalink / raw)
  To: Alvin Sun, Gary Guo, Miguel Ojeda, Boqun Feng,
	Björn Roy Baron, Benno Lossin, Andreas Hindborg, Alice Ryhl,
	Trevor Gross, Danilo Krummrich, Luis Chamberlain, Petr Pavlu,
	Daniel Gomez, Sami Tolvanen, Aaron Tomlin, Greg Kroah-Hartman,
	Rafael J. Wysocki, David Airlie, Simona Vetter, Daniel Almeida,
	Arnd Bergmann, Brendan Higgins, David Gow, Rae Moar, Breno Leitao,
	Jens Axboe
  Cc: rust-for-linux, linux-modules, driver-core, dri-devel, nova-gpu,
	linux-kselftest, kunit-dev, linux-block, linux-kernel
In-Reply-To: <2d54f3e0-3f35-4f97-b6af-b3ceb1aca246@linux.dev>

On Mon Jun 22, 2026 at 1:52 PM BST, Alvin Sun wrote:
>
> On 6/22/26 18:50, Gary Guo wrote:
>> On Mon Jun 22, 2026 at 3:44 AM BST, Alvin Sun wrote:
>>> Since `const_refs_to_static` has been stable as of the MSRV bump, a
>>> `ThisModule` pointer can now be used in const contexts.
>>>
>>> Add a `THIS_MODULE` const to the `ModuleMetadata` trait so that modules
>>> can provide their `ThisModule` pointer in const contexts such as static
>>> `file_operations`.
>>>
>>> Move the `THIS_MODULE` static from the `module!` macro into the
>>> `ModuleMetadata` impl, add a `this_module()` helper, and update `__init`
>>> to use it.
>>>
>>> Signed-off-by: Alvin Sun <alvin.sun@linux.dev>
>>> ---
>>>   rust/kernel/lib.rs    |  8 ++++++++
>>>   rust/macros/module.rs | 34 +++++++++++++++++-----------------
>>>   2 files changed, 25 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/rust/kernel/lib.rs b/rust/kernel/lib.rs
>>> index b72b2fbe046d6..50f5a7b5f028e 100644
>>> --- a/rust/kernel/lib.rs
>>> +++ b/rust/kernel/lib.rs
>>> @@ -184,6 +184,14 @@ fn init(module: &'static ThisModule) -> impl pin_init::PinInit<Self, error::Erro
>>>   pub trait ModuleMetadata {
>>>       /// The name of the module as specified in the `module!` macro.
>>>       const NAME: &'static crate::str::CStr;
>>> +
>>> +    /// The module's `THIS_MODULE` pointer.
>>> +    const THIS_MODULE: ThisModule;
>>> +}
>>> +
>>> +/// Returns a reference to the `THIS_MODULE` of the given module type.
>>> +pub const fn this_module<M: ModuleMetadata>() -> &'static ThisModule {
>>> +    &M::THIS_MODULE
>>>   }
>> Also, FWIW I think this should not put this in the crate root. Perhaps create a
>> modules.rs?
>
> Makes sense. I'll create a new `module.rs` and move the module-related items
> (`ModuleMetadata`, `ThisModule`, `this_module()`) there, then re-export from
> `lib.rs`.

Please do not re-export `this_module`. For the other two, I think it's fine to
re-export to avoid tree-wide changes, but please do update the users in code
that would be routed via the Rust tree.

Best,
Gary

^ permalink raw reply

* Re: [PATCH 1/2 blktests] src/miniublk: switch to ioctl-encoded ublk commands
From: Sebastian Chlad @ 2026-06-22 13:34 UTC (permalink / raw)
  To: Shin'ichiro Kawasaki; +Cc: Sebastian Chlad, linux-block
In-Reply-To: <ajSud4Y4PmCu2X_5@shinmob>

Hi Shin'ichiro,

On Fri, Jun 19, 2026 at 5:26 AM Shin'ichiro Kawasaki
<shinichiro.kawasaki@wdc.com> wrote:
>
> Hi Sebastian,
>
> Thanks for the patches. I agree that this direction is good: it's the better
> shift away from the legacy interface.
>
> One point I noticed is that src/miniublk.c can no longer be built with the
> kernel headers of the LTS kernel version v6.1.y, probably (v5.15.y does not have
> ublk and v6.6.y supports the new interface). This is a rather small window, and
> may be acceptable but I wonder what you think about it
>
> If we drop the miniublk build with v6.1.y kernel headers, it might be the better
> to check before building miniublk. I quickly created a Makefile change [1] for
> that purpose.

You're right, sorry for the omission. I'll incorporate your Makefile
fix into v2 with a Suggested-by tag.

>
> Also, please find a comment in line below.
>
> On Jun 17, 2026 / 09:25, Sebastian Chlad wrote:
> > Kernels built without CONFIG_BLKDEV_UBLK_LEGACY_OPCODES reject the
> > legacy raw UBLK_CMD_* and UBLK_IO_* opcodes. Switch miniublk to use
> > the ioctl-encoded UBLK_U_CMD_* and UBLK_U_IO_* variants defined in
> > linux/ublk_cmd.h instead.
> >
> > For IO commands, the ioctl-encoded opcode is used for submission while
> > _IOC_NR() extracts the raw NR bits for build_user_data(), keeping the
> > user_data tag encoding intact.
> >
> > Signed-off-by: Sebastian Chlad <sebastian.chlad@suse.com>
> > ---
> >  src/miniublk.c | 30 +++++++++++++++---------------
> >  1 file changed, 15 insertions(+), 15 deletions(-)
> >
> > diff --git a/src/miniublk.c b/src/miniublk.c
> > index f98f850..5a35ca7 100644
> > --- a/src/miniublk.c
> > +++ b/src/miniublk.c
> [...]
> > @@ -624,9 +624,9 @@ static int ublk_queue_io_cmd(struct ublk_queue *q,
> >               return 0;
> >
> >       if (io->flags & UBLKSRV_NEED_COMMIT_RQ_COMP)
> > -             cmd_op = UBLK_IO_COMMIT_AND_FETCH_REQ;
> > -     else if (io->flags & UBLKSRV_NEED_FETCH_RQ)
> > -             cmd_op = UBLK_IO_FETCH_REQ;
> > +             cmd_op = UBLK_U_IO_COMMIT_AND_FETCH_REQ;
> > +     else
> > +             cmd_op = UBLK_U_IO_FETCH_REQ;
>
> The hunk above changes the "else if" part, is this intentional?
>

Yes, this is intentional because we already check things in
    if (!(io->flags &
        (UBLKSRV_NEED_FETCH_RQ | UBLKSRV_NEED_COMMIT_RQ_COMP)))
which returns early if neither flag is set. So once the first condition
(UBLKSRV_NEED_COMMIT_RQ_COMP) turns out to be false, a second check is
redundant: by that time we know UBLKSRV_NEED_FETCH_RQ is set and we need
UBLK_U_IO_FETCH_REQ.

However if you think it's safer to still check if io->flags &
UBLKSRV_NEED_FETCH_RQ, I can implement it this way in the v2.
Let me know what you prefer.

I will wait with the v2 for your reply and address either the makefile
change exclusively, or both changes depending on your input.

>
> [1]
>
> diff --git a/src/Makefile b/src/Makefile
> index d8833bf..adfe3ef 100644
> --- a/src/Makefile
> +++ b/src/Makefile
> @@ -8,6 +8,10 @@ HAVE_C_MACRO = $(shell if echo "$(H)include <$(1)>" |  \
>                 $(CC) $(CFLAGS) -E - 2>&1 /dev/null | grep $(2) > /dev/null 2>&1; \
>                 then echo 1;else echo 0; fi)
>
> +HAVE_C_DEF = $(shell if echo -e "$(H)include <$(1)>\n#ifdef $(2)\nHAVE_$(2)\n#endif" | \
> +               $(CC) $(CFLAGS) -E - 2>&1 /dev/null | grep HAVE_$(2) > /dev/null 2>&1; \
> +               then echo 1;else echo 0; fi)
> +
>  C_TARGETS := \
>         dio-offsets \
>         loblksize \
> @@ -27,6 +31,7 @@ C_UBLK_TARGETS := miniublk
>
>  HAVE_LIBURING := $(call HAVE_C_MACRO,liburing.h,IORING_OP_URING_CMD)
>  HAVE_UBLK_HEADER := $(call HAVE_C_HEADER,linux/ublk_cmd.h,1)
> +HAVE_NEW_UBLK_INTF := $(call HAVE_C_DEF,linux/ublk_cmd.h,UBLK_U_CMD_START_DEV)
>
>  CXX_TARGETS := \
>         discontiguous-io
> @@ -37,8 +42,12 @@ SYZKALLER_TARGETS := \
>  TARGETS := $(C_TARGETS) $(CXX_TARGETS) $(SYZKALLER_TARGETS)
>
>  ifeq ($(HAVE_UBLK_HEADER), 1)
> +ifeq ($(HAVE_NEW_UBLK_INTF), 1)
>  C_URING_TARGETS += $(C_UBLK_TARGETS)
>  else
> +$(info Skip $(C_UBLK_TARGETS) build due to missing new ublk interface(v6.4+))
> +endif
> +else
>  $(info Skip $(C_UBLK_TARGETS) build due to missing kernel header(v6.0+))
>  endif
>
>

^ permalink raw reply

* Re: [PATCH RFC v2 01/18] xfs: fix the error unwind in xfs_open_devices()
From: Jan Kara @ 2026-06-22 13:35 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Jan Kara, Christoph Hellwig, Jens Axboe, Alexander Viro,
	linux-block, linux-kernel, linux-fsdevel, Carlos Maiolino,
	linux-xfs, Chris Mason, David Sterba, linux-btrfs,
	Theodore Ts'o, linux-ext4, Gao Xiang, linux-erofs
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-1-7df6b864028e@kernel.org>

On Tue 16-06-26 16:08:17, Christian Brauner wrote:
> Since the rt and log block devices are closed in xfs_free_buftarg() the
> buftarg owns the device file. The error unwind does not respect that:
> when the log buftarg allocation fails, out_free_rtdev_targ frees the rt
> buftarg - releasing rtdev_file - and then falls through to
> out_close_rtdev and releases it a second time.
> 
> The unwind also leaves mp->m_rtdev_targp and mp->m_ddev_targp pointing
> to the freed buftargs. The failed mount continues into
> deactivate_locked_super() -> xfs_kill_sb() -> xfs_mount_free(), which
> frees them again.
> 
> Clear the buftarg pointers once the unwind freed them and clear
> rtdev_file once the rt buftarg owns it, so nothing is released twice.
> 
> Reachable when a buftarg allocation fails after the data buftarg was
> set up: an I/O error in sync_blockdev() or an allocation failure in
> xfs_init_buftarg() while mounting with external rt and log devices.
> 
> Fixes: 41233576e9a4 ("xfs: close the RT and log block devices in xfs_free_buftarg")
> Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>

Looks good to me. As a small nit I'd probably do rtdev_file = NULL just
after we've successfully allocated m_rtdev_targp but that's really minor.
Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/xfs/xfs_super.c | 3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
> index eac7f9503805..8531d526fc44 100644
> --- a/fs/xfs/xfs_super.c
> +++ b/fs/xfs/xfs_super.c
> @@ -534,8 +534,11 @@ xfs_open_devices(
>   out_free_rtdev_targ:
>  	if (mp->m_rtdev_targp)
>  		xfs_free_buftarg(mp->m_rtdev_targp);
> +	mp->m_rtdev_targp = NULL;
> +	rtdev_file = NULL;	/* released by xfs_free_buftarg() */
>   out_free_ddev_targ:
>  	xfs_free_buftarg(mp->m_ddev_targp);
> +	mp->m_ddev_targp = NULL;
>   out_close_rtdev:
>  	 if (rtdev_file)
>  		bdev_fput(rtdev_file);
> 
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH RFC v2 05/18] ext4: use anonymous devices for KUnit test superblocks
From: Jan Kara @ 2026-06-22 13:48 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Jan Kara, Christoph Hellwig, Jens Axboe, Alexander Viro,
	linux-block, linux-kernel, linux-fsdevel, Carlos Maiolino,
	linux-xfs, Chris Mason, David Sterba, linux-btrfs,
	Theodore Ts'o, linux-ext4, Gao Xiang, linux-erofs
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-5-7df6b864028e@kernel.org>

On Tue 16-06-26 16:08:21, Christian Brauner wrote:
> The mballoc and extents KUnit tests create superblocks through
> sget_fc() with a set callback that never assigns s_dev and a kill_sb
> that only calls generic_shutdown_super().
> 
> The upcoming global device-to-superblock table registers every
> superblock under its s_dev, so each superblock needs a unique device
> number. Allocate a proper anonymous device via set_anon_super_fc() and
> release it through kill_anon_super().
> 
> Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>

Ok. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/ext4/extents-test.c | 9 ++-------
>  fs/ext4/mballoc-test.c | 9 ++-------
>  2 files changed, 4 insertions(+), 14 deletions(-)
> 
> diff --git a/fs/ext4/extents-test.c b/fs/ext4/extents-test.c
> index bd7795a82607..c3836ecb89f9 100644
> --- a/fs/ext4/extents-test.c
> +++ b/fs/ext4/extents-test.c
> @@ -126,11 +126,6 @@ struct kunit_ext_test_param {
>  	struct kunit_ext_data_state exp_data_state[3];
>  };
>  
> -static void ext_kill_sb(struct super_block *sb)
> -{
> -	generic_shutdown_super(sb);
> -}
> -
>  static int ext_init_fs_context(struct fs_context *fc)
>  {
>  	return 0;
> @@ -138,13 +133,13 @@ static int ext_init_fs_context(struct fs_context *fc)
>  
>  static int ext_set(struct super_block *sb, struct fs_context *fc)
>  {
> -	return 0;
> +	return set_anon_super_fc(sb, fc);
>  }
>  
>  static struct file_system_type ext_fs_type = {
>  	.name		 = "extents test",
>  	.init_fs_context = ext_init_fs_context,
> -	.kill_sb	 = ext_kill_sb,
> +	.kill_sb	 = kill_anon_super,
>  };
>  
>  static void extents_kunit_exit(struct kunit *test)
> diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c
> index d90da44aadbd..a3b33ed2c172 100644
> --- a/fs/ext4/mballoc-test.c
> +++ b/fs/ext4/mballoc-test.c
> @@ -59,11 +59,6 @@ static const struct super_operations mbt_sops = {
>  	.free_inode	= mbt_free_inode,
>  };
>  
> -static void mbt_kill_sb(struct super_block *sb)
> -{
> -	generic_shutdown_super(sb);
> -}
> -
>  static int mbt_init_fs_context(struct fs_context *fc)
>  {
>  	return 0;
> @@ -72,7 +67,7 @@ static int mbt_init_fs_context(struct fs_context *fc)
>  static struct file_system_type mbt_fs_type = {
>  	.name			= "mballoc test",
>  	.init_fs_context	= mbt_init_fs_context,
> -	.kill_sb		= mbt_kill_sb,
> +	.kill_sb		= kill_anon_super,
>  };
>  
>  static int mbt_mb_init(struct super_block *sb)
> @@ -136,7 +131,7 @@ static void mbt_mb_release(struct super_block *sb)
>  
>  static int mbt_set(struct super_block *sb, struct fs_context *fc)
>  {
> -	return 0;
> +	return set_anon_super_fc(sb, fc);
>  }
>  
>  static struct super_block *mbt_ext4_alloc_super_block(void)
> 
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH RFC v2 02/18] super: convert s_count to refcount_t s_passive
From: Jan Kara @ 2026-06-22 13:48 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Jan Kara, Christoph Hellwig, Jens Axboe, Alexander Viro,
	linux-block, linux-kernel, linux-fsdevel, Carlos Maiolino,
	linux-xfs, Chris Mason, David Sterba, linux-btrfs,
	Theodore Ts'o, linux-ext4, Gao Xiang, linux-erofs
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-2-7df6b864028e@kernel.org>

On Tue 16-06-26 16:08:18, Christian Brauner wrote:
> The superblock carries two counters: s_active, the active reference
> count that keeps the filesystem usable, and s_count, the passive
> reference count that merely keeps the structure itself alive. Turn the
> passive count into a refcount_t and rename it to s_passive to make the
> pairing with s_active obvious.
> 
> Everything is still serialized by sb_lock, so there is no functional
> change; the conversion buys the usual refcount_t saturation and
> underflow checking. The following patches start dropping passive
> references without holding sb_lock and make the device-to-superblock
> table hold one passive reference per registered entry, which a plain
> integer cannot support.
> 
> Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>

Yeah, looks like a reasonable cleanup. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/super.c                     | 18 +++++++++---------
>  include/linux/fs/super_types.h |  2 +-
>  2 files changed, 10 insertions(+), 10 deletions(-)
> 
> diff --git a/fs/super.c b/fs/super.c
> index a8fd61136aaf..25dd72b550e0 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -102,7 +102,7 @@ static bool super_flags(const struct super_block *sb, unsigned int flags)
>   * creation will succeed and SB_BORN is set by vfs_get_tree() or we're
>   * woken and we'll see SB_DYING.
>   *
> - * The caller must have acquired a temporary reference on @sb->s_count.
> + * The caller must have acquired a temporary reference on @sb->s_passive.
>   *
>   * Return: The function returns true if SB_BORN was set and with
>   *         s_umount held. The function returns false if SB_DYING was
> @@ -367,7 +367,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>  	spin_lock_init(&s->s_inode_wblist_lock);
>  	fserror_mount(s);
>  
> -	s->s_count = 1;
> +	refcount_set(&s->s_passive, 1);
>  	atomic_set(&s->s_active, 1);
>  	mutex_init(&s->s_vfs_rename_mutex);
>  	lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key);
> @@ -407,7 +407,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>   */
>  static void __put_super(struct super_block *s)
>  {
> -	if (!--s->s_count) {
> +	if (refcount_dec_and_test(&s->s_passive)) {
>  		list_del_init(&s->s_list);
>  		WARN_ON(s->s_dentry_lru.node);
>  		WARN_ON(s->s_inode_lru.node);
> @@ -529,7 +529,7 @@ static bool grab_super(struct super_block *sb)
>  {
>  	bool locked;
>  
> -	sb->s_count++;
> +	refcount_inc(&sb->s_passive);
>  	spin_unlock(&sb_lock);
>  	locked = super_lock_excl(sb);
>  	if (locked) {
> @@ -556,7 +556,7 @@ static bool grab_super(struct super_block *sb)
>   *	lock held in read mode in case of success. On successful return,
>   *	the caller must drop the s_umount lock when done.
>   *
> - *	Note that unlike get_super() et.al. this one does *not* bump ->s_count.
> + *	Note that unlike get_super() et.al. this one does *not* bump ->s_passive.
>   *	The reason why it's safe is that we are OK with doing trylock instead
>   *	of down_read().  There's a couple of places that are OK with that, but
>   *	it's very much not a general-purpose interface.
> @@ -858,7 +858,7 @@ static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg,
>  	     sb = next_super(sb, flags)) {
>  		if (super_flags(sb, SB_DYING))
>  			continue;
> -		sb->s_count++;
> +		refcount_inc(&sb->s_passive);
>  		spin_unlock(&sb_lock);
>  
>  		if (flags & SUPER_ITER_UNLOCKED) {
> @@ -903,7 +903,7 @@ void iterate_supers_type(struct file_system_type *type,
>  		if (super_flags(sb, SB_DYING))
>  			continue;
>  
> -		sb->s_count++;
> +		refcount_inc(&sb->s_passive);
>  		spin_unlock(&sb_lock);
>  
>  		locked = super_lock_shared(sb);
> @@ -935,7 +935,7 @@ struct super_block *user_get_super(dev_t dev, bool excl)
>  		if (sb->s_dev != dev)
>  			continue;
>  
> -		sb->s_count++;
> +		refcount_inc(&sb->s_passive);
>  		spin_unlock(&sb_lock);
>  
>  		locked = super_lock(sb, excl);
> @@ -1369,7 +1369,7 @@ static struct super_block *bdev_super_lock(struct block_device *bdev, bool excl)
>  
>  	/* Make sure sb doesn't go away from under us */
>  	spin_lock(&sb_lock);
> -	sb->s_count++;
> +	refcount_inc(&sb->s_passive);
>  	spin_unlock(&sb_lock);
>  
>  	mutex_unlock(&bdev->bd_holder_lock);
> diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
> index ef7941e9dc79..68747182abf9 100644
> --- a/include/linux/fs/super_types.h
> +++ b/include/linux/fs/super_types.h
> @@ -145,7 +145,7 @@ struct super_block {
>  	unsigned long				s_magic;
>  	struct dentry				*s_root;
>  	struct rw_semaphore			s_umount;
> -	int					s_count;
> +	refcount_t				s_passive;
>  	atomic_t				s_active;
>  #ifdef CONFIG_SECURITY
>  	void					*s_security;
> 
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH RFC v2 03/18] super: take lock after last reference count
From: Jan Kara @ 2026-06-22 13:50 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Jan Kara, Christoph Hellwig, Jens Axboe, Alexander Viro,
	linux-block, linux-kernel, linux-fsdevel, Carlos Maiolino,
	linux-xfs, Chris Mason, David Sterba, linux-btrfs,
	Theodore Ts'o, linux-ext4, Gao Xiang, linux-erofs
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-3-7df6b864028e@kernel.org>

On Tue 16-06-26 16:08:19, Christian Brauner wrote:
> __put_super() required the caller to hold sb_lock, so put_super()
> wrapped it. The per-device superblock table introduced later drops its
> passive references from contexts that do not hold sb_lock, so make
> put_super() self-locking: drop the count first and take sb_lock only for
> the final list_del.
> 
> With the count now dropped outside sb_lock a superblock can briefly sit
> on @super_blocks with s_passive == 0 before it is unlinked, so the list
> walkers (__iterate_supers(), iterate_supers_type(), user_get_super())
> switch to refcount_inc_not_zero() and skip it.
> 
> Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>

Looks good, just one style nit below. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

> -static void __put_super(struct super_block *s)
> +void put_super(struct super_block *s)
>  {
>  	if (refcount_dec_and_test(&s->s_passive)) {
> +

I'd delete this empty line.

> +		spin_lock(&sb_lock);
>  		list_del_init(&s->s_list);
> +		spin_unlock(&sb_lock);
> +


								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH RFC v2 00/18] fs: support freeze/thaw/mark_dead/sync with shared devices
From: Jan Kara @ 2026-06-22 15:40 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Jan Kara, Christoph Hellwig, Jens Axboe, Alexander Viro,
	linux-block, linux-kernel, linux-fsdevel, Carlos Maiolino,
	linux-xfs, Chris Mason, David Sterba, linux-btrfs,
	Theodore Ts'o, linux-ext4, Gao Xiang, linux-erofs, syzbot
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-0-7df6b864028e@kernel.org>

Hi!

On Tue 16-06-26 16:08:16, Christian Brauner wrote:
> This is a generalization of the device number to superblock so it works
> for actual block device and anonymous (or even mtd) devices.
> 
> fs_holder_ops recovers the affected superblock from bdev->bd_holder. That
> forces the holder of a block device to be exactly one superblock and makes
> it impossible for several superblocks to share a single device.
> 
> erofs does exactly that. It can mount read-only "blob" devices that are
> shared between many superblocks: a metadata-only erofs that indexes a set
> of per-layer blobs (one filesystem instead of one per OCI layer), or an
> incremental image whose base device is shared by several updates. Because
> the block layer only tracks a single holder, a freeze, thaw, removal or
> sync on such a device is never propagated to all the superblocks using it,
> and the current infrastructure has no way to find them.
> 
> This series replaces the bd_holder-based lookup with a global, dev_t-keyed
> table mapping each block device to the superblock(s) using it. The holder
> argument becomes purely the block layer's exclusivity token -- a superblock,
> or the file_system_type for a device shared within one filesystem type --
> and the fs_holder_ops callbacks look the device up in the table and act on
> every superblock registered for it: 1:1 for most filesystems, 1:many for
> erofs.

So I was thinking about this also in the light of Christoph's complaints. I
agree with you, Christian, that this translation table maintains the
abstraction of the holder - holder ops define how to transition from bdev
to its holder(s) and how to translate the .sync, .freeze and other
operations for the holders - and that is kept since your changes are
specific to fs_holder_ops.

What I'm wondering about a bit is whether we want this complexity for the
only user which is erofs (i.e., whether this wouldn't be better implemented
in erofs specific holder ops which could arguably be simpler than this
generic solution). On the other hand, that will likely have to replicate
the locking dances we do in bdev_super_lock() and I'm not sure whether
spreading this locking complexity into filesystems is better than this
more complex VFS mapping code.

One more thing I was considering is that the need to transition from one
bdev to multiple holders isn't actually unique to erofs. For example device
mapper will need the same thing, arguably partition bdevs could be also
made holders of the complete bdev so events are propagated from the whole
bdev into partition bdevs properly (which currently happens in kind of ad
hoc manner and only in some cases). Currently your translation mechanism is
tied to mapping to superblock but actually rather weakly - we only need the
guarantee that the holder stays alive while the mapping entry exists, the
rest is protected by the mapping entry refcount AFAICS. So with a bit of
effort we could make this a generic bdev -> holders mapping mechanism
usable from whichever holder ops decide to employ it, which would then be
quite attractive IMO.

But I guess let's leave lifting the mapping code from super.c and
converting it into a generic mapping mechanism until the moment when we
really get into implementing another user.

All this is a long way of saying that I'm OK with the mapping mechanism
like this :).

								Honza

> Filesystems claim and release their devices through new
> fs_bdev_file_open_by_{dev,path}() and fs_bdev_file_release() helpers; the
> per-fs patches convert xfs, btrfs, ext4, f2fs and erofs over to them and
> fix cramfs and romfs, which released the registered main device with a
> raw bdev_fput().
> 
> Since every superblock is registered under its s_dev the table also
> replaces the last s_dev-keyed walk of the super_blocks list:
> user_get_super() resolves device numbers through it, so ustat() and
> quotactl() now work on any device a filesystem claims and no longer
> take sb_lock.
> 
> The longer-term motivation is to let userspace decide which devices may be
> onlined from one central place, without having to teach every filesystem
> about it individually.
> 
> Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>
> ---
> Changes in v2:
> - super: rework the device-to-superblock table reference counting: each
>   (device, superblock) entry carries a single claim count and holds one
>   passive reference on its superblock for the entry's lifetime. New prep
>   patches convert s_count to refcount_t s_passive and make put_super()
>   self-locking.
> - super: preallocate the entry in alloc_super() and register it from the
>   set callbacks through set_anon_super()/set_bdev_super(); an insert
>   failure unwinds exactly like a set callback failure. The superblock
>   stashes the entry in sb->s_super_dev and kill_super_notify() drops the
>   claim through it.
> - super: initialize the table from mnt_init(); the rootfs and shm mounts
>   are created long before any initcall runs.
> - super: fold the v1 "refuse to claim a frozen block device" patch into
>   the registration helper and restore the EBUSY check for the primary
>   device in setup_bdev_super(): additional devices (the xfs log, the ext4
>   journal, erofs blobs) are now refused while frozen as well, answering
>   Jan's question on v1 3/8.
> - Split the core patch into table/helpers/switch-over and move the
>   xfs/btrfs/ext4 conversions before the fs_holder_ops switch so no
>   freeze/mark_dead events are lost mid-series; erofs follows the switch.
> - New prep patches: the ext4 KUnit tests allocate anonymous devices and
>   ocfs2 stops resetting s_dev on dismount.
> - New: convert user_get_super() to the device table, plus a ustat()
>   selftest.
> - New: fix a pre-existing double release of the realtime device file and
>   dangling buftarg pointers in xfs_open_devices()'s error unwind.
> - New: convert f2fs's additional devices to the helpers; fix cramfs and
>   romfs releasing the registered main device with a raw bdev_fput().
> - erofs: drop the .shutdown() and .remove_bdev() implementations and the
>   per-device "dead" flag. Immutable filesystems don't need them: the block
>   layer sets GD_DEAD before fs_bdev_mark_dead() so in-flight bios fail
>   anyway, erofs has no write path or journal to stop, and the read-only
>   loop_change_fd() case must not be forced to -EIO. Patch from Gao Xiang,
>   applied verbatim - thanks!
> - btrfs: fix a general protection fault in close_fs_devices() on a failed
>   mount (reported by syzbot). The release path took the superblock from
>   device->fs_info, which is still NULL if open_ctree() fails before
>   btrfs_init_devices_late(); it now uses bdev_file->private_data.
> - erofs: the v1 conversion was sent with a generic boilerplate changelog;
>   superseded by Gao's patch above.
> - Collect Reviewed-by from Jan Kara and Tested-by from syzbot.
> - Rebase onto v7.1-rc1.
> - Link to v1: https://patch.msgid.link/20260602-work-super-bdev_holder_global-v1-0-bb0fd82f3861@kernel.org
> 
> ---
> Christian Brauner (18):
>       xfs: fix the error unwind in xfs_open_devices()
>       super: convert s_count to refcount_t s_passive
>       super: take lock after last reference count
>       fs, block: move blk_mode_t and fop_flags_t into <linux/types.h>
>       ext4: use anonymous devices for KUnit test superblocks
>       ocfs2: don't reset s_dev on dismount
>       fs: maintain a global device-to-superblock table
>       fs: add dedicated block device open helpers for filesystems
>       xfs: port to fs_bdev_file_open_by_path()
>       btrfs: open via dedicated fs bdev helpers
>       ext4: open via dedicated fs bdev helpers
>       fs: look up superblocks via the device table in fs_holder_ops
>       fs: tolerate per-superblock freeze errors on shared devices
>       erofs: open via dedicated fs bdev helpers
>       f2fs: open via dedicated fs bdev helpers
>       super: make fs_holder_ops private
>       fs: look up the superblock via the device table in user_get_super()
>       selftests/filesystems: add ustat() coverage
> 
>  fs/btrfs/volumes.c                               |  31 +-
>  fs/cramfs/inode.c                                |   2 +-
>  fs/erofs/super.c                                 |  35 +-
>  fs/ext4/extents-test.c                           |   9 +-
>  fs/ext4/mballoc-test.c                           |   9 +-
>  fs/ext4/super.c                                  |  12 +-
>  fs/f2fs/super.c                                  |   6 +-
>  fs/internal.h                                    |   1 +
>  fs/namespace.c                                   |   2 +
>  fs/ocfs2/super.c                                 |   1 -
>  fs/romfs/super.c                                 |   2 +-
>  fs/super.c                                       | 620 ++++++++++++++++-------
>  fs/xfs/xfs_buf.c                                 |   2 +-
>  fs/xfs/xfs_super.c                               |  13 +-
>  include/linux/blkdev.h                           |   9 -
>  include/linux/fs.h                               |   2 -
>  include/linux/fs/super.h                         |   8 +
>  include/linux/fs/super_types.h                   |   4 +-
>  include/linux/types.h                            |   2 +
>  tools/testing/selftests/filesystems/.gitignore   |   1 +
>  tools/testing/selftests/filesystems/Makefile     |   2 +-
>  tools/testing/selftests/filesystems/ustat_test.c | 135 +++++
>  22 files changed, 647 insertions(+), 261 deletions(-)
> ---
> base-commit: 0c0d974f62e6603d4514e1a8035658edb353c68f
> change-id: 20260602-work-super-bdev_holder_global-8cba5e52bed5
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH RFC v2 07/18] fs: maintain a global device-to-superblock table
From: Jan Kara @ 2026-06-22 15:59 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Jan Kara, Christoph Hellwig, Jens Axboe, Alexander Viro,
	linux-block, linux-kernel, linux-fsdevel, Carlos Maiolino,
	linux-xfs, Chris Mason, David Sterba, linux-btrfs,
	Theodore Ts'o, linux-ext4, Gao Xiang, linux-erofs
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-7-7df6b864028e@kernel.org>

On Tue 16-06-26 16:08:23, Christian Brauner wrote:
> fs_holder_ops recovers the owning superblock from bdev->bd_holder, which
> forces the holder to be exactly one superblock and prevents several
> superblocks from sharing one block device. That's what erofs is doing.
> 
> As a first step introduce a global dev_t-keyed rhltable mapping each
> device to the superblock(s) using it. The entry is preallocated in
> alloc_super() and registered under sb->s_dev by the set callback through
> set_anon_super() and set_bdev_super(), the two helpers every set
> callback assigns s_dev through. Registration is the final fallible act
> of a set callback, so an insert failure unwinds through sget_fc()'s
> existing set-failure path: the fs_context keeps ownership of s_fs_info
> and the callers' error paths stay correct. set_anon_super() releases
> the anonymous dev it allocated when registration fails. Unwinding
> through deactivate_locked_super() instead would run kill_sb() and free
> s_fs_info behind the caller's back: nfs and ceph free that object
> through a local pointer when sget_fc() fails and would double-free.
> 
> The superblock stashes the entry in sb->s_super_dev and
> kill_super_notify() drops the claim through it, so teardown doesn't
> depend on s_dev staying stable; an entry that was never registered is
> freed together with the superblock in destroy_super_work().
> 
> Each table entry holds a passive reference (s_passive) on its
> superblock, so the struct stays valid for as long as the entry is
> reachable. Entries are claim-counted through sd_ref: additional claims
> on the same (device, superblock) pair share the entry, and the unlink
> is deferred to the last put, so a later iteration cursor never resumes
> from a removed node.
> 
> The table is initialized from mnt_init(): the first superblocks (the
> tmpfs shm mount and rootfs) are created from start_kernel() long before
> any initcall runs, so an initcall would be too late.
> 
> The table has no readers yet; the fs_holder_ops callbacks are switched
> over once all devices a filesystem claims are registered.
> 
> Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>

Looks good. Feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza

> ---
>  fs/internal.h                  |   1 +
>  fs/namespace.c                 |   2 +
>  fs/super.c                     | 102 ++++++++++++++++++++++++++++++++++++++++-
>  include/linux/fs/super_types.h |   2 +
>  4 files changed, 105 insertions(+), 2 deletions(-)
> 
> diff --git a/fs/internal.h b/fs/internal.h
> index d77578d66d42..83eb3e2a0f85 100644
> --- a/fs/internal.h
> +++ b/fs/internal.h
> @@ -137,6 +137,7 @@ extern int reconfigure_super(struct fs_context *);
>  extern bool super_trylock_shared(struct super_block *sb);
>  struct super_block *user_get_super(dev_t, bool excl);
>  void put_super(struct super_block *sb);
> +void __init super_dev_init(void);
>  extern bool mount_capable(struct fs_context *);
>  int sb_init_dio_done_wq(struct super_block *sb);
>  
> diff --git a/fs/namespace.c b/fs/namespace.c
> index 3d5cd5bf3b05..7cef6dae0854 100644
> --- a/fs/namespace.c
> +++ b/fs/namespace.c
> @@ -6262,6 +6262,8 @@ void __init mnt_init(void)
>  	if (!mount_hashtable || !mountpoint_hashtable)
>  		panic("Failed to allocate mount hash table\n");
>  
> +	super_dev_init();
> +
>  	kernfs_init();
>  
>  	err = sysfs_init();
> diff --git a/fs/super.c b/fs/super.c
> index a771a0ad4c9a..ff5e305d0ab4 100644
> --- a/fs/super.c
> +++ b/fs/super.c
> @@ -24,6 +24,7 @@
>  #include <linux/export.h>
>  #include <linux/slab.h>
>  #include <linux/blkdev.h>
> +#include <linux/rhashtable.h>
>  #include <linux/mount.h>
>  #include <linux/security.h>
>  #include <linux/writeback.h>		/* for the emergency remount stuff */
> @@ -272,6 +273,8 @@ static unsigned long super_cache_count(struct shrinker *shrink,
>  	return total_objects;
>  }
>  
> +static struct super_dev *super_dev_alloc(dev_t dev, struct super_block *sb);
> +
>  static void destroy_super_work(struct work_struct *work)
>  {
>  	struct super_block *s = container_of(work, struct super_block,
> @@ -279,6 +282,8 @@ static void destroy_super_work(struct work_struct *work)
>  	fsnotify_sb_free(s);
>  	security_sb_free(s);
>  	put_user_ns(s->s_user_ns);
> +	/* Only an unregistered entry is still owned by the superblock. */
> +	kfree(s->s_super_dev);
>  	kfree(s->s_subtype);
>  	for (int i = 0; i < SB_FREEZE_LEVELS; i++)
>  		percpu_free_rwsem(&s->s_writers.rw_sem[i]);
> @@ -392,6 +397,10 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags,
>  		goto fail;
>  	if (list_lru_init_memcg(&s->s_inode_lru, s->s_shrink))
>  		goto fail;
> +	s->s_super_dev = super_dev_alloc(0, s);
> +	if (!s->s_super_dev)
> +		goto fail;
> +
>  	s->s_min_writeback_pages = MIN_WRITEBACK_PAGES;
>  	return s;
>  
> @@ -421,6 +430,77 @@ void put_super(struct super_block *s)
>  	}
>  }
>  
> +struct super_dev {
> +	dev_t			sd_dev;
> +	struct super_block	*sd_sb;
> +	refcount_t		sd_ref;
> +	struct rhlist_head	sd_node;
> +	struct rcu_head		sd_rcu;
> +};
> +
> +static struct rhltable super_dev_table;
> +static const struct rhashtable_params super_dev_params = {
> +	.key_len	= sizeof(dev_t),
> +	.key_offset	= offsetof(struct super_dev, sd_dev),
> +	.head_offset	= offsetof(struct super_dev, sd_node),
> +};
> +
> +static struct super_dev *super_dev_alloc(dev_t dev, struct super_block *sb)
> +{
> +	struct super_dev *fsd;
> +
> +	fsd = kzalloc_obj(*fsd);
> +	if (!fsd)
> +		return NULL;
> +	fsd->sd_dev = dev;
> +	fsd->sd_sb = sb;
> +	refcount_set(&fsd->sd_ref, 1);
> +	return fsd;
> +}
> +
> +static void super_dev_put(struct super_dev *fsd)
> +{
> +	/* Unlink only once unpinned, so a cursor never resumes from a removed node. */
> +	if (fsd && refcount_dec_and_test(&fsd->sd_ref)) {
> +		rhltable_remove(&super_dev_table, &fsd->sd_node, super_dev_params);
> +		put_super(fsd->sd_sb);
> +		kfree_rcu(fsd, sd_rcu);
> +	}
> +}
> +
> +void __init super_dev_init(void)
> +{
> +	if (rhltable_init(&super_dev_table, &super_dev_params))
> +		panic("VFS: Cannot initialise super_dev_table\n");
> +}
> +
> +static int super_dev_insert(struct super_dev *fsd)
> +{
> +	int err;
> +
> +	err = rhltable_insert(&super_dev_table, &fsd->sd_node, super_dev_params);
> +	if (!err)
> +		refcount_inc(&fsd->sd_sb->s_passive);
> +	return err;
> +}
> +
> +/* Register @sb under @sb->s_dev as the final fallible act of a set callback. */
> +static int super_dev_register(struct super_block *sb)
> +{
> +	struct super_dev *fsd = sb->s_super_dev;
> +	int err;
> +
> +	lockdep_assert_held(&sb_lock);
> +	VFS_WARN_ON_ONCE(!sb->s_dev);
> +	VFS_WARN_ON_ONCE(!fsd || fsd->sd_dev);
> +
> +	fsd->sd_dev = sb->s_dev;
> +	err = super_dev_insert(fsd);
> +	if (err)
> +		fsd->sd_dev = 0;
> +	return err;
> +}
> +
>  static void kill_super_notify(struct super_block *sb)
>  {
>  	lockdep_assert_not_held(&sb->s_umount);
> @@ -440,6 +520,12 @@ static void kill_super_notify(struct super_block *sb)
>  	hlist_del_init(&sb->s_instances);
>  	spin_unlock(&sb_lock);
>  
> +	/* Drop sget_fc()'s claim; a never-registered entry stays with the sb. */
> +	if (sb->s_super_dev->sd_dev) {
> +		super_dev_put(sb->s_super_dev);
> +		sb->s_super_dev = NULL;
> +	}
> +
>  	/*
>  	 * Let concurrent mounts know that this thing is really dead.
>  	 * We don't need @sb->s_umount here as every concurrent caller
> @@ -750,6 +836,7 @@ struct super_block *sget_fc(struct fs_context *fc,
>  	}
>  	if (!s) {
>  		spin_unlock(&sb_lock);
> +
>  		s = alloc_super(fc->fs_type, fc->sb_flags, user_ns);
>  		if (!s)
>  			return ERR_PTR(-ENOMEM);
> @@ -759,11 +846,13 @@ struct super_block *sget_fc(struct fs_context *fc,
>  	s->s_fs_info = fc->s_fs_info;
>  	err = set(s, fc);
>  	if (err) {
> +		VFS_WARN_ON_ONCE(s->s_super_dev->sd_dev);
>  		s->s_fs_info = NULL;
>  		spin_unlock(&sb_lock);
>  		destroy_unused_super(s);
>  		return ERR_PTR(err);
>  	}
> +	VFS_WARN_ON_ONCE(!s->s_super_dev->sd_dev);
>  	fc->s_fs_info = NULL;
>  	s->s_type = fc->fs_type;
>  	s->s_iflags |= fc->s_iflags;
> @@ -1217,7 +1306,16 @@ EXPORT_SYMBOL(free_anon_bdev);
>  
>  int set_anon_super(struct super_block *s, void *data)
>  {
> -	return get_anon_bdev(&s->s_dev);
> +	int error;
> +
> +	error = get_anon_bdev(&s->s_dev);
> +	if (error)
> +		return error;
> +
> +	error = super_dev_register(s);
> +	if (error)
> +		free_anon_bdev(s->s_dev);
> +	return error;
>  }
>  EXPORT_SYMBOL(set_anon_super);
>  
> @@ -1303,7 +1401,7 @@ EXPORT_SYMBOL(get_tree_keyed);
>  static int set_bdev_super(struct super_block *s, void *data)
>  {
>  	s->s_dev = *(dev_t *)data;
> -	return 0;
> +	return super_dev_register(s);
>  }
>  
>  static int super_s_dev_set(struct super_block *s, struct fs_context *fc)
> diff --git a/include/linux/fs/super_types.h b/include/linux/fs/super_types.h
> index 68747182abf9..c8172558750f 100644
> --- a/include/linux/fs/super_types.h
> +++ b/include/linux/fs/super_types.h
> @@ -30,6 +30,7 @@ struct mount;
>  struct mtd_info;
>  struct quotactl_ops;
>  struct shrinker;
> +struct super_dev;
>  struct unicode_map;
>  struct user_namespace;
>  struct workqueue_struct;
> @@ -132,6 +133,7 @@ struct super_operations {
>  struct super_block {
>  	struct list_head			s_list;		/* Keep this first */
>  	dev_t					s_dev;		/* search index; _not_ kdev_t */
> +	struct super_dev			*s_super_dev;	/* sget_fc()'s device table claim */
>  	unsigned char				s_blocksize_bits;
>  	unsigned long				s_blocksize;
>  	loff_t					s_maxbytes;	/* Max file size */
> 
> -- 
> 2.47.3
> 
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* [PATCH] block: fix incorrect error injection static key decrement
From: Christoph Hellwig @ 2026-06-22 16:07 UTC (permalink / raw)
  To: axboe; +Cc: dlemoal, linux-block

Only decrement the static key when we had items and thus it was
incremented before.

Fixes: e8dcf2d142bd ("block: add configurable error injection")
Reported-by: Damien Le Moal <dlemoal@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/error-injection.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/error-injection.c b/block/error-injection.c
index d24c90e9a25f..cfb83138960c 100644
--- a/block/error-injection.c
+++ b/block/error-injection.c
@@ -120,13 +120,13 @@ static void error_inject_removeall(struct gendisk *disk)
 	struct blk_error_inject *inj;
 
 	mutex_lock(&disk->error_injection_lock);
-	clear_bit(GD_ERROR_INJECT, &disk->state);
+	if (test_and_clear_bit(GD_ERROR_INJECT, &disk->state))
+		static_branch_dec(&blk_error_injection_enabled);
 	while ((inj = list_first_entry_or_null(&disk->error_injection_list,
 			struct blk_error_inject, entry))) {
 		list_del_rcu(&inj->entry);
 		kfree_rcu_mightsleep(inj);
 	}
-	static_branch_dec(&blk_error_injection_enabled);
 	mutex_unlock(&disk->error_injection_lock);
 }
 
-- 
2.53.0


^ permalink raw reply related

* [PATCH blktests] block/044: basic block error injection sanity test
From: Christoph Hellwig @ 2026-06-22 16:08 UTC (permalink / raw)
  To: shinichiro.kawasaki; +Cc: linux-block

Test the basic block layer error injection functionality.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 tests/block/044     | 71 +++++++++++++++++++++++++++++++++++++++++++++
 tests/block/044.out |  9 ++++++
 2 files changed, 80 insertions(+)
 create mode 100755 tests/block/044
 create mode 100644 tests/block/044.out

diff --git a/tests/block/044 b/tests/block/044
new file mode 100755
index 000000000000..8baf9fa59c68
--- /dev/null
+++ b/tests/block/044
@@ -0,0 +1,71 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (c) 2026 Christoph Hellwig.
+#
+# Basic block error injection test.
+
+. tests/block/rc
+. common/scsi_debug
+
+DESCRIPTION="basic block error injection test"
+QUICK=1
+
+requires()
+{	
+	_have_loadable_scsi_debug
+	_have_program xfs_io
+}
+
+# load and remove scsi_debug once to test the static_key bug in the
+# initial commit
+_test_load_unload()
+{
+	if ! _init_scsi_debug dev_size_mb=500; then
+		return 1
+	fi
+
+	local dev=${SCSI_DEBUG_DEVICES[0]}
+	local debugfs_file="/sys/kernel/debug/block/$dev/error_injection"
+	if [[ ! -f "${debugfs_file}" ]]; then
+		SKIP_REASONS+=("error injection not supported")
+		_exit_scsi_debug
+		return 1
+	fi
+	echo "Testing unload without rules"
+	_exit_scsi_debug
+}
+
+_test_rules()
+{
+	if ! _init_scsi_debug dev_size_mb=500; then
+		return 1
+	fi
+
+	local dev=${SCSI_DEBUG_DEVICES[0]}
+	local debugfs_file="/sys/kernel/debug/block/$dev/error_injection"
+
+	echo "Testing valid rules"
+	echo "add,op=WRITE,status=RESOURCE,start=0,nr_sectors=8" > $debugfs_file
+	echo "add,op=READ,status=IOERR,start=16,nr_sectors=8" > $debugfs_file
+	xfs_io -d -c 'pwrite -q 0 4096' /dev/$dev
+	xfs_io -d -c 'pread -q 0 4096' /dev/$dev
+	xfs_io -d -c 'pwrite -q 4096 4096' /dev/$dev
+	xfs_io -d -c 'pread -q 8192 8192' /dev/$dev
+
+	echo "Testing invalid rules"
+	echo "op=READ,status=IOERR" > $debugfs_file
+	echo "add,op=READ,status=EIO,start=32" > $debugfs_file
+	_exit_scsi_debug
+}
+
+test()
+{
+	echo "Running ${TEST_NAME}"
+
+	local ret
+
+	_test_load_unload
+	_test_rules
+
+	echo "Test complete"
+}
diff --git a/tests/block/044.out b/tests/block/044.out
new file mode 100644
index 000000000000..92efcddf7c8e
--- /dev/null
+++ b/tests/block/044.out
@@ -0,0 +1,9 @@
+Running block/044
+Testing unload without rules
+Testing valid rules
+pwrite: Cannot allocate memory
+pread: Input/output error
+Testing invalid rules
+tests/block/044: line 56: echo: write error: Invalid argument
+tests/block/044: line 57: echo: write error: Invalid argument
+Test complete
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH RFC v2 08/18] fs: add dedicated block device open helpers for filesystems
From: Jan Kara @ 2026-06-22 16:28 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Jan Kara, Christoph Hellwig, Jens Axboe, Alexander Viro,
	linux-block, linux-kernel, linux-fsdevel, Carlos Maiolino,
	linux-xfs, Chris Mason, David Sterba, linux-btrfs,
	Theodore Ts'o, linux-ext4, Gao Xiang, linux-erofs
In-Reply-To: <20260616-work-super-bdev_holder_global-v2-8-7df6b864028e@kernel.org>

On Tue 16-06-26 16:08:24, Christian Brauner wrote:
> Add fs_bdev_file_open_by_{dev,path}() and fs_bdev_file_release(). They
> open the device with fs_holder_ops and register a claim in the
> device-to-superblock table. Claims on the same (device, superblock)
> pair share one entry, so when a filesystem claims a device it already
> uses (xfs with its log on the data device), no second entry is added
> and each superblock will be acted on once.
> 
> The holder argument remains purely the block layer's exclusivity token:
> a superblock, or a file_system_type for a device shared by several
> superblocks of that type. The shared case only becomes usable once the
> fs_holder_ops callbacks resolve superblocks through the table instead
> of bdev->bd_holder.
> 
> Convert the main device, setup_bdev_super() and kill_block_super(),
> over: the open finds the entry registered by sget_fc() and claims it
> again. cramfs and romfs bypass kill_block_super() so they can handle
> MTD mounts and release the main device with a plain bdev_fput(), which
> would leave the claim behind: the (dev, sb) entry would never be
> unregistered and the passive reference it holds would keep the
> superblock alive forever. Convert their release paths in the same
> step.
> 
> The frozen-device check stays in setup_bdev_super() for the primary
> device and is added to fs_bdev_register() for new claims, i.e. every
> additional device a filesystem opens through the helpers. Only a
> (device, superblock) pair the superblock claimed earlier may be
> reopened while frozen (xfs with its log on the data device): the freeze
> already covers that superblock through the existing claim, so nothing
> escapes it. Without the setup_bdev_super() check a device frozen before
> the mount even started (dm lock_fs, loop) could be mounted and written
> to (journal replay) under an active freeze, because the primary open
> reuses the entry registered by sget_fc() and never takes the new-claim
> path.
> 
> Both checks read bd_fsfreeze_count only after the entry is published
> (by sget_fc() for the primary, by fs_bdev_register() for new claims)
> and pair with bdev_freeze() incrementing the count before walking the
> table: either the mount sees the elevated freeze count and fails with
> EBUSY, or the freeze finds the published entry and converges once
> SB_BORN is set.
> 
> Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>

...

> +static int fs_bdev_register(struct file *bdev_file, struct super_block *sb)
> +{
> +	struct super_dev *sb_dev __free(kfree) = NULL;

Frankly I find the use of __free on sb_dev more confusing than helping in
this function. If you didn't use it, you could remove the somewhat
confusing retain_and_null_ptr() calls below, remove this initialization and
just put one kfree() into the error handling branch when super_dev_insert()
fails...

> +	dev_t dev = file_bdev(bdev_file)->bd_dev;
> +	int err;
> +
> +	scoped_guard(rcu) {
> +		sb_dev = super_dev_lookup(dev, sb);
> +		if (sb_dev && refcount_inc_not_zero(&sb_dev->sd_ref)) {
> +			retain_and_null_ptr(sb_dev);
> +			return 0;
> +		}
> +	}
> +
> +	sb_dev = super_dev_alloc(dev, sb);
> +	if (!sb_dev)
> +		return -ENOMEM;
> +
> +	err = super_dev_insert(sb_dev);
> +	if (err)
> +		return err;
> +
> +	/* Publish the entry before reading the count; pairs with bdev_freeze(). */
> +	smp_mb();
> +	if (atomic_read(&file_bdev(bdev_file)->bd_fsfreeze_count) > 0) {
> +		err = -EBUSY;
> +		super_dev_put(sb_dev);
> +	}
> +
> +	retain_and_null_ptr(sb_dev);
> +	return err;
> +}

...

> +/**
> + * fs_bdev_file_release - release a block device claimed for a superblock
> + * @bdev_file: file returned by fs_bdev_file_open_by_{dev,path}()
> + * @sb: superblock the device was claimed for
> + *
> + * Drop one claim on the {dev, @sb} entry; the last claim unregisters it (a
> + * pinning cursor defers the actual unlink).  Then close the block device.
> + */
> +void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb)
> +{
> +	dev_t dev = file_bdev(bdev_file)->bd_dev;
> +	struct super_dev *sb_dev;
> +
> +	rcu_read_lock();
> +	sb_dev = super_dev_lookup(dev, sb);
> +	rcu_read_unlock();
> +	super_dev_put(sb_dev);
> +	bdev_fput(bdev_file);
> +}
> +EXPORT_SYMBOL_GPL(fs_bdev_file_release);

Why don't you use sb->s_super_dev in this function?

Otherwise the patch looks good to me.

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH RFC v2 08/18] fs: add dedicated block device open helpers for filesystems
From: Jan Kara @ 2026-06-22 16:34 UTC (permalink / raw)
  To: Christian Brauner
  Cc: Jan Kara, Christoph Hellwig, Jens Axboe, Alexander Viro,
	linux-block, linux-kernel, linux-fsdevel, Carlos Maiolino,
	linux-xfs, Chris Mason, David Sterba, linux-btrfs,
	Theodore Ts'o, linux-ext4, Gao Xiang, linux-erofs
In-Reply-To: <xlfnmwv2upjia6ozd4z5l5icaewor4a6cgkafnigulndzmt6r7@rhay3h3wablo>

On Mon 22-06-26 18:28:50, Jan Kara wrote:
> On Tue 16-06-26 16:08:24, Christian Brauner wrote:
> > Add fs_bdev_file_open_by_{dev,path}() and fs_bdev_file_release(). They
> > open the device with fs_holder_ops and register a claim in the
> > device-to-superblock table. Claims on the same (device, superblock)
> > pair share one entry, so when a filesystem claims a device it already
> > uses (xfs with its log on the data device), no second entry is added
> > and each superblock will be acted on once.
> > 
> > The holder argument remains purely the block layer's exclusivity token:
> > a superblock, or a file_system_type for a device shared by several
> > superblocks of that type. The shared case only becomes usable once the
> > fs_holder_ops callbacks resolve superblocks through the table instead
> > of bdev->bd_holder.
> > 
> > Convert the main device, setup_bdev_super() and kill_block_super(),
> > over: the open finds the entry registered by sget_fc() and claims it
> > again. cramfs and romfs bypass kill_block_super() so they can handle
> > MTD mounts and release the main device with a plain bdev_fput(), which
> > would leave the claim behind: the (dev, sb) entry would never be
> > unregistered and the passive reference it holds would keep the
> > superblock alive forever. Convert their release paths in the same
> > step.
> > 
> > The frozen-device check stays in setup_bdev_super() for the primary
> > device and is added to fs_bdev_register() for new claims, i.e. every
> > additional device a filesystem opens through the helpers. Only a
> > (device, superblock) pair the superblock claimed earlier may be
> > reopened while frozen (xfs with its log on the data device): the freeze
> > already covers that superblock through the existing claim, so nothing
> > escapes it. Without the setup_bdev_super() check a device frozen before
> > the mount even started (dm lock_fs, loop) could be mounted and written
> > to (journal replay) under an active freeze, because the primary open
> > reuses the entry registered by sget_fc() and never takes the new-claim
> > path.
> > 
> > Both checks read bd_fsfreeze_count only after the entry is published
> > (by sget_fc() for the primary, by fs_bdev_register() for new claims)
> > and pair with bdev_freeze() incrementing the count before walking the
> > table: either the mount sees the elevated freeze count and fails with
> > EBUSY, or the freeze finds the published entry and converges once
> > SB_BORN is set.
> > 
> > Signed-off-by: Christian Brauner (Amutable) <brauner@kernel.org>

...

> > +/**
> > + * fs_bdev_file_release - release a block device claimed for a superblock
> > + * @bdev_file: file returned by fs_bdev_file_open_by_{dev,path}()
> > + * @sb: superblock the device was claimed for
> > + *
> > + * Drop one claim on the {dev, @sb} entry; the last claim unregisters it (a
> > + * pinning cursor defers the actual unlink).  Then close the block device.
> > + */
> > +void fs_bdev_file_release(struct file *bdev_file, struct super_block *sb)
> > +{
> > +	dev_t dev = file_bdev(bdev_file)->bd_dev;
> > +	struct super_dev *sb_dev;
> > +
> > +	rcu_read_lock();
> > +	sb_dev = super_dev_lookup(dev, sb);
> > +	rcu_read_unlock();
> > +	super_dev_put(sb_dev);
> > +	bdev_fput(bdev_file);
> > +}
> > +EXPORT_SYMBOL_GPL(fs_bdev_file_release);
> 
> Why don't you use sb->s_super_dev in this function?

Never mind, I've realized an sb can hold multiple bdevs, so this is really
needed.

I'd still prefer the __free handling in fs_bdev_register() sorted out but
regardless feel free to add:

Reviewed-by: Jan Kara <jack@suse.cz>

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply

* Re: [PATCH v6 1/4] block: add task-context bio completion infrastructure
From: Tal Zussman @ 2026-06-22 16:45 UTC (permalink / raw)
  To: Jan Kara
  Cc: Christoph Hellwig, Jens Axboe, Matthew Wilcox (Oracle),
	Christian Brauner, Darrick J. Wong, Carlos Maiolino,
	Alexander Viro, Dave Chinner, Bart Van Assche, linux-block,
	linux-kernel, linux-xfs, linux-fsdevel, linux-mm, Gao Xiang
In-Reply-To: <4jtsjd2sbsn2w7fzfwb7wwowz72r4kc6345ckkkcjoxjbbwjwn@rkl44x3o3sgy>

On 6/18/26 10:26 AM, Jan Kara wrote:
> On Mon 01-06-26 13:04:41, Jan Kara wrote:
>> On Fri 29-05-26 16:46:15, Tal Zussman wrote:
>> > On 5/27/26 9:00 AM, Christoph Hellwig wrote:
>> > > On Wed, May 27, 2026 at 11:42:28AM +0200, Jan Kara wrote:
>> > >> > I ran some experiments with fio on both XFS and a raw block device. Five
>> > >> > iterations each for 60s. Results below.
>> > >> > 
>> > >> > TLDR: Removing the delay doesn't significantly decrease user-visible
>> > >> > latency or otherwise improve performance, but does significantly reduce
>> > >> > throughput and increase context switches in some workloads (e.g. C).
>> > >> > I think it makes sense to leave the delay as-is. Thoughts?
>> > >> 
>> > >> Thanks for the test! One question below:
>> > > 
>> > > Thanks from me as well!
>> > > 
>> > >> 
>> > >> > Results:
>> > >> > 
>> > >> > Workloads (all `uncached=1`):
>> > >> >   A: rw=write     bs=128k iodepth=1   ioengine=pvsync2     # XFS
>> > >> >   B: rw=write     bs=128k iodepth=128 ioengine=io_uring    # XFS
>> > >> >   C: rw=randwrite bs=4k   iodepth=32  ioengine=io_uring    # XFS
>> > >> >   D: rw=rw 50/50  bs=64k  iodepth=32  ioengine=io_uring    # XFS
>> > >> >   E: rw=write     bs=128k iodepth=128 ioengine=io_uring    # raw /dev/nvmeXn1
>> > >> >   F: rw=write     bs=128k iodepth=128 numjobs=4
>> > >> >      + vm.dirty_bytes=64MB, vm.dirty_background_bytes=32MB # XFS
>> > >> > 
>> > >> > Mean ± stddev across 5 iterations:
>> > >> > 
>> > >> >     metric                     delay=1           delay=0     delta
>> > >> >     --------------------------------------------------------------
>> > >> > 
>> > >> >   A seq 128k qd1
>> > >> >     BW (MB/s)                4333 ± 27         4374 ± 34     +0.9%
>> > >> >     p99   (us)              36.2 ± 0.8        35.8 ± 0.4     -1.1%
>> > >> >     p999  (us)               3260 ± 75         3228 ± 29     -1.0%
>> > >> >     ctx-switches          184 k ± 59 k     3.68 M ± 65 k    +1903%
>> > >> >     cs / io                0.09 ± 0.03       1.86 ± 0.03    +1888%
>> > >> >     avg bios/run            80.4 ± 0.6         1.1 ± 0.0    -98.7%
>> > >> 
>> > >> So 1 jiffie delay is (with default HZ=1000) 1ms. That means for this load
>> > >> the completion latency should be at least 1000us but your results show p99
>> > >> latency of 36. What am I missing?
>> > > 
>> > > Yes, this looks a bit odd.  Unless there are multiple threads submitting
>> > > and somehow the completions get batched this should complete one
>> > > bio at a time and be the worst case for the delay scheme.
>> > 
>> > Sorry, I should've clarified - the latency here is the userspace-visible
>> > I/O completion latency (i.e. fio's clat value).
>> > 
>> > I ran again and traced to get the actual time from __bio_complete_in_task()
>> > to calling ->bi_end_io(). The results match the 1 jiffie delay now:
>> > 
>> >   metric                  delay=1  delay=0
>> > 
>> >   A seq 128k qd1
>> >     fio clat p99             38us     36us
>> >     bio cb p50             1.23ms    2.5us
>> >     bio cb p99             4.13ms   1.44ms
>> >     bio cb p999            5.01ms   2.63ms
>> 
>> So I'm clearly missing something fundamental as I don't see how the fio
>> reported IO completion time can be lower than the end_io callback latency...
>> Ahh, it is the strange meaning of clat in fio in combination with sync
>> engine where clat means: "how long after the syscall has returned the data
>> is ready". Which for sync engine is immediately so the clat number is
>> meaningless. I think reporting 'lat' numbers from fio would make more
>> sense but whatever.
>> 
>> The bio cb latency indeed looks like what I'd roughly expect now. And
>> notice how the median latency of IO completion is 1.23ms in delay=1 case
>> and your throughput isn't abysmal only because writes end up accumulating
>> in the page cache and writeback infrastructure ends up submitting a lot of
>> writeback IOs in parallel (you have ~80 bios to complete per run which
>> amortizes the latency to decent level).
>> 
>> However if you'd have IO that were to use BIO_COMPLETE_IN_TASK
>> infrastructure which doesn't have so many IOs in flight (like direct IO
>> with lower queue depth which has to do extent conversion on completion),
>> you would very much see the latency hit on your throughput as well. In the
>> extreme case of qd=1 direct IO you'd reduce the throughput to ~4MB/s.
>> 
>> Now I'm not saying the delay is bad - it is a tradeoff with clear wins in
>> CPU overhead your benchmarks are showing. I just wanted to point out
>> there's also the cost side which your benchmarks don't show very clearly.
>> So we might need to keep some stats showing how many IO completions we are
>> offloading per second on each CPU and switch to delaying the work only once
>> it crosses a threshold like 1000000/HZ per second or so (so we at most
>> double the IO latency by delaying the end_io callback).
> 
> Any progress here? The patchset looks really promising so I'd love to have
> it completed :)
> 
Sorry for the delay - got caught up with some other work and had to set this
aside for a couple weeks, but haven't forgotten about this. Planning to pick
it back up some time this week.

Thanks,
Tal


^ permalink raw reply

* Re: [PATCH blktests] Fix _get_page_size()
From: Omar Sandoval @ 2026-06-22 17:31 UTC (permalink / raw)
  To: Shin'ichiro Kawasaki; +Cc: Bart Van Assche, Jeff Moyer, linux-block, kch
In-Reply-To: <ajkeGQd-0LnKJbHN@shinmob>

On Mon, Jun 22, 2026 at 08:38:48PM +0900, Shin'ichiro Kawasaki wrote:
> On Jun 20, 2026 / 09:11, Bart Van Assche wrote:
> > On 6/20/26 6:51 AM, Shin'ichiro Kawasaki wrote:
> > > On Jun 20, 2026 / 05:55, Bart Van Assche wrote:
> > > > On 6/20/26 3:26 AM, Shin'ichiro Kawasaki wrote:
> > > > > This is a rather fundamental change, so I would like to ask opinions from
> > > > > other blktests users, especially Omar and Chaitanya. What do you think about
> > > > > the idea to add getconf to the requirement list?
> > > > 
> > > > CONFIG_PAGE_SHIFT was introduced in the Linux kernel in February 2024
> > > > (commit ba89f9c8ccba ("arch: consolidate existing CONFIG_PAGE_SIZE_*KB
> > > > definitions")). Older kernels had CONFIG_PAGE_SIZE_4KB,
> > > > CONFIG_PAGE_SIZE_16KB, etc. This means that it is possible to derive the
> > > > kernel page size from the kernel configuration file for all upstream and
> > > > distro kernels, isn't it?
> > > 
> > > I checked the commit is in the tag v6.9. My Debian bookworm system has kernel
> > > v6.1, then the config file at /boot does not have CONFIG_PAGE_SHIFT as expected.
> > > But it does not have CONFIG_PAGE_SIZE_* either... I'm still afraid that kernel
> > > config file approach is not reliable.
> > 
> > Right, for older kernels CONFIG_PAGE_SIZE_*KB is only available for some
> > but not for all supported architectures.
> > 
> > > It is not clear to me where the desire to avoid the dependency on
> > > getconf comes from. As far as I know it is available on all Linux
> > > distros. Since it is typically included in the C library package it
> > should not introduce a new dependency.
> 
> I think fewer dependencies are better in general, and wanted to confirm that
> it is fine for everybody. If there is no voice to object, I will create a
> patch to add getconf to the requirement list.

I agree with Bart, getconf is ubiquitous enough that it's not worth
trying to hack around its absence. In my opinion, parsing kernel config
options should be a last resort. If anyone complains about the getconf
dependency in the future, I think it'd be better to add a simple
src/pagesize.c file that uses sysconf(_SC_PAGESIZE), but I don't expect
that to be necessary.

Omar

^ permalink raw reply

* Re: [PATCH 1/1] block: validate user space vectors during extraction
From: Keith Busch @ 2026-06-22 17:40 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, linux-block, linux-fsdevel, dm-devel, axboe, brauner,
	djwong, viro, stable
In-Reply-To: <ajPv7yOoYsR5O6kf@kbusch-mbp>

On Thu, Jun 18, 2026 at 07:17:35AM -0600, Keith Busch wrote:
>
> But since you mention it, __blkdev_direct_IO's handling does look wrong,
> so maybe I can clean that up first.

After careful review, I think __blkdev_direct_IO() is mostly correct
in what it's doing. It looks weird, but appears to be well optimized for
the common case such that making it look more readable would produce
less efficient execution code. So I'm not going to touch it, but there is
a bug here with metadata mapping error handling that I'm going to
propose a fix for in the next version.

^ permalink raw reply

* [PATCHv2 0/6] direct-io: validate user space vectors during extraction
From: Keith Busch @ 2026-06-22 17:42 UTC (permalink / raw)
  To: linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch

From: Keith Busch <kbusch@kernel.org>

This addresses the misaligned direct-io problem behind various threads:

 https://lore.kernel.org/linux-xfs/20260610145218.141369-1-cem@kernel.org/
 https://lore.kernel.org/all/CAC_j7i1R7oy+nRhxEjCTba=DUgn02w9X+p94DCu0aHv5+5tKnQ@mail.gmail.com/
 https://lore.kernel.org/linux-block/ai7rnH20IYeSmY8s@gallifrey/
 https://lore.kernel.org/linux-block/20260616154009.2123183-1-kbusch@meta.com/

The previously tested fixes are correct as far as they go, but they
treat the symptom: they only matter because an invalid bio reaches those
drivers in the first place.

The reason it reaches them is an assumption I made when I removed
direct-io alignment checks in 5ff3f74e145a ("block: simplify direct io
validity check") and 7eac331869575 ("iomap: simplify direct io validity
check"): every bio is eventually split to the device limits, and the
upper layers cope with resulting errors once the bio has formed. Both
were optimistic assumptions. Drivers with their own ->submit_bio may
never pass through blk_mq_submit_bio()'s split, so the check never runs
for them, and as numerous threads showed, the consumers don't uniformly
handle this condition.

This patch stops the invalid bio at the source instead. It validates the
buffer's alignment against the alignment limits when the bio is built
from the iov_iter. The check is folded into the bvec extraction that
already walks the vectors, so it adds only a comparison on a path that
is pinning direct-io pages anyway. Misalignment is now uniformly
rejected with EINVAL before submission for every direct-io path.

With this in place, the dm side changes under discussion are no longer
required to fix the bugs: the affected targets simply never see the
invalid bio. The tested patches remain reasonable as defense-in-depth if
desired, but they are not strictly necessary after this.

v1->v2:

 I've included some prep patches that fix other issues in this path.

 Renamed the alignment to "mem_align_mask", re-ordered the function
 parameters so it appears before the length alignment, and added the
 appropriate kerneldoc.

 Added additional comments to explain the rationale behind the checks.

 For DEBUG kernels, a bio_vec iterator is checked in its entirety. The
 existing use cases appear to only need the first vector to be checked,
 so the more expensive exhaustive check is only happening for the debug
 kernels.

Keith Busch (6):
  block: introduce bio_endio_errno helper
  block: report the actual status
  block: fix dio leak on metadata mapping error
  loop: set dma_alignment from the backing file for direct I/O
  zloop: set dma_alignment from the backing files for direct I/O
  block: validate user space vectors during extraction

 block/bio.c            | 50 +++++++++++++++++++++++++++++++++++++++---
 block/blk-map.c        |  2 +-
 block/blk-merge.c      |  4 ++--
 block/fops.c           |  9 +++++---
 drivers/block/loop.c   | 50 +++++++++++++++++++++++++++++++++++-------
 drivers/block/zloop.c  | 22 +++++++++++++++++--
 fs/iomap/direct-io.c   |  1 +
 include/linux/bio.h    |  2 +-
 include/linux/blkdev.h |  5 +++++
 include/linux/uio.h    |  3 ++-
 lib/iov_iter.c         |  9 +++++++-
 11 files changed, 135 insertions(+), 22 deletions(-)

-- 
2.52.0

^ permalink raw reply

* [PATCHv2 3/6] block: fix dio leak on metadata mapping error
From: Keith Busch @ 2026-06-22 17:42 UTC (permalink / raw)
  To: linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
In-Reply-To: <20260622174241.2299563-1-kbusch@meta.com>

From: Keith Busch <kbusch@kernel.org>

A failed integrity mapping holds a dio reference, so we need to go
through the full bio ending in case there were previously submitted
bios in the sequence.

Fixes: 2729a60bbfb92 ("block: don't silently ignore metadata for sync read/write")
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/fops.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index f237d6cab8975..b5c320da28123 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -238,8 +238,10 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		}
 		if (iocb->ki_flags & IOCB_HAS_METADATA) {
 			ret = bio_integrity_map_iter(bio, iocb->private);
-			if (unlikely(ret))
-				goto fail;
+			if (unlikely(ret)) {
+				bio_endio_errno(bio, ret);
+				break;
+			}
 		}
 
 		if (is_read) {
-- 
2.52.0


^ permalink raw reply related

* [PATCHv2 1/6] block: introduce bio_endio_errno helper
From: Keith Busch @ 2026-06-22 17:42 UTC (permalink / raw)
  To: linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
In-Reply-To: <20260622174241.2299563-1-kbusch@meta.com>

From: Keith Busch <kbusch@kernel.org>

No functional change; purely introducing a convenience function.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 block/blk-merge.c      | 4 ++--
 include/linux/blkdev.h | 5 +++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index ab1161ca69f1e..c93170f340977 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -122,7 +122,7 @@ struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors,
 	struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs);
 
 	if (IS_ERR(split)) {
-		bio_endio_status(bio, errno_to_blk_status(PTR_ERR(split)));
+		bio_endio_errno(bio, PTR_ERR(split));
 		return NULL;
 	}
 
@@ -142,7 +142,7 @@ EXPORT_SYMBOL_GPL(bio_submit_split_bioset);
 static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
 {
 	if (unlikely(split_sectors < 0)) {
-		bio_endio_status(bio, errno_to_blk_status(split_sectors));
+		bio_endio_errno(bio, split_sectors);
 		return NULL;
 	}
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9213a5716f95a..88e4bd88c3e28 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1047,6 +1047,11 @@ extern const char *blk_op_str(enum req_op op);
 int blk_status_to_errno(blk_status_t status);
 blk_status_t errno_to_blk_status(int errno);
 
+static inline void bio_endio_errno(struct bio *bio, int errno)
+{
+	bio_endio_status(bio, errno_to_blk_status(errno));
+}
+
 /* only poll the hardware once, don't continue until a completion was found */
 #define BLK_POLL_ONESHOT		(1 << 0)
 int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags);
-- 
2.52.0


^ permalink raw reply related

* [PATCHv2 4/6] loop: set dma_alignment from the backing file for direct I/O
From: Keith Busch @ 2026-06-22 17:42 UTC (permalink / raw)
  To: linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
In-Reply-To: <20260622174241.2299563-1-kbusch@meta.com>

From: Keith Busch <kbusch@kernel.org>

Direct I/O user pages are forwarded to the backing file unchanged, so
the backing file's DMA alignment requirement applies to them. Track the
backing file's dio_mem_align and advertise it as the loop device's
dma_alignment so that the reported limits are correct and misaligned I/O
is rejected here instead of being dispatched to the backend.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/block/loop.c | 50 +++++++++++++++++++++++++++++++++++++-------
 1 file changed, 42 insertions(+), 8 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 310de0463beb1..7114f80ab162a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -54,6 +54,7 @@ struct loop_device {
 
 	struct file	*lo_backing_file;
 	unsigned int	lo_min_dio_size;
+	unsigned int	lo_dio_mem_align;
 	struct block_device *lo_device;
 
 	gfp_t		old_gfp_mask;
@@ -447,26 +448,37 @@ static void loop_reread_partitions(struct loop_device *lo)
 			__func__, lo->lo_number, lo->lo_file_name, rc);
 }
 
-static unsigned int loop_query_min_dio_size(struct loop_device *lo)
+static void loop_update_dio_alignment(struct loop_device *lo)
 {
 	struct file *file = lo->lo_backing_file;
 	struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev;
 	struct kstat st;
 
 	/*
-	 * Use the minimal dio alignment of the file system if provided.
+	 * Use the dio alignment of the file system if provided.  dio_offset_align
+	 * is the minimum dio size and offset; dio_mem_align is the buffer memory
+	 * alignment, kept as a mask to become the loop device's dma_alignment in
+	 * direct I/O mode where the buffer is handed to the backing file unchanged.
 	 */
 	if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) &&
-	    (st.result_mask & STATX_DIOALIGN))
-		return st.dio_offset_align;
+	    (st.result_mask & STATX_DIOALIGN)) {
+		lo->lo_min_dio_size = st.dio_offset_align;
+		lo->lo_dio_mem_align = st.dio_mem_align - 1;
+		return;
+	}
 
 	/*
 	 * In a perfect world this wouldn't be needed, but as of Linux 6.13 only
 	 * a handful of file systems support the STATX_DIOALIGN flag.
 	 */
-	if (sb_bdev)
-		return bdev_logical_block_size(sb_bdev);
-	return SECTOR_SIZE;
+	if (sb_bdev) {
+		lo->lo_min_dio_size = bdev_logical_block_size(sb_bdev);
+		lo->lo_dio_mem_align = bdev_dma_alignment(sb_bdev);
+		return;
+	}
+
+	lo->lo_min_dio_size = SECTOR_SIZE;
+	lo->lo_dio_mem_align = SECTOR_SIZE - 1;
 }
 
 static inline int is_loop_device(struct file *file)
@@ -509,7 +521,7 @@ static void loop_assign_backing_file(struct loop_device *lo, struct file *file)
 			lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS));
 	if (lo->lo_backing_file->f_flags & O_DIRECT)
 		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
-	lo->lo_min_dio_size = loop_query_min_dio_size(lo);
+	loop_update_dio_alignment(lo);
 }
 
 static int loop_check_backing_file(struct file *file)
@@ -961,6 +973,17 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
 	lim->logical_block_size = bsize;
 	lim->physical_block_size = bsize;
 	lim->io_min = bsize;
+	/*
+	 * In direct I/O the user pages are handed to the backing file as-is, so
+	 * the backing's DMA alignment requirement applies to them.  Advertise it
+	 * so misaligned I/O is rejected at this device's entry instead of being
+	 * dispatched to the backend.  Buffered I/O copies through the page cache
+	 * and imposes no such requirement.
+	 */
+	if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
+		lim->dma_alignment = lo->lo_dio_mem_align;
+	else
+		lim->dma_alignment = SECTOR_SIZE - 1;
 	lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
 	if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY))
 		lim->features |= BLK_FEAT_WRITE_CACHE;
@@ -1416,6 +1439,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
 {
 	bool use_dio = !!arg;
 	unsigned int memflags;
+	struct queue_limits lim;
 
 	if (lo->lo_state != Lo_bound)
 		return -ENXIO;
@@ -1434,6 +1458,16 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
 		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
 	else
 		lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
+	/*
+	 * Direct I/O forwards the user pages to the backing file unchanged, so
+	 * track the backing's DMA alignment requirement as the mode is toggled.
+	 */
+	lim = queue_limits_start_update(lo->lo_queue);
+	if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
+		lim.dma_alignment = lo->lo_dio_mem_align;
+	else
+		lim.dma_alignment = SECTOR_SIZE - 1;
+	queue_limits_commit_update(lo->lo_queue, &lim);
 	blk_mq_unfreeze_queue(lo->lo_queue, memflags);
 	return 0;
 }
-- 
2.52.0


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox