public inbox for linux-nvme@lists.infradead.org
 help / color / mirror / Atom feed
* [PATCH 0/2] nvme: blktests bug fix for 6.19
@ 2025-11-23 19:18 Chaitanya Kulkarni
  2025-11-23 19:18 ` [PATCH BUG FIX 1/2] nvme-tcp: use __fput_sync() to avoid use-after-free on reset Chaitanya Kulkarni
  2025-11-23 19:18 ` [PATCH BUG FIX 2/2] nvme-multipath: clear BIO_QOS flags on requeue Chaitanya Kulkarni
  0 siblings, 2 replies; 7+ messages in thread
From: Chaitanya Kulkarni @ 2025-11-23 19:18 UTC (permalink / raw)
  To: kbusch, axboe, hch, sagi; +Cc: linux-nvme, Chaitanya Kulkarni

Hi,

While testing my discard return value series I discovered two blktests
failures. Here are two fixes; blktests results after applying these
patches are shown below.

-ck

Chaitanya Kulkarni (2):
  nvme-tcp: use __fput_sync() to avoid use-after-free on reset
  nvme-multipath: clear BIO_QOS flags on requeue

 drivers/nvme/host/multipath.c | 10 ++++++++++
 drivers/nvme/host/tcp.c       |  9 +++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

nvme (nvme-6.19) # gitlog -3
0e53aeec7e61 (HEAD -> nvme-6.19) nvme-multipath: clear BIO_QOS flags on requeue
d0147be50d71 nvme-tcp: use __fput_sync() to avoid use-after-free on reset
1c2ad96dbb33 (origin/nvme-6.19) nvme: Fix typo error in nvme target
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # ./compile_nvme.sh
+ unload
+ sh ./unload-vfio-nvme.sh
rmmod: ERROR: Module drivers/vfio/pci/nvme/nvme_vfio_pci is not currently loaded
rmmod: ERROR: Module vfio_pci is not currently loaded
rmmod: ERROR: Module vfio_pci_core is not currently loaded
rmmod: ERROR: Module vfio_iommu_type1 is not currently loaded
rmmod: ERROR: Module vfio is not currently loaded
############################## UNLOAD #############################
nvme_loop              20480  0
nvmet                 221184  1 nvme_loop
nvme_tcp               90112  0
nvme_fabrics           40960  2 nvme_tcp,nvme_loop
nvme_keyring           20480  3 nvmet,nvme_tcp,nvme_fabrics
nvme                   69632  0
nvme_core             233472  5 nvmet,nvme_tcp,nvme,nvme_loop,nvme_fabrics
umount: /mnt/nvme0n1: no mount point specified.
NQN:testnqn disconnected 0 controller(s)

real	0m0.011s
user	0m0.001s
sys	0m0.010s
rmdir: failed to remove '/sys/kernel/config/nvmet/ports/1': No such file or directory
./delete.sh: line 14: /sys/kernel/config/nvmet/subsystems/*/namespaces/*/enable: No such file or directory
rmdir: failed to remove '/sys/kernel/config/nvmet/subsystems/*/namespaces/*': No such file or directory
rmdir: failed to remove '/sys/kernel/config/nvmet/subsystems/*': No such file or directory
rmdir: failed to remove 'config/nullb/nullb*': No such file or directory
umount: /mnt/nvme0n1: no mount point specified.
umount: /mnt/backend: not mounted.
############################## DELETE #############################
nvme_tcp               90112  0
nvme_fabrics           40960  1 nvme_tcp
nvme_keyring           20480  2 nvme_tcp,nvme_fabrics
nvme                   69632  0
nvme_core             233472  3 nvme_tcp,nvme,nvme_fabrics
nvme_tcp               90112  0
nvme_fabrics           40960  1 nvme_tcp
nvme_keyring           20480  2 nvme_tcp,nvme_fabrics
nvme                   69632  0
nvme_core             233472  3 nvme_tcp,nvme,nvme_fabrics
nvme                   69632  0
nvme_core             233472  1 nvme
nvme                   69632  0
nvme_core             233472  1 nvme
modprobe: FATAL: Module nvme_keryring not found.
modprobe: FATAL: Module nvme_auth not found.

3 directories, 0 files
############################## UNLOAD #############################
### nvme_loop unload 
### nvmet unload 
### nvme_tcp unload 
### nvme_fabrics unload 
### nvme unload 
### nvme_core unload 
### nvme_keryring unload 
### nvme_auth unload 
modprobe: FATAL: Module nvme_auth not found.

make[1]: Entering directory '/mnt/data100G/nvme/drivers/nvme'
make[1]: Leaving directory '/mnt/data100G/nvme/drivers/nvme'
+ ls -lrth /lib/modules/6.17.0-rc3nvme+/kernel/drivers/nvme/host/ /lib/modules/6.17.0-rc3nvme+/kernel/drivers/nvme/target//
/lib/modules/6.17.0-rc3nvme+/kernel/drivers/nvme/host/:
total 7.9M
-rw-r--r--. 1 root root 4.2M Nov 23 10:21 nvme-core.ko
-rw-r--r--. 1 root root 592K Nov 23 10:21 nvme-fabrics.ko
-rw-r--r--. 1 root root 1.2M Nov 23 10:21 nvme-fc.ko
-rw-r--r--. 1 root root 929K Nov 23 10:21 nvme.ko
-rw-r--r--. 1 root root 1.2M Nov 23 10:21 nvme-tcp.ko

/lib/modules/6.17.0-rc3nvme+/kernel/drivers/nvme/target//:
total 9.5M
-rw-r--r--. 1 root root  660K Nov 23 10:21 nvme-fcloop.ko
-rw-r--r--. 1 root root  562K Nov 23 10:21 nvme-loop.ko
-rw-r--r--. 1 root root 1009K Nov 23 10:21 nvmet-fc.ko
-rw-r--r--. 1 root root  4.6M Nov 23 10:21 nvmet.ko
-rw-r--r--. 1 root root  765K Nov 23 10:21 nvmet-pci-epf.ko
-rw-r--r--. 1 root root  1.1M Nov 23 10:21 nvmet-rdma.ko
-rw-r--r--. 1 root root  992K Nov 23 10:21 nvmet-tcp.ko
+ sync
+ modprobe nvme-core
+ modprobe nvme
+ modprobe nvme-fabrics
+ modprobe nvme-tcp
+ modprobe nvme_loop
+ modprobe nvmet
+ lsmod
+ grep nvme
nvme_loop              20480  0
nvmet                 221184  1 nvme_loop
nvme_tcp               90112  0
nvme_fabrics           40960  2 nvme_tcp,nvme_loop
nvme_keyring           20480  3 nvmet,nvme_tcp,nvme_fabrics
nvme                   69632  0
nvme_core             233472  5 nvmet,nvme_tcp,nvme,nvme_loop,nvme_fabrics
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # 
nvme (nvme-6.19) # cdblktests 
blktests (master) # 
blktests (master) # 
blktests (master) # 
blktests (master) # 
blktests (master) # 

blktests (master) # ./test-nvme.sh
++ for t in loop tcp
++ echo '################nvme_trtype=loop############'
################nvme_trtype=loop############
++ nvme_img_size=900M
++ nvme_num_iter=1
++ nvme_trtype=loop
++ ./check nvme
nvme/002 (tr=loop) (create many subsystems and test discovery) [passed]
    runtime  40.593s  ...  41.344s
nvme/003 (tr=loop) (test if we're sending keep-alives to a discovery controller) [passed]
    runtime  10.256s  ...  10.266s
nvme/004 (tr=loop) (test nvme and nvmet UUID NS descriptors) [passed]
    runtime  0.893s  ...  0.911s
nvme/005 (tr=loop) (reset local loopback target)             [passed]
    runtime  1.449s  ...  1.482s
nvme/006 (tr=loop bd=device) (create an NVMeOF target)       [passed]
    runtime  0.119s  ...  0.112s
nvme/006 (tr=loop bd=file) (create an NVMeOF target)         [passed]
    runtime  0.082s  ...  0.078s
nvme/008 (tr=loop bd=device) (create an NVMeOF host)         [passed]
    runtime  0.907s  ...  0.909s
nvme/008 (tr=loop bd=file) (create an NVMeOF host)           [passed]
    runtime  0.879s  ...  0.867s
nvme/010 (tr=loop bd=device) (run data verification fio job) [passed]
    runtime  38.757s  ...  71.268s
nvme/010 (tr=loop bd=file) (run data verification fio job)   [passed]
    runtime  101.443s  ...  175.439s
nvme/012 (tr=loop bd=device) (run mkfs and data verification fio) [passed]
    runtime  37.683s  ...  82.689s
nvme/012 (tr=loop bd=file) (run mkfs and data verification fio) [passed]
    runtime  81.227s  ...  156.973s
nvme/014 (tr=loop bd=device) (flush a command from host)     [passed]
    runtime  6.871s  ...  15.076s
nvme/014 (tr=loop bd=file) (flush a command from host)       [passed]
    runtime  6.281s  ...  10.713s
nvme/016 (tr=loop) (create/delete many NVMeOF block device-backed ns and test discovery) [passed]
    runtime  0.165s  ...  0.162s
nvme/017 (tr=loop) (create/delete many file-ns and test discovery) [passed]
    runtime  0.164s  ...  0.162s
nvme/018 (tr=loop) (unit test NVMe-oF out of range access on a file backend) [passed]
    runtime  0.849s  ...  0.852s
nvme/019 (tr=loop bd=device) (test NVMe DSM Discard command) [passed]
    runtime  0.892s  ...  0.871s
nvme/019 (tr=loop bd=file) (test NVMe DSM Discard command)   [passed]
    runtime  0.865s  ...  0.849s
nvme/021 (tr=loop bd=device) (test NVMe list command)        [passed]
    runtime  0.971s  ...  0.887s
nvme/021 (tr=loop bd=file) (test NVMe list command)          [passed]
    runtime  0.903s  ...  0.857s
nvme/022 (tr=loop bd=device) (test NVMe reset command)       [passed]
    runtime  1.506s  ...  1.507s
nvme/022 (tr=loop bd=file) (test NVMe reset command)         [passed]
    runtime  1.441s  ...  1.450s
nvme/023 (tr=loop bd=device) (test NVMe smart-log command)   [passed]
    runtime  0.887s  ...  0.871s
nvme/023 (tr=loop bd=file) (test NVMe smart-log command)     [passed]
    runtime  0.849s  ...  0.872s
nvme/025 (tr=loop bd=device) (test NVMe effects-log)         [passed]
    runtime  0.883s  ...  0.916s
nvme/025 (tr=loop bd=file) (test NVMe effects-log)           [passed]
    runtime  0.855s  ...  0.862s
nvme/026 (tr=loop bd=device) (test NVMe ns-descs)            [passed]
    runtime  0.874s  ...  0.892s
nvme/026 (tr=loop bd=file) (test NVMe ns-descs)              [passed]
    runtime  0.857s  ...  0.895s
nvme/027 (tr=loop bd=device) (test NVMe ns-rescan command)   [passed]
    runtime  0.910s  ...  0.930s
nvme/027 (tr=loop bd=file) (test NVMe ns-rescan command)     [passed]
    runtime  0.887s  ...  0.891s
nvme/028 (tr=loop bd=device) (test NVMe list-subsys)         [passed]
    runtime  0.884s  ...  0.899s
nvme/028 (tr=loop bd=file) (test NVMe list-subsys)           [passed]
    runtime  0.858s  ...  0.851s
nvme/029 (tr=loop) (test userspace IO via nvme-cli read/write interface) [passed]
    runtime  1.111s  ...  1.108s
nvme/030 (tr=loop) (ensure the discovery generation counter is updated appropriately) [passed]
    runtime  0.524s  ...  0.551s
nvme/031 (tr=loop) (test deletion of NVMeOF controllers immediately after setup) [passed]
    runtime  8.303s  ...  8.372s
nvme/038 (tr=loop) (test deletion of NVMeOF subsystem without enabling) [passed]
    runtime  0.029s  ...  0.029s
nvme/040 (tr=loop) (test nvme fabrics controller reset/disconnect operation during I/O) [passed]
    runtime  7.817s  ...  7.811s
nvme/041 (tr=loop) (Create authenticated connections)        [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
nvme/042 (tr=loop) (Test dhchap key types for authenticated connections) [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
nvme/043 (tr=loop) (Test hash and DH group variations for authenticated connections) [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
nvme/044 (tr=loop) (Test bi-directional authentication)      [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
nvme/045 (tr=loop) (Test re-authentication)                  [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
nvme/047 (tr=loop) (test different queue types for fabric transports) [not run]
    nvme_trtype=loop is not supported in this test
nvme/048 (tr=loop) (Test queue count changes on reconnect)   [not run]
    nvme_trtype=loop is not supported in this test
nvme/051 (tr=loop) (test nvmet concurrent ns enable/disable) [passed]
    runtime  2.157s  ...  2.184s
nvme/052 (tr=loop) (Test file-ns creation/deletion under one subsystem) [passed]
    runtime  6.443s  ...  6.458s
nvme/054 (tr=loop) (Test the NVMe reservation feature)       [passed]
    runtime  0.937s  ...  0.933s
nvme/055 (tr=loop) (Test nvme write to a loop target ns just after ns is disabled) [passed]
    runtime  0.917s  ...  0.924s
nvme/056 (tr=loop) (enable zero copy offload and run rw traffic) [not run]
    Remote target required but NVME_TARGET_CONTROL is not set
    nvme_trtype=loop is not supported in this test
    kernel option ULP_DDP has not been enabled
    module nvme_tcp does not have parameter ddp_offload
    KERNELSRC not set
    Kernel sources do not have tools/net/ynl/cli.py
    NVME_IFACE not set
nvme/057 (tr=loop) (test nvme fabrics controller ANA failover during I/O) [passed]
    runtime  37.234s  ...  29.876s
nvme/058 (tr=loop) (test rapid namespace remapping)          [passed]
    runtime  8.093s  ...  7.385s
nvme/060 (tr=loop) (test nvme fabrics target reset)          [not run]
    nvme_trtype=loop is not supported in this test
nvme/061 (tr=loop) (test fabric target teardown and setup during I/O) [not run]
    nvme_trtype=loop is not supported in this test
nvme/062 (tr=loop) (Create TLS-encrypted connections)        [not run]
    nvme_trtype=loop is not supported in this test
    command tlshd is not available
    systemctl unit 'tlshd' is missing
    Install ktls-utils for tlshd
nvme/063 (tr=loop) (Create authenticated TCP connections with secure concatenation) [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
    nvme_trtype=loop is not supported in this test
    command tlshd is not available
    systemctl unit 'tlshd' is missing
    Install ktls-utils for tlshd
nvme/065 (test unmap write zeroes sysfs interface with nvmet devices) [not run]
    driver scsi_debug is not available
++ for t in loop tcp
++ echo '################nvme_trtype=tcp############'
################nvme_trtype=tcp############
++ nvme_img_size=900M
++ nvme_num_iter=1
++ nvme_trtype=tcp
++ ./check nvme
nvme/002 (tr=tcp) (create many subsystems and test discovery) [not run]
    nvme_trtype=tcp is not supported in this test
nvme/003 (tr=tcp) (test if we're sending keep-alives to a discovery controller) [passed]
    runtime  10.267s  ...  10.260s
nvme/004 (tr=tcp) (test nvme and nvmet UUID NS descriptors)  [passed]
    runtime  0.405s  ...  0.410s
nvme/005 (tr=tcp) (reset local loopback target)              [passed]
    runtime  0.507s  ...  0.505s
nvme/006 (tr=tcp bd=device) (create an NVMeOF target)        [passed]
    runtime  0.121s  ...  0.131s
nvme/006 (tr=tcp bd=file) (create an NVMeOF target)          [passed]
    runtime  0.096s  ...  0.093s
nvme/008 (tr=tcp bd=device) (create an NVMeOF host)          [passed]
    runtime  0.396s  ...  0.408s
nvme/008 (tr=tcp bd=file) (create an NVMeOF host)            [passed]
    runtime  0.378s  ...  0.384s
nvme/010 (tr=tcp bd=device) (run data verification fio job)  [passed]
    runtime  3.971s  ...  101.482s
nvme/010 (tr=tcp bd=file) (run data verification fio job)    [passed]
    runtime  0.984s  ...  149.695s
nvme/012 (tr=tcp bd=device) (run mkfs and data verification fio) [passed]
    runtime  2.220s  ...  111.416s
nvme/012 (tr=tcp bd=file) (run mkfs and data verification fio) [passed]
    runtime    ...  144.534s
nvme/014 (tr=tcp bd=device) (flush a command from host)      [passed]
    runtime  6.669s  ...  12.069s
nvme/014 (tr=tcp bd=file) (flush a command from host)        [passed]
    runtime  6.066s  ...  10.806s
nvme/016 (tr=tcp) (create/delete many NVMeOF block device-backed ns and test discovery) [not run]
    nvme_trtype=tcp is not supported in this test
nvme/017 (tr=tcp) (create/delete many file-ns and test discovery) [not run]
    nvme_trtype=tcp is not supported in this test
nvme/018 (tr=tcp) (unit test NVMe-oF out of range access on a file backend) [passed]
    runtime  0.374s  ...  0.382s
nvme/019 (tr=tcp bd=device) (test NVMe DSM Discard command)  [passed]
    runtime  0.410s  ...  0.414s
nvme/019 (tr=tcp bd=file) (test NVMe DSM Discard command)    [passed]
    runtime  0.362s  ...  0.383s
nvme/021 (tr=tcp bd=device) (test NVMe list command)         [passed]
    runtime  0.422s  ...  0.413s
nvme/021 (tr=tcp bd=file) (test NVMe list command)           [passed]
    runtime  0.383s  ...  0.383s
nvme/022 (tr=tcp bd=device) (test NVMe reset command)        [passed]
    runtime  0.522s  ...  0.510s
nvme/022 (tr=tcp bd=file) (test NVMe reset command)          [passed]
    runtime  0.473s  ...  0.488s
nvme/023 (tr=tcp bd=device) (test NVMe smart-log command)    [passed]
    runtime  0.405s  ...  0.390s
nvme/023 (tr=tcp bd=file) (test NVMe smart-log command)      [passed]
    runtime  0.368s  ...  0.350s
nvme/025 (tr=tcp bd=device) (test NVMe effects-log)          [passed]
    runtime  0.421s  ...  0.436s
nvme/025 (tr=tcp bd=file) (test NVMe effects-log)            [passed]
    runtime  0.385s  ...  0.382s
nvme/026 (tr=tcp bd=device) (test NVMe ns-descs)             [passed]
    runtime  0.407s  ...  0.397s
nvme/026 (tr=tcp bd=file) (test NVMe ns-descs)               [passed]
    runtime  0.354s  ...  0.367s
nvme/027 (tr=tcp bd=device) (test NVMe ns-rescan command)    [passed]
    runtime  0.432s  ...  0.446s
nvme/027 (tr=tcp bd=file) (test NVMe ns-rescan command)      [passed]
    runtime  0.411s  ...  0.407s
nvme/028 (tr=tcp bd=device) (test NVMe list-subsys)          [passed]
    runtime  0.395s  ...  0.393s
nvme/028 (tr=tcp bd=file) (test NVMe list-subsys)            [passed]
    runtime  0.367s  ...  0.362s
nvme/029 (tr=tcp) (test userspace IO via nvme-cli read/write interface) [passed]
    runtime  0.626s  ...  0.636s
nvme/030 (tr=tcp) (ensure the discovery generation counter is updated appropriately) [passed]
    runtime  0.376s  ...  0.384s
nvme/031 (tr=tcp) (test deletion of NVMeOF controllers immediately after setup) [passed]
    runtime  3.170s  ...  3.217s
nvme/038 (tr=tcp) (test deletion of NVMeOF subsystem without enabling) [passed]
    runtime  0.035s  ...  0.036s
nvme/040 (tr=tcp) (test nvme fabrics controller reset/disconnect operation during I/O) [passed]
    runtime  6.481s  ...  6.484s
nvme/041 (tr=tcp) (Create authenticated connections)         [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
nvme/042 (tr=tcp) (Test dhchap key types for authenticated connections) [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
nvme/043 (tr=tcp) (Test hash and DH group variations for authenticated connections) [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
nvme/044 (tr=tcp) (Test bi-directional authentication)       [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
nvme/045 (tr=tcp) (Test re-authentication)                   [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
nvme/047 (tr=tcp) (test different queue types for fabric transports) [passed]
    runtime  1.943s  ...  1.936s
nvme/048 (tr=tcp) (Test queue count changes on reconnect)    [passed]
    runtime  5.548s  ...  5.519s
nvme/051 (tr=tcp) (test nvmet concurrent ns enable/disable)  [passed]
    runtime  2.197s  ...  2.234s
nvme/052 (tr=tcp) (Test file-ns creation/deletion under one subsystem) [not run]
    nvme_trtype=tcp is not supported in this test
nvme/054 (tr=tcp) (Test the NVMe reservation feature)        [passed]
    runtime  0.477s  ...  0.453s
nvme/055 (tr=tcp) (Test nvme write to a loop target ns just after ns is disabled) [not run]
    nvme_trtype=tcp is not supported in this test
nvme/056 (tr=tcp) (enable zero copy offload and run rw traffic) [not run]
    Remote target required but NVME_TARGET_CONTROL is not set
    kernel option ULP_DDP has not been enabled
    module nvme_tcp does not have parameter ddp_offload
    KERNELSRC not set
    Kernel sources do not have tools/net/ynl/cli.py
    NVME_IFACE not set
nvme/057 (tr=tcp) (test nvme fabrics controller ANA failover during I/O) [passed]
    runtime  26.300s  ...  31.214s
nvme/058 (tr=tcp) (test rapid namespace remapping)           [passed]
    runtime  4.343s  ...  7.080s
nvme/060 (tr=tcp) (test nvme fabrics target reset)           [passed]
    runtime  19.562s  ...  19.475s
nvme/061 (tr=tcp) (test fabric target teardown and setup during I/O) [passed]
    runtime  8.648s  ...  8.586s
nvme/062 (tr=tcp) (Create TLS-encrypted connections)         [not run]
    command tlshd is not available
    systemctl unit 'tlshd' is missing
    Install ktls-utils for tlshd
nvme/063 (tr=tcp) (Create authenticated TCP connections with secure concatenation) [not run]
    kernel option NVME_AUTH has not been enabled
    kernel option NVME_TARGET_AUTH has not been enabled
    nvme-fabrics does not support dhchap_ctrl_secret
    command tlshd is not available
    systemctl unit 'tlshd' is missing
    Install ktls-utils for tlshd
nvme/065 (test unmap write zeroes sysfs interface with nvmet devices) [not run]
    driver scsi_debug is not available
blktests (master) #

-- 
2.40.0



^ permalink raw reply	[flat|nested] 7+ messages in thread

* [PATCH BUG FIX 1/2] nvme-tcp: use __fput_sync() to avoid use-after-free on reset
  2025-11-23 19:18 [PATCH 0/2] nvme: blktests bug fix for 6.19 Chaitanya Kulkarni
@ 2025-11-23 19:18 ` Chaitanya Kulkarni
  2025-11-24  6:24   ` Christoph Hellwig
  2025-11-23 19:18 ` [PATCH BUG FIX 2/2] nvme-multipath: clear BIO_QOS flags on requeue Chaitanya Kulkarni
  1 sibling, 1 reply; 7+ messages in thread
From: Chaitanya Kulkarni @ 2025-11-23 19:18 UTC (permalink / raw)
  To: kbusch, axboe, hch, sagi; +Cc: linux-nvme, Chaitanya Kulkarni

When a controller reset is triggered via sysfs (by writing to
/sys/class/nvme/<nvmedev>/reset_controller), the reset work tears down
and re-establishes all queues. The socket release using fput() defers
the actual cleanup to the delayed_fput workqueue via task_work. This
deferred cleanup can race with the subsequent queue re-allocation during
reset, potentially leading to use-after-free or resource conflicts.

Replace fput() with __fput_sync() to ensure synchronous socket release,
guaranteeing that all socket resources are fully cleaned up before the
function returns. This prevents races during controller reset where
new queue setup may begin before the old socket is fully released.

* Call chain during reset:
  nvme_reset_ctrl_work()
    -> nvme_tcp_teardown_ctrl()
      -> nvme_tcp_teardown_io_queues()
        -> nvme_tcp_free_io_queues()
          -> nvme_tcp_free_queue()       <-- fput() -> __fput_sync()
      -> nvme_tcp_teardown_admin_queue()
        -> nvme_tcp_free_admin_queue()
          -> nvme_tcp_free_queue()       <-- fput() -> __fput_sync()
    -> nvme_tcp_setup_ctrl()             <-- race with deferred fput

* The issue can be reproduced using blktests:

  nvme_trtype=tcp ./check nvme/005
blktests (master) # nvme_trtype=tcp ./check nvme/005
nvme/005 (tr=tcp) (reset local loopback target)              [failed]
    runtime  0.725s  ...  0.798s
    something found in dmesg:
    [  108.473940] run blktests nvme/005 at 2025-11-22 16:12:20

    [...]

    (See '/root/blktests/results/nodev_tr_tcp/nvme/005.dmesg' for the entire message)
blktests (master) # cat /root/blktests/results/nodev_tr_tcp/nvme/005.dmesg
[  108.473940] run blktests nvme/005 at 2025-11-22 16:12:20
[  108.526983] loop0: detected capacity change from 0 to 2097152
[  108.555606] nvmet: adding nsid 1 to subsystem blktests-subsystem-1
[  108.572531] nvmet_tcp: enabling port 0 (127.0.0.1:4420)
[  108.613061] nvmet: Created nvm controller 1 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349.
[  108.616832] nvme nvme0: creating 48 I/O queues.
[  108.630791] nvme nvme0: mapped 48/0/0 default/read/poll queues.
[  108.661892] nvme nvme0: new ctrl: NQN "blktests-subsystem-1", addr 127.0.0.1:4420, hostnqn: nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349
[  108.746639] nvmet: Created nvm controller 2 for subsystem blktests-subsystem-1 for NQN nqn.2014-08.org.nvmexpress:uuid:0f01fb42-9f7f-4856-b0b3-51e60b8de349.
[  108.748466] nvme nvme0: creating 48 I/O queues.
[  108.802984] nvme nvme0: mapped 48/0/0 default/read/poll queues.
[  108.829983] nvme nvme0: Removing ctrl: NQN "blktests-subsystem-1"
[  108.854288] block nvme0n1: no available path - failing I/O
[  108.854344] block nvme0n1: no available path - failing I/O
[  108.854373] Buffer I/O error on dev nvme0n1, logical block 1, async page read

[  108.891693] ======================================================
[  108.895912] WARNING: possible circular locking dependency detected
[  108.900184] 6.17.0nvme+ #3 Tainted: G                 N
[  108.903913] ------------------------------------------------------
[  108.908171] nvme/2734 is trying to acquire lock:
[  108.911957] ffff88810210e610 (set->srcu){.+.+}-{0:0}, at: __synchronize_srcu+0x17/0x170
[  108.917587]
               but task is already holding lock:
[  108.921570] ffff88813abea198 (&q->elevator_lock){+.+.}-{4:4}, at: elevator_change+0xa8/0x1c0
[  108.927361]
               which lock already depends on the new lock.

[  108.933018]
               the existing dependency chain (in reverse order) is:
[  108.938223]
               -> #4 (&q->elevator_lock){+.+.}-{4:4}:
[  108.942988]        __mutex_lock+0xa2/0x1150
[  108.945873]        elevator_change+0xa8/0x1c0
[  108.948925]        elv_iosched_store+0xdf/0x140
[  108.952043]        kernfs_fop_write_iter+0x16a/0x220
[  108.955367]        vfs_write+0x378/0x520
[  108.957598]        ksys_write+0x67/0xe0
[  108.959721]        do_syscall_64+0x76/0xbb0
[  108.962052]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  108.965145]
               -> #3 (&q->q_usage_counter(io)){++++}-{0:0}:
[  108.968923]        blk_alloc_queue+0x30e/0x350
[  108.972117]        blk_mq_alloc_queue+0x61/0xd0
[  108.974677]        scsi_alloc_sdev+0x2a0/0x3e0
[  108.977092]        scsi_probe_and_add_lun+0x1bd/0x430
[  108.979921]        __scsi_add_device+0x109/0x120
[  108.982504]        ata_scsi_scan_host+0x97/0x1c0
[  108.984365]        async_run_entry_fn+0x2d/0x130
[  108.986109]        process_one_work+0x20e/0x630
[  108.987830]        worker_thread+0x184/0x330
[  108.989473]        kthread+0x10a/0x250
[  108.990852]        ret_from_fork+0x297/0x300
[  108.992491]        ret_from_fork_asm+0x1a/0x30
[  108.994159]
               -> #2 (fs_reclaim){+.+.}-{0:0}:
[  108.996320]        fs_reclaim_acquire+0x99/0xd0
[  108.998058]        kmem_cache_alloc_node_noprof+0x4e/0x3c0
[  109.000123]        __alloc_skb+0x15f/0x190
[  109.002195]        tcp_send_active_reset+0x3f/0x1e0
[  109.004038]        tcp_disconnect+0x50b/0x720
[  109.005695]        __tcp_close+0x2b8/0x4b0
[  109.007227]        tcp_close+0x20/0x80
[  109.008663]        inet_release+0x31/0x60
[  109.010175]        __sock_release+0x3a/0xc0
[  109.011778]        sock_close+0x14/0x20
[  109.013263]        __fput+0xee/0x2c0
[  109.014673]        delayed_fput+0x31/0x50
[  109.016183]        process_one_work+0x20e/0x630
[  109.017897]        worker_thread+0x184/0x330
[  109.019543]        kthread+0x10a/0x250
[  109.020929]        ret_from_fork+0x297/0x300
[  109.022565]        ret_from_fork_asm+0x1a/0x30
[  109.024194]
               -> #1 (sk_lock-AF_INET-NVME){+.+.}-{0:0}:
[  109.026634]        lock_sock_nested+0x2e/0x70
[  109.028251]        tcp_sendmsg+0x1a/0x40
[  109.029783]        sock_sendmsg+0xed/0x110
[  109.031321]        nvme_tcp_try_send_cmd_pdu+0x13e/0x260 [nvme_tcp]
[  109.034263]        nvme_tcp_try_send+0xb3/0x330 [nvme_tcp]
[  109.036375]        nvme_tcp_queue_rq+0x342/0x3d0 [nvme_tcp]
[  109.038528]        blk_mq_dispatch_rq_list+0x297/0x800
[  109.040448]        __blk_mq_sched_dispatch_requests+0x3db/0x5f0
[  109.042677]        blk_mq_sched_dispatch_requests+0x29/0x70
[  109.044787]        blk_mq_run_work_fn+0x76/0x1b0
[  109.046535]        process_one_work+0x20e/0x630
[  109.048245]        worker_thread+0x184/0x330
[  109.049890]        kthread+0x10a/0x250
[  109.051331]        ret_from_fork+0x297/0x300
[  109.053024]        ret_from_fork_asm+0x1a/0x30
[  109.054740]
               -> #0 (set->srcu){.+.+}-{0:0}:
[  109.056850]        __lock_acquire+0x1468/0x2210
[  109.058614]        lock_sync+0xa5/0x110
[  109.060048]        __synchronize_srcu+0x49/0x170
[  109.061802]        elevator_switch+0xc9/0x330
[  109.063950]        elevator_change+0x128/0x1c0
[  109.065675]        elevator_set_none+0x4c/0x90
[  109.067316]        blk_unregister_queue+0xa8/0x110
[  109.069165]        __del_gendisk+0x14e/0x3c0
[  109.070824]        del_gendisk+0x75/0xa0
[  109.072328]        nvme_ns_remove+0xf2/0x230 [nvme_core]
[  109.074365]        nvme_remove_namespaces+0xf2/0x150 [nvme_core]
[  109.076652]        nvme_do_delete_ctrl+0x71/0x90 [nvme_core]
[  109.078775]        nvme_delete_ctrl_sync+0x3b/0x50 [nvme_core]
[  109.081009]        nvme_sysfs_delete+0x34/0x40 [nvme_core]
[  109.083082]        kernfs_fop_write_iter+0x16a/0x220
[  109.085009]        vfs_write+0x378/0x520
[  109.086539]        ksys_write+0x67/0xe0
[  109.087982]        do_syscall_64+0x76/0xbb0
[  109.089577]        entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  109.091665]
               other info that might help us debug this:

[  109.095478] Chain exists of:
                 set->srcu --> &q->q_usage_counter(io) --> &q->elevator_lock

[  109.099544]  Possible unsafe locking scenario:

[  109.101708]        CPU0                    CPU1
[  109.103402]        ----                    ----
[  109.105103]   lock(&q->elevator_lock);
[  109.106530]                                lock(&q->q_usage_counter(io));
[  109.109022]                                lock(&q->elevator_lock);
[  109.111391]   sync(set->srcu);
[  109.112586]
                *** DEADLOCK ***

[  109.114772] 5 locks held by nvme/2734:
[  109.116189]  #0: ffff888101925410 (sb_writers#4){.+.+}-{0:0}, at: ksys_write+0x67/0xe0
[  109.119143]  #1: ffff88817a914e88 (&of->mutex#2){+.+.}-{4:4}, at: kernfs_fop_write_iter+0x10f/0x220
[  109.123141]  #2: ffff8881046313f8 (kn->active#185){++++}-{0:0}, at: sysfs_remove_file_self+0x26/0x50
[  109.126543]  #3: ffff88810470e1d0 (&set->update_nr_hwq_lock){++++}-{4:4}, at: del_gendisk+0x6d/0xa0
[  109.129891]  #4: ffff88813abea198 (&q->elevator_lock){+.+.}-{4:4}, at: elevator_change+0xa8/0x1c0
[  109.133149]
               stack backtrace:
[  109.134817] CPU: 6 UID: 0 PID: 2734 Comm: nvme Tainted: G                 N  6.17.0nvme+ #3 PREEMPT(voluntary)
[  109.134819] Tainted: [N]=TEST
[  109.134820] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[  109.134821] Call Trace:
[  109.134823]  <TASK>
[  109.134824]  dump_stack_lvl+0x75/0xb0
[  109.134828]  print_circular_bug+0x26a/0x330
[  109.134831]  check_noncircular+0x12f/0x150
[  109.134834]  __lock_acquire+0x1468/0x2210
[  109.134837]  ? __synchronize_srcu+0x17/0x170
[  109.134838]  lock_sync+0xa5/0x110
[  109.134840]  ? __synchronize_srcu+0x17/0x170
[  109.134842]  __synchronize_srcu+0x49/0x170
[  109.134843]  ? mark_held_locks+0x49/0x80
[  109.134845]  ? _raw_spin_unlock_irqrestore+0x2d/0x60
[  109.134847]  ? kvm_clock_get_cycles+0x14/0x30
[  109.134853]  ? ktime_get_mono_fast_ns+0x36/0xb0
[  109.134858]  elevator_switch+0xc9/0x330
[  109.134860]  elevator_change+0x128/0x1c0
[  109.134862]  ? kernfs_put.part.0+0x86/0x290
[  109.134864]  elevator_set_none+0x4c/0x90
[  109.134866]  blk_unregister_queue+0xa8/0x110
[  109.134868]  __del_gendisk+0x14e/0x3c0
[  109.134870]  del_gendisk+0x75/0xa0
[  109.134872]  nvme_ns_remove+0xf2/0x230 [nvme_core]
[  109.134879]  nvme_remove_namespaces+0xf2/0x150 [nvme_core]
[  109.134887]  nvme_do_delete_ctrl+0x71/0x90 [nvme_core]
[  109.134893]  nvme_delete_ctrl_sync+0x3b/0x50 [nvme_core]
[  109.134899]  nvme_sysfs_delete+0x34/0x40 [nvme_core]
[  109.134905]  kernfs_fop_write_iter+0x16a/0x220
[  109.134908]  vfs_write+0x378/0x520
[  109.134911]  ksys_write+0x67/0xe0
[  109.134913]  do_syscall_64+0x76/0xbb0
[  109.134915]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  109.134916] RIP: 0033:0x7fd68a737317
[  109.134917] Code: 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 48 89 54 24 18 48 89 74 24
[  109.134919] RSP: 002b:00007ffded1546d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001
[  109.134920] RAX: ffffffffffffffda RBX: 000000000054f7e0 RCX: 00007fd68a737317
[  109.134921] RDX: 0000000000000001 RSI: 00007fd68a855719 RDI: 0000000000000003
[  109.134921] RBP: 0000000000000003 R08: 0000000030407850 R09: 00007fd68a7cd4e0
[  109.134922] R10: 00007fd68a65b130 R11: 0000000000000246 R12: 00007fd68a855719
[  109.134923] R13: 00000000304074c0 R14: 00000000304074c0 R15: 0000000030408660
[  109.134926]  </TASK>
[  109.962756] Key type psk unregistered

Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
---
 drivers/nvme/host/tcp.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 1413788ca7d5..ba23ca465591 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -9,6 +9,7 @@
 #include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/crc32.h>
+#include <linux/file.h>
 #include <linux/nvme-tcp.h>
 #include <linux/nvme-keyring.h>
 #include <net/sock.h>
@@ -1442,8 +1443,8 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 	page_frag_cache_drain(&queue->pf_cache);
 
 	noreclaim_flag = memalloc_noreclaim_save();
-	/* ->sock will be released by fput() */
-	fput(queue->sock->file);
+	/* ->sock will be released by __fput_sync() */
+	__fput_sync(queue->sock->file);
 	queue->sock = NULL;
 	memalloc_noreclaim_restore(noreclaim_flag);
 
@@ -1897,8 +1898,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 err_rcv_pdu:
 	kfree(queue->pdu);
 err_sock:
-	/* ->sock will be released by fput() */
-	fput(queue->sock->file);
+	/* ->sock will be released by __fput_sync() */
+	__fput_sync(queue->sock->file);
 	queue->sock = NULL;
 err_destroy_mutex:
 	mutex_destroy(&queue->send_mutex);
-- 
2.40.0



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* [PATCH BUG FIX 2/2] nvme-multipath: clear BIO_QOS flags on requeue
  2025-11-23 19:18 [PATCH 0/2] nvme: blktests bug fix for 6.19 Chaitanya Kulkarni
  2025-11-23 19:18 ` [PATCH BUG FIX 1/2] nvme-tcp: use __fput_sync() to avoid use-after-free on reset Chaitanya Kulkarni
@ 2025-11-23 19:18 ` Chaitanya Kulkarni
  2025-11-24  6:25   ` Christoph Hellwig
  1 sibling, 1 reply; 7+ messages in thread
From: Chaitanya Kulkarni @ 2025-11-23 19:18 UTC (permalink / raw)
  To: kbusch, axboe, hch, sagi; +Cc: linux-nvme, Chaitanya Kulkarni

When a bio goes through the rq_qos infrastructure on a path's request
queue, it gets BIO_QOS_THROTTLED or BIO_QOS_MERGED flags set. These
flags indicate that rq_qos_done_bio() should be called on completion
to update rq_qos accounting.

During path failover in nvme_failover_req(), the bio's bi_bdev is
redirected from the failed path's disk to the multipath head's disk
via bio_set_dev(). However, the BIO_QOS flags are not cleared.

When the bio eventually completes (either successfully via a new path
or with an error via bio_io_error()), rq_qos_done_bio() checks for
these flags and calls __rq_qos_done_bio(q->rq_qos, bio) where q is
obtained from the bio's current bi_bdev - which is now the multipath
head's queue, not the original path's queue.

The multipath head's queue does not have rq_qos enabled (q->rq_qos is
NULL), but the code assumes that if BIO_QOS_* flags are set, q->rq_qos
must be valid. This assumption is documented in block/blk-rq-qos.h:

  "If a bio has BIO_QOS_xxx set, it implicitly implies that
   q->rq_qos is present."

This breaks when a bio is moved between queues during NVMe multipath
failover, leading to a NULL pointer dereference.

Execution context timeline:

   * =====> dd process context
   [USER] dd process                                
     [SYSCALL] write() - dd process context           
       submit_bio()                              
       nvme_ns_head_submit_bio() - path selection
       blk_mq_submit_bio()  #### QOS FLAGS SET HERE
                                                  
        [USER] dd waits or returns                       
                                                  
          ==== I/O in flight on NVMe hardware =====

   ===== End of submission path ====
   ------------------------------------------------------
   
   * dd ====> Interrupt context;
   [IRQ] NVMe completion interrupt              
       nvme_irq()                                
        nvme_complete_rq()                        
         nvme_failover_req() ### BIO MOVED TO HEAD 
            spin_lock_irqsave (atomic section)    
            bio_set_dev() changes bi_bdev         
            ### BUG: QOS flags NOT cleared          
            kblockd_schedule_work()                   
                                                   
   * Interrupt context =====> kblockd workqueue
   [WQ] kblockd workqueue - kworker process         
       nvme_requeue_work()                       
        submit_bio_noacct()                       
         nvme_ns_head_submit_bio()                 
          nvme_find_path() returns NULL             
           bio_io_error()                            
            bio_endio()                               
             rq_qos_done_bio()  ### CRASH ###
                                                   
   KERNEL PANIC / OOPS       

Crash from blktests nvme/058 (rapid namespace remapping):

[ 1339.636033] BUG: kernel NULL pointer dereference, address: 0000000000000000
[ 1339.641025] nvme nvme4: rescanning namespaces.
[ 1339.642064] #PF: supervisor read access in kernel mode
[ 1339.642067] #PF: error_code(0x0000) - not-present page
[ 1339.642070] PGD 0 P4D 0
[ 1339.642073] Oops: Oops: 0000 [#1] SMP NOPTI
[ 1339.642078] CPU: 35 UID: 0 PID: 4579 Comm: kworker/35:2H
               Tainted: G   O     N  6.17.0-rc3nvme+ #5 PREEMPT(voluntary)
[ 1339.642084] Tainted: [O]=OOT_MODULE, [N]=TEST
[ 1339.673446] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
	       BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
[ 1339.682359] Workqueue: kblockd nvme_requeue_work [nvme_core]
[ 1339.686613] RIP: 0010:__rq_qos_done_bio+0xd/0x40
[ 1339.690161] Code: 75 dd 5b 5d 41 5c c3 cc cc cc cc 66 90 90 90 90 90 90 90
                     90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 55 48 89 f5
		     53 48 89 fb <48> 8b 03 48 8b 40 30 48 85 c0 74 0b 48 89 ee
		     48 89 df ff d0 0f 1f
[ 1339.703691] RSP: 0018:ffffc900066f3c90 EFLAGS: 00010202
[ 1339.706844] RAX: ffff888148b9ef00 RBX: 0000000000000000 RCX: 0000000000000000
[ 1339.711136] RDX: 00000000000001c0 RSI: ffff8882aaab8a80 RDI: 0000000000000000
[ 1339.715691] RBP: ffff8882aaab8a80 R08: 0000000000000000 R09: 0000000000000000
[ 1339.720472] R10: 0000000000000000 R11: fefefefefefefeff R12: ffff8882aa3b6010
[ 1339.724650] R13: 0000000000000000 R14: ffff8882338bcef0 R15: ffff8882aa3b6020
[ 1339.729029] FS:  0000000000000000(0000) GS:ffff88985c0cf000(0000) knlGS:0000000000000000
[ 1339.734525] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 1339.738563] CR2: 0000000000000000 CR3: 0000000111045000 CR4: 0000000000350ef0
[ 1339.742750] DR0: ffffffff845ccbec DR1: ffffffff845ccbed DR2: ffffffff845ccbee
[ 1339.745630] DR3: ffffffff845ccbef DR6: 00000000ffff0ff0 DR7: 0000000000000600
[ 1339.748488] Call Trace:
[ 1339.749512]  <TASK>
[ 1339.750449]  bio_endio+0x71/0x2e0
[ 1339.751833]  nvme_ns_head_submit_bio+0x290/0x320 [nvme_core]
[ 1339.754073]  __submit_bio+0x222/0x5e0
[ 1339.755623]  ? rcu_is_watching+0xd/0x40
[ 1339.757201]  ? submit_bio_noacct_nocheck+0x131/0x370
[ 1339.759210]  submit_bio_noacct_nocheck+0x131/0x370
[ 1339.761189]  ? submit_bio_noacct+0x20/0x620
[ 1339.762849]  nvme_requeue_work+0x4b/0x60 [nvme_core]
[ 1339.764828]  process_one_work+0x20e/0x630
[ 1339.766528]  worker_thread+0x184/0x330
[ 1339.768129]  ? __pfx_worker_thread+0x10/0x10
[ 1339.769942]  kthread+0x10a/0x250
[ 1339.771263]  ? __pfx_kthread+0x10/0x10
[ 1339.772776]  ? __pfx_kthread+0x10/0x10
[ 1339.774381]  ret_from_fork+0x273/0x2e0
[ 1339.775948]  ? __pfx_kthread+0x10/0x10
[ 1339.777504]  ret_from_fork_asm+0x1a/0x30
[ 1339.779163]  </TASK>

Fix this by clearing both BIO_QOS_THROTTLED and BIO_QOS_MERGED flags
when bios are redirected to the multipath head in nvme_failover_req().
This is consistent with the existing code that clears REQ_POLLED and
REQ_NOWAIT flags when the bio changes queues.

Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
---
 drivers/nvme/host/multipath.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 3da980dc60d9..2535dba8ce1e 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -168,6 +168,16 @@ void nvme_failover_req(struct request *req)
 		 * the flag to avoid spurious EAGAIN I/O failures.
 		 */
 		bio->bi_opf &= ~REQ_NOWAIT;
+		/*
+		 * BIO_QOS_THROTTLED and BIO_QOS_MERGED were set when the bio
+		 * went through the path's request queue rq_qos infrastructure.
+		 * The bio is now being redirected to the multipath head's
+		 * queue which may not have rq_qos enabled, so these flags are
+		 * no longer valid and must be cleared to prevent
+		 * rq_qos_done_bio() from dereferencing a NULL q->rq_qos.
+		 */
+		bio_clear_flag(bio, BIO_QOS_THROTTLED);
+		bio_clear_flag(bio, BIO_QOS_MERGED);
 	}
 	blk_steal_bios(&ns->head->requeue_list, req);
 	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
-- 
2.40.0



^ permalink raw reply related	[flat|nested] 7+ messages in thread

* Re: [PATCH BUG FIX 1/2] nvme-tcp: use __fput_sync() to avoid use-after-free on reset
  2025-11-23 19:18 ` [PATCH BUG FIX 1/2] nvme-tcp: use __fput_sync() to avoid use-after-free on reset Chaitanya Kulkarni
@ 2025-11-24  6:24   ` Christoph Hellwig
  0 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2025-11-24  6:24 UTC (permalink / raw)
  To: Chaitanya Kulkarni; +Cc: kbusch, axboe, hch, sagi, linux-nvme

Please add a comment why you need the sync variant.  Otherwise this
looks sane to me.



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH BUG FIX 2/2] nvme-multipath: clear BIO_QOS flags on requeue
  2025-11-23 19:18 ` [PATCH BUG FIX 2/2] nvme-multipath: clear BIO_QOS flags on requeue Chaitanya Kulkarni
@ 2025-11-24  6:25   ` Christoph Hellwig
  2025-11-24  6:45     ` Chaitanya Kulkarni
  0 siblings, 1 reply; 7+ messages in thread
From: Christoph Hellwig @ 2025-11-24  6:25 UTC (permalink / raw)
  To: Chaitanya Kulkarni; +Cc: kbusch, axboe, hch, sagi, linux-nvme

On Sun, Nov 23, 2025 at 11:18:58AM -0800, Chaitanya Kulkarni wrote:
> When a bio goes through the rq_qos infrastructure on a path's request
> queue, it gets BIO_QOS_THROTTLED or BIO_QOS_MERGED flags set. These
> flags indicate that rq_qos_done_bio() should be called on completion
> to update rq_qos accounting.
> 
> During path failover in nvme_failover_req(), the bio's bi_bdev is
> redirected from the failed path's disk to the multipath head's disk
> via bio_set_dev(). However, the BIO_QOS flags are not cleared.
> 
> When the bio eventually completes (either successfully via a new path
> or with an error via bio_io_error()), rq_qos_done_bio() checks for
> these flags and calls __rq_qos_done_bio(q->rq_qos, bio) where q is
> obtained from the bio's current bi_bdev - which is now the multipath
> head's queue, not the original path's queue.
> 
> The multipath head's queue does not have rq_qos enabled (q->rq_qos is
> NULL), but the code assumes that if BIO_QOS_* flags are set, q->rq_qos
> must be valid. This assumption is documented in block/blk-rq-qos.h:
> 
>   "If a bio has BIO_QOS_xxx set, it implicitly implies that
>    q->rq_qos is present."
> 
> This breaks when a bio is moved between queues during NVMe multipath
> failover, leading to a NULL pointer dereference.
> 
> Execution Context timeline :-
> 
>    * =====> dd process context
>    [USER] dd process                                
>      [SYSCALL] write() - dd process context           
>        submit_bio()                              
>        nvme_ns_head_submit_bio() - path selection
>        blk_mq_submit_bio()  #### QOS FLAGS SET HERE
>                                                   
>         [USER] dd waits or returns                       
>                                                   
>           ==== I/O in flight on NVMe hardware =====
> 
>    ===== End of submission path ====
>    ------------------------------------------------------
>    
>    * dd ====> Interrupt context;
>    [IRQ] NVMe completion interrupt              
>        nvme_irq()                                
>         nvme_complete_rq()                        
>          nvme_failover_req() ### BIO MOVED TO HEAD 
>             spin_lock_irqsave (atomic section)    
>             bio_set_dev() changes bi_bdev         
>             ### BUG: QOS flags NOT cleared          
>             kblockd_schedule_work()                   
>                                                    
>    * Interrupt context =====> kblockd workqueue
>    [WQ] kblockd workqueue - kworker process         
>        nvme_requeue_work()                       
>         submit_bio_noacct()                       
>          nvme_ns_head_submit_bio()                 
>           nvme_find_path() returns NULL             
>            bio_io_error()                            
>             bio_endio()                               
>              rq_qos_done_bio()  ### CRASH ###
>                                                    
>    KERNEL PANIC / OOPS       
> 
> Crash from blktests nvme/058 (rapid namespace remapping):
> 
> [ 1339.636033] BUG: kernel NULL pointer dereference, address: 0000000000000000
> [ 1339.641025] nvme nvme4: rescanning namespaces.
> [ 1339.642064] #PF: supervisor read access in kernel mode
> [ 1339.642067] #PF: error_code(0x0000) - not-present page
> [ 1339.642070] PGD 0 P4D 0
> [ 1339.642073] Oops: Oops: 0000 [#1] SMP NOPTI
> [ 1339.642078] CPU: 35 UID: 0 PID: 4579 Comm: kworker/35:2H
>                Tainted: G   O     N  6.17.0-rc3nvme+ #5 PREEMPT(voluntary)
> [ 1339.642084] Tainted: [O]=OOT_MODULE, [N]=TEST
> [ 1339.673446] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
> 	       BIOS rel-1.16.3-0-ga6ed6b701f0a-prebuilt.qemu.org 04/01/2014
> [ 1339.682359] Workqueue: kblockd nvme_requeue_work [nvme_core]
> [ 1339.686613] RIP: 0010:__rq_qos_done_bio+0xd/0x40
> [ 1339.690161] Code: 75 dd 5b 5d 41 5c c3 cc cc cc cc 66 90 90 90 90 90 90 90
>                      90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 55 48 89 f5
> 		     53 48 89 fb <48> 8b 03 48 8b 40 30 48 85 c0 74 0b 48 89 ee
> 		     48 89 df ff d0 0f 1f
> [ 1339.703691] RSP: 0018:ffffc900066f3c90 EFLAGS: 00010202
> [ 1339.706844] RAX: ffff888148b9ef00 RBX: 0000000000000000 RCX: 0000000000000000
> [ 1339.711136] RDX: 00000000000001c0 RSI: ffff8882aaab8a80 RDI: 0000000000000000
> [ 1339.715691] RBP: ffff8882aaab8a80 R08: 0000000000000000 R09: 0000000000000000
> [ 1339.720472] R10: 0000000000000000 R11: fefefefefefefeff R12: ffff8882aa3b6010
> [ 1339.724650] R13: 0000000000000000 R14: ffff8882338bcef0 R15: ffff8882aa3b6020
> [ 1339.729029] FS:  0000000000000000(0000) GS:ffff88985c0cf000(0000) knlGS:0000000000000000
> [ 1339.734525] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [ 1339.738563] CR2: 0000000000000000 CR3: 0000000111045000 CR4: 0000000000350ef0
> [ 1339.742750] DR0: ffffffff845ccbec DR1: ffffffff845ccbed DR2: ffffffff845ccbee
> [ 1339.745630] DR3: ffffffff845ccbef DR6: 00000000ffff0ff0 DR7: 0000000000000600
> [ 1339.748488] Call Trace:
> [ 1339.749512]  <TASK>
> [ 1339.750449]  bio_endio+0x71/0x2e0
> [ 1339.751833]  nvme_ns_head_submit_bio+0x290/0x320 [nvme_core]
> [ 1339.754073]  __submit_bio+0x222/0x5e0
> [ 1339.755623]  ? rcu_is_watching+0xd/0x40
> [ 1339.757201]  ? submit_bio_noacct_nocheck+0x131/0x370
> [ 1339.759210]  submit_bio_noacct_nocheck+0x131/0x370
> [ 1339.761189]  ? submit_bio_noacct+0x20/0x620
> [ 1339.762849]  nvme_requeue_work+0x4b/0x60 [nvme_core]
> [ 1339.764828]  process_one_work+0x20e/0x630
> [ 1339.766528]  worker_thread+0x184/0x330
> [ 1339.768129]  ? __pfx_worker_thread+0x10/0x10
> [ 1339.769942]  kthread+0x10a/0x250
> [ 1339.771263]  ? __pfx_kthread+0x10/0x10
> [ 1339.772776]  ? __pfx_kthread+0x10/0x10
> [ 1339.774381]  ret_from_fork+0x273/0x2e0
> [ 1339.775948]  ? __pfx_kthread+0x10/0x10
> [ 1339.777504]  ret_from_fork_asm+0x1a/0x30
> [ 1339.779163]  </TASK>
> 
> Fix this by clearing both BIO_QOS_THROTTLED and BIO_QOS_MERGED flags
> when bios are redirected to the multipath head in nvme_failover_req().
> This is consistent with the existing code that clears REQ_POLLED and
> REQ_NOWAIT flags when the bio changes queues.
> 
> Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
> ---
>  drivers/nvme/host/multipath.c | 10 ++++++++++
>  1 file changed, 10 insertions(+)
> 
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 3da980dc60d9..2535dba8ce1e 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -168,6 +168,16 @@ void nvme_failover_req(struct request *req)
>  		 * the flag to avoid spurious EAGAIN I/O failures.
>  		 */
>  		bio->bi_opf &= ~REQ_NOWAIT;
> +		/*
> +		 * BIO_QOS_THROTTLED and BIO_QOS_MERGED were set when the bio
> +		 * went through the path's request queue rq_qos infrastructure.
> +		 * The bio is now being redirected to the multipath head's
> +		 * queue which may not have rq_qos enabled, so these flags are
> +		 * no longer valid and must be cleared to prevent
> +		 * rq_qos_done_bio() from dereferencing a NULL q->rq_qos.
> +		 */
> +		bio_clear_flag(bio, BIO_QOS_THROTTLED);
> +		bio_clear_flag(bio, BIO_QOS_MERGED);

This really should go into blk_steal_bios instead.  As should be the
existing nowait/polled fixups..



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH BUG FIX 2/2] nvme-multipath: clear BIO_QOS flags on requeue
  2025-11-24  6:25   ` Christoph Hellwig
@ 2025-11-24  6:45     ` Chaitanya Kulkarni
  2025-11-24  7:01       ` Christoph Hellwig
  0 siblings, 1 reply; 7+ messages in thread
From: Chaitanya Kulkarni @ 2025-11-24  6:45 UTC (permalink / raw)
  To: Christoph Hellwig, Chaitanya Kulkarni
  Cc: kbusch@kernel.org, axboe@kernel.dk, sagi@grimberg.me,
	linux-nvme@lists.infradead.org

>> Fix this by clearing both BIO_QOS_THROTTLED and BIO_QOS_MERGED flags
>> when bios are redirected to the multipath head in nvme_failover_req().
>> This is consistent with the existing code that clears REQ_POLLED and
>> REQ_NOWAIT flags when the bio changes queues.
>>
>> Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
>> ---
>>   drivers/nvme/host/multipath.c | 10 ++++++++++
>>   1 file changed, 10 insertions(+)
>>
>> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
>> index 3da980dc60d9..2535dba8ce1e 100644
>> --- a/drivers/nvme/host/multipath.c
>> +++ b/drivers/nvme/host/multipath.c
>> @@ -168,6 +168,16 @@ void nvme_failover_req(struct request *req)
>>   		 * the flag to avoid spurious EAGAIN I/O failures.
>>   		 */
>>   		bio->bi_opf &= ~REQ_NOWAIT;
>> +		/*
>> +		 * BIO_QOS_THROTTLED and BIO_QOS_MERGED were set when the bio
>> +		 * went through the path's request queue rq_qos infrastructure.
>> +		 * The bio is now being redirected to the multipath head's
>> +		 * queue which may not have rq_qos enabled, so these flags are
>> +		 * no longer valid and must be cleared to prevent
>> +		 * rq_qos_done_bio() from dereferencing a NULL q->rq_qos.
>> +		 */
>> +		bio_clear_flag(bio, BIO_QOS_THROTTLED);
>> +		bio_clear_flag(bio, BIO_QOS_MERGED);
> This really should go into blk_steal_bios instead.  As should be the
> existing nowait/polled fixups..
>
>
even better, should I send a prep patch to move existing nowait/polled
and second patch for QOS THROTTLED and MERGED that fixes the bug ?

or just single patch is fine ?

-ck



^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH BUG FIX 2/2] nvme-multipath: clear BIO_QOS flags on requeue
  2025-11-24  6:45     ` Chaitanya Kulkarni
@ 2025-11-24  7:01       ` Christoph Hellwig
  0 siblings, 0 replies; 7+ messages in thread
From: Christoph Hellwig @ 2025-11-24  7:01 UTC (permalink / raw)
  To: Chaitanya Kulkarni
  Cc: Christoph Hellwig, Chaitanya Kulkarni, kbusch@kernel.org,
	axboe@kernel.dk, sagi@grimberg.me, linux-nvme@lists.infradead.org

On Mon, Nov 24, 2025 at 06:45:58AM +0000, Chaitanya Kulkarni wrote:
> >> Fix this by clearing both BIO_QOS_THROTTLED and BIO_QOS_MERGED flags
> >> when bios are redirected to the multipath head in nvme_failover_req().
> >> This is consistent with the existing code that clears REQ_POLLED and
> >> REQ_NOWAIT flags when the bio changes queues.
> >>
> >> Signed-off-by: Chaitanya Kulkarni <ckulkarnilinux@gmail.com>
> >> ---
> >>   drivers/nvme/host/multipath.c | 10 ++++++++++
> >>   1 file changed, 10 insertions(+)
> >>
> >> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> >> index 3da980dc60d9..2535dba8ce1e 100644
> >> --- a/drivers/nvme/host/multipath.c
> >> +++ b/drivers/nvme/host/multipath.c
> >> @@ -168,6 +168,16 @@ void nvme_failover_req(struct request *req)
> >>   		 * the flag to avoid spurious EAGAIN I/O failures.
> >>   		 */
> >>   		bio->bi_opf &= ~REQ_NOWAIT;
> >> +		/*
> >> +		 * BIO_QOS_THROTTLED and BIO_QOS_MERGED were set when the bio
> >> +		 * went through the path's request queue rq_qos infrastructure.
> >> +		 * The bio is now being redirected to the multipath head's
> >> +		 * queue which may not have rq_qos enabled, so these flags are
> >> +		 * no longer valid and must be cleared to prevent
> >> +		 * rq_qos_done_bio() from dereferencing a NULL q->rq_qos.
> >> +		 */
> >> +		bio_clear_flag(bio, BIO_QOS_THROTTLED);
> >> +		bio_clear_flag(bio, BIO_QOS_MERGED);
> > This really should go into blk_steal_bios instead.  As should be the
> > existing nowait/polled fixups..
> >
> >
> even better, should I send a prep patch to move existing nowait/polled
> and second patch for QOS THROTTLED and MERGED that fixes the bug ?

Yes, we can probably move all of them including your new clear
in a separate patch (and wait until the next merge window).


^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2025-11-24  7:02 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-23 19:18 [PATCH 0/2] nvme: blktests bug fix for 6.19 Chaitanya Kulkarni
2025-11-23 19:18 ` [PATCH BUG FIX 1/2] nvme-tcp: use __fput_sync() to avoid use-after-free on reset Chaitanya Kulkarni
2025-11-24  6:24   ` Christoph Hellwig
2025-11-23 19:18 ` [PATCH BUG FIX 2/2] nvme-multipath: clear BIO_QOS flags on requeue Chaitanya Kulkarni
2025-11-24  6:25   ` Christoph Hellwig
2025-11-24  6:45     ` Chaitanya Kulkarni
2025-11-24  7:01       ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox