Linux block layer
 help / color / mirror / Atom feed
* [PATCH] block/partitions/of: Fix of_node reference leak in of_partition()
From: Wentao Liang @ 2026-06-25  9:36 UTC (permalink / raw)
  To: axboe; +Cc: kees, objecting, vulab, linux-block, linux-kernel, stable

of_partition() calls of_node_get(ddev->of_node) at entry to take a
reference on the device node, but only releases it on the validation
error path. On three other exit paths the reference is leaked:

  - When the device node is not compatible with "fixed-partitions",
    the function returns 0 without calling of_node_put().
  - On normal success (return 1), partitions_np is never released.
  - When the partition slot limit is reached in the second child
    loop (break followed by return 1), partitions_np is also leaked.

Fix by splitting the NULL check from the compatible check so that
of_node_put() can be called before the early return on incompatibility,
and add of_node_put(partitions_np) before the final return 1 to cover
both the normal and the break paths.

The NULL case is safe because of_node_get(NULL) is a no-op.

Cc: stable@vger.kernel.org
Fixes: 2e3a191e89f9 ("block: add support for partition table defined in OF")
Signed-off-by: Wentao Liang <vulab@iscas.ac.cn>
---
 block/partitions/of.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/block/partitions/of.c b/block/partitions/of.c
index c22b60661098..afaaae5e72a1 100644
--- a/block/partitions/of.c
+++ b/block/partitions/of.c
@@ -73,9 +73,12 @@ int of_partition(struct parsed_partitions *state)
 
 	struct device_node *partitions_np = of_node_get(ddev->of_node);
 
-	if (!partitions_np ||
-	    !of_device_is_compatible(partitions_np, "fixed-partitions"))
+	if (!partitions_np)
 		return 0;
+	if (!of_device_is_compatible(partitions_np, "fixed-partitions")) {
+		of_node_put(partitions_np);
+		return 0;
+	}
 
 	slot = 1;
 	/* Validate parition offset and size */
@@ -104,5 +107,7 @@ int of_partition(struct parsed_partitions *state)
 
 	seq_buf_puts(&state->pp_buf, "\n");
 
+	of_node_put(partitions_np);
+
 	return 1;
 }
-- 
2.39.5 (Apple Git-154)


^ permalink raw reply related

* [PATCH] block: Fix dio->ref leak on integrity error in __blkdev_direct_IO()
From: Wentao Liang @ 2026-06-25  9:21 UTC (permalink / raw)
  To: axboe; +Cc: linux-block, linux-kernel, Wentao Liang, stable

When __blkdev_direct_IO() splits an I/O across multiple bios and
bio_integrity_map_iter() fails on a non-first bio, the function jumps
to the 'fail' label which frees the current bio without accounting for
the dio->ref increments from previously submitted bios.

The in-flight bios complete normally but their atomic_dec_and_test()
in blkdev_bio_end_io() can never bring dio->ref to zero, leaving the
dio structure (and the first bio it is embedded in) permanently leaked.
For synchronous I/O, the waiter is never woken, causing a hang.

Fix by matching the existing error handling pattern used for
blkdev_iov_iter_get_pages() failure: end the current bio with an error
status and break out of the submission loop. This ensures the normal
completion path properly accounts for all dio->ref references, both
from the current errored bio and from any in-flight bios.

The NOWAIT error path is unaffected as its goto fail only triggers
on the first iteration where no bios have been submitted yet.

Cc: stable@vger.kernel.org
Fixes: 3d8b5a22d404 ("block: add support to pass user meta buffer")
Signed-off-by: Wentao Liang <vulab@iscas.ac.cn>
---
 block/fops.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/block/fops.c b/block/fops.c
index bb6642b45937..9f16b995c60c 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -239,8 +239,11 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		}
 		if (iocb->ki_flags & IOCB_HAS_METADATA) {
 			ret = bio_integrity_map_iter(bio, iocb->private);
-			if (unlikely(ret))
-				goto fail;
+			if (unlikely(ret)) {
+				bio->bi_status = errno_to_blk_status(ret);
+				bio_endio(bio);
+				break;
+			}
 		}
 
 		if (is_read) {
-- 
2.39.5 (Apple Git-154)


^ permalink raw reply related

* [PATCH v2 3/5] nbd: remove redundant num_connections boundary checks
From: Yang Erkun @ 2026-06-25  8:44 UTC (permalink / raw)
  To: josef, axboe, hch, yukuai
  Cc: yi.zhang, chengzhihao1, echo.chenlin, leo.lilong, wangkefeng.wang,
	linux-block, nbd
In-Reply-To: <20260625084458.4171890-1-yangerkun@huawei.com>

From: Long Li <leo.lilong@huawei.com>

Now that config->socks uses xarray instead of a plain array, explicit
bounds checking against num_connections is no longer necessary.
xa_load() returns NULL for any out-of-range or missing index, and
xa_for_each() is a no-op on an empty xarray, making these guards
redundant.

Signed-off-by: Long Li <leo.lilong@huawei.com>
---
 drivers/block/nbd.c | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index d88bdc97f4d1..409a655c40f0 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1094,7 +1094,7 @@ static int find_fallback(struct nbd_device *nbd, int index)
 		goto no_fallback;
 
 	fallback = nsock->fallback_index;
-	if (fallback >= 0 && fallback < config->num_connections) {
+	if (fallback >= 0) {
 		fallback_nsock = xa_load(&config->socks, fallback);
 		if (fallback_nsock && !fallback_nsock->dead)
 			return fallback;
@@ -1149,12 +1149,6 @@ static blk_status_t nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 		return BLK_STS_IOERR;
 	}
 
-	if (index >= config->num_connections) {
-		dev_err_ratelimited(disk_to_dev(nbd->disk),
-				    "Attempted send on invalid socket\n");
-		nbd_config_put(nbd);
-		return BLK_STS_IOERR;
-	}
 	cmd->status = BLK_STS_OK;
 again:
 	nsock = xa_load(&config->socks, index);
@@ -1512,11 +1506,9 @@ static void nbd_config_put(struct nbd_device *nbd)
 		}
 		nbd_clear_sock(nbd);
 
-		if (config->num_connections) {
-			xa_for_each(&config->socks, i, nsock) {
-				sockfd_put(nsock->sock);
-				kfree(nsock);
-			}
+		xa_for_each(&config->socks, i, nsock) {
+			sockfd_put(nsock->sock);
+			kfree(nsock);
 		}
 		xa_destroy(&config->socks);
 
-- 
2.52.0


^ permalink raw reply related

* [PATCH v2 5/5] nbd: set nr_hw_queues at device creation to skip queue freeze
From: Yang Erkun @ 2026-06-25  8:44 UTC (permalink / raw)
  To: josef, axboe, hch, yukuai
  Cc: yi.zhang, chengzhihao1, echo.chenlin, leo.lilong, wangkefeng.wang,
	linux-block, nbd
In-Reply-To: <20260625084458.4171890-1-yangerkun@huawei.com>

The preceding patches in this series removed the blk_mq_freeze_queue()
call from nbd_add_socket(), eliminating the freeze/unfreeze overhead
during socket insertion.  However, nbd_start_device() still calls
blk_mq_update_nr_hw_queues() when the hardware queue count differs
from the actual number of connections, which introduce freeze too.

There are two reasons nr_hw_queues may not match num_connections:

1. Reusing an existing nbd device (e.g. one pre-created at module
   load which nr_hw_queues always set as 1) that was originally
   configured with a different connection count.  This case genuinely
   requires blk_mq_update_nr_hw_queues() to adjust the hardware
   queue count.

2. Creating a new nbd device via the netlink connect path
   (nbd_genl_connect), where we know the exact number of connections
   upfront from the NBD_ATTR_SOCKETS attribute.  In this case, there
   is no need to default to nr_hw_queues=1 and then update.

This patch optimizes case 2 by setting nr_hw_queues correctly at
device creation time, so that nbd_start_device() can skip
blk_mq_update_nr_hw_queues() entirely when the count already matches.
Two changes are made:

1. Add a nr_hw_queues parameter to nbd_dev_add() so callers can
   specify the desired queue count instead of the hardcoded 1.

2. Add nbd_genl_count_sockets() to count socket FDs from the netlink
   NBD_ATTR_SOCKETS attribute before the device is created, and pass
   the count as nr_hw_queues when creating a new nbd device via
   netlink.

The ioctl path (NBD_SET_SOCK + NBD_DO_IT) remains fully functional:
pre-created devices with nbds_max>0 default to nr_hw_queues=1, and
nbd_start_device() still calls blk_mq_update_nr_hw_queues() when
needed.

Signed-off-by: Yang Erkun <yangerkun@huawei.com>
---
 drivers/block/nbd.c | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 953146c85f17..2b6f896037ad 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1952,7 +1952,8 @@ static const struct blk_mq_ops nbd_mq_ops = {
 	.timeout	= nbd_xmit_timeout,
 };
 
-static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
+static struct nbd_device *nbd_dev_add(int index, unsigned int refs,
+				       int nr_hw_queues)
 {
 	struct queue_limits lim = {
 		.max_hw_sectors		= 65536,
@@ -1969,7 +1970,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
 		goto out;
 
 	nbd->tag_set.ops = &nbd_mq_ops;
-	nbd->tag_set.nr_hw_queues = 1;
+	nbd->tag_set.nr_hw_queues = nr_hw_queues;
 	nbd->tag_set.queue_depth = 128;
 	nbd->tag_set.numa_node = NUMA_NO_NODE;
 	nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
@@ -2092,6 +2093,35 @@ static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
 	[NBD_SOCK_FD]			=	{ .type = NLA_U32 },
 };
 
+/*
+ * Count the number of socket FDs in the NBD_ATTR_SOCKETS netlink attribute.
+ * This is used to determine the correct nr_hw_queues before creating the
+ * nbd device, so that blk_mq_update_nr_hw_queues (and its RCU grace period
+ * overhead) can be avoided entirely.
+ */
+static int nbd_genl_count_sockets(struct genl_info *info)
+{
+	struct nlattr *attr;
+	int rem, count = 0;
+
+	if (!info->attrs[NBD_ATTR_SOCKETS])
+		return 0;
+
+	nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS], rem) {
+		struct nlattr *socks[NBD_SOCK_MAX + 1];
+
+		if (nla_type(attr) != NBD_SOCK_ITEM)
+			continue;
+		if (nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
+						  attr, nbd_sock_policy,
+						  info->extack) != 0)
+			continue;
+		if (socks[NBD_SOCK_FD])
+			count++;
+	}
+	return count;
+}
+
 /* We don't use this right now since we don't parse the incoming list, but we
  * still want it here so userspace knows what to expect.
  */
@@ -2123,6 +2153,7 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
 	struct nbd_device *nbd;
 	struct nbd_config *config;
 	int index = -1;
+	int num_connections = nbd_genl_count_sockets(info);
 	int ret;
 	bool put_dev = false;
 
@@ -2170,7 +2201,7 @@ static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
 	mutex_unlock(&nbd_index_mutex);
 
 	if (!nbd) {
-		nbd = nbd_dev_add(index, 2);
+		nbd = nbd_dev_add(index, 2, num_connections);
 		if (IS_ERR(nbd)) {
 			pr_err("failed to add new device\n");
 			return PTR_ERR(nbd);
@@ -2737,7 +2768,7 @@ static int __init nbd_init(void)
 	nbd_dbg_init();
 
 	for (i = 0; i < nbds_max; i++)
-		nbd_dev_add(i, 1);
+		nbd_dev_add(i, 1, 1);
 	return 0;
 }
 
-- 
2.52.0


^ permalink raw reply related

* [PATCH v2 1/5] nbd: simplify find_fallback() by removing redundant logic
From: Yang Erkun @ 2026-06-25  8:44 UTC (permalink / raw)
  To: josef, axboe, hch, yukuai
  Cc: yi.zhang, chengzhihao1, echo.chenlin, leo.lilong, wangkefeng.wang,
	linux-block, nbd
In-Reply-To: <20260625084458.4171890-1-yangerkun@huawei.com>

From: Long Li <leo.lilong@huawei.com>

The second conditional checking nsock->fallback_index validity is the
logical inverse of the first, so drop it and let execution fall through
naturally. Consolidate the two identical dev_err_ratelimited() + return
paths into a single no_fallback label to reduce duplication.

Signed-off-by: Long Li <leo.lilong@huawei.com>
---
 drivers/block/nbd.c | 37 ++++++++++++++-----------------------
 1 file changed, 14 insertions(+), 23 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 3a585a0c882a..dcba3042862a 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1061,40 +1061,31 @@ static int find_fallback(struct nbd_device *nbd, int index)
 	int new_index = -1;
 	struct nbd_sock *nsock = config->socks[index];
 	int fallback = nsock->fallback_index;
+	int i;
 
 	if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 		return new_index;
 
-	if (config->num_connections <= 1) {
-		dev_err_ratelimited(disk_to_dev(nbd->disk),
-				    "Dead connection, failed to find a fallback\n");
-		return new_index;
-	}
+	if (config->num_connections <= 1)
+		goto no_fallback;
 
 	if (fallback >= 0 && fallback < config->num_connections &&
 	    !config->socks[fallback]->dead)
 		return fallback;
 
-	if (nsock->fallback_index < 0 ||
-	    nsock->fallback_index >= config->num_connections ||
-	    config->socks[nsock->fallback_index]->dead) {
-		int i;
-		for (i = 0; i < config->num_connections; i++) {
-			if (i == index)
-				continue;
-			if (!config->socks[i]->dead) {
-				new_index = i;
-				break;
-			}
-		}
-		nsock->fallback_index = new_index;
-		if (new_index < 0) {
-			dev_err_ratelimited(disk_to_dev(nbd->disk),
-					    "Dead connection, failed to find a fallback\n");
-			return new_index;
+	for (i = 0; i < config->num_connections; i++) {
+		if (i != index && !config->socks[i]->dead) {
+			new_index = i;
+			break;
 		}
 	}
-	new_index = nsock->fallback_index;
+	nsock->fallback_index = new_index;
+	if (new_index >= 0)
+		return new_index;
+
+no_fallback:
+	dev_err_ratelimited(disk_to_dev(nbd->disk),
+			    "Dead connection, failed to find a fallback\n");
 	return new_index;
 }
 
-- 
2.52.0


^ permalink raw reply related

* [PATCH v2 2/5] nbd: replace socks pointer array with xarray
From: Yang Erkun @ 2026-06-25  8:44 UTC (permalink / raw)
  To: josef, axboe, hch, yukuai
  Cc: yi.zhang, chengzhihao1, echo.chenlin, leo.lilong, wangkefeng.wang,
	linux-block, nbd
In-Reply-To: <20260625084458.4171890-1-yangerkun@huawei.com>

From: Long Li <leo.lilong@huawei.com>

Replace the krealloc-based struct nbd_sock **socks array with struct
xarray socks. Each nbd sock is fully initialized before being stored
into the xarray via xa_store(), ensuring concurrent readers calling
xa_load() never observe a partially initialized socket.

Convert all array index accesses to xa_load() and open-coded for-loops
to xa_for_each().

Signed-off-by: Long Li <leo.lilong@huawei.com>
---
 drivers/block/nbd.c | 155 +++++++++++++++++++++++++++-----------------
 1 file changed, 96 insertions(+), 59 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index dcba3042862a..d88bdc97f4d1 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -38,6 +38,7 @@
 #include <linux/types.h>
 #include <linux/debugfs.h>
 #include <linux/blk-mq.h>
+#include <linux/xarray.h>
 
 #include <linux/uaccess.h>
 #include <asm/types.h>
@@ -94,7 +95,7 @@ struct nbd_config {
 	unsigned long runtime_flags;
 	u64 dead_conn_timeout;
 
-	struct nbd_sock **socks;
+	struct xarray socks;
 	int num_connections;
 	atomic_t live_connections;
 	wait_queue_head_t conn_wait;
@@ -398,15 +399,15 @@ static void nbd_complete_rq(struct request *req)
 static void sock_shutdown(struct nbd_device *nbd)
 {
 	struct nbd_config *config = nbd->config;
-	int i;
+	struct nbd_sock *nsock;
+	unsigned long i;
 
 	if (config->num_connections == 0)
 		return;
 	if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 		return;
 
-	for (i = 0; i < config->num_connections; i++) {
-		struct nbd_sock *nsock = config->socks[i];
+	xa_for_each(&config->socks, i, nsock) {
 		mutex_lock(&nsock->tx_lock);
 		nbd_mark_nsock_dead(nbd, nsock, 0);
 		mutex_unlock(&nsock->tx_lock);
@@ -453,6 +454,7 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
 	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
 	struct nbd_device *nbd = cmd->nbd;
 	struct nbd_config *config;
+	struct nbd_sock *nsock;
 
 	if (!mutex_trylock(&cmd->lock))
 		return BLK_EH_RESET_TIMER;
@@ -488,10 +490,9 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
 		 * connection is configured, the submit path will wait util
 		 * a new connection is reconfigured or util dead timeout.
 		 */
-		if (config->socks) {
-			if (cmd->index < config->num_connections) {
-				struct nbd_sock *nsock =
-					config->socks[cmd->index];
+		if (!xa_empty(&config->socks)) {
+			nsock = xa_load(&config->socks, cmd->index);
+			if (nsock) {
 				mutex_lock(&nsock->tx_lock);
 				/* We can have multiple outstanding requests, so
 				 * we don't want to mark the nsock dead if we've
@@ -515,22 +516,24 @@ static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
 		 * Userspace sets timeout=0 to disable socket disconnection,
 		 * so just warn and reset the timer.
 		 */
-		struct nbd_sock *nsock = config->socks[cmd->index];
 		cmd->retries++;
 		dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
 			req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
 			(unsigned long long)blk_rq_pos(req) << 9,
 			blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);
 
-		mutex_lock(&nsock->tx_lock);
-		if (cmd->cookie != nsock->cookie) {
-			nbd_requeue_cmd(cmd);
+		nsock = xa_load(&config->socks, cmd->index);
+		if (nsock) {
+			mutex_lock(&nsock->tx_lock);
+			if (cmd->cookie != nsock->cookie) {
+				nbd_requeue_cmd(cmd);
+				mutex_unlock(&nsock->tx_lock);
+				mutex_unlock(&cmd->lock);
+				nbd_config_put(nbd);
+				return BLK_EH_DONE;
+			}
 			mutex_unlock(&nsock->tx_lock);
-			mutex_unlock(&cmd->lock);
-			nbd_config_put(nbd);
-			return BLK_EH_DONE;
 		}
-		mutex_unlock(&nsock->tx_lock);
 		mutex_unlock(&cmd->lock);
 		nbd_config_put(nbd);
 		return BLK_EH_RESET_TIMER;
@@ -600,8 +603,16 @@ static int sock_xmit(struct nbd_device *nbd, int index, int send,
 		     struct iov_iter *iter, int msg_flags, int *sent)
 {
 	struct nbd_config *config = nbd->config;
-	struct socket *sock = config->socks[index]->sock;
+	struct nbd_sock *nsock;
+	struct socket *sock;
 
+	nsock = xa_load(&config->socks, index);
+	if (unlikely(!nsock)) {
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Attempted xmit on invalid socket\n");
+		return -EINVAL;
+	}
+	sock = nsock->sock;
 	return __sock_xmit(nbd, sock, send, iter, msg_flags, sent);
 }
 
@@ -647,7 +658,7 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
 {
 	struct request *req = blk_mq_rq_from_pdu(cmd);
 	struct nbd_config *config = nbd->config;
-	struct nbd_sock *nsock = config->socks[index];
+	struct nbd_sock *nsock;
 	int result;
 	struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
 	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
@@ -656,7 +667,14 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
 	u64 handle;
 	u32 type;
 	u32 nbd_cmd_flags = 0;
-	int sent = nsock->sent, skip = 0;
+	int sent, skip = 0;
+
+	nsock = xa_load(&config->socks, index);
+	if (unlikely(!nsock)) {
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Attempted send on invalid socket\n");
+		return BLK_STS_IOERR;
+	}
 
 	lockdep_assert_held(&cmd->lock);
 	lockdep_assert_held(&nsock->tx_lock);
@@ -683,6 +701,7 @@ static blk_status_t nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd,
 	 * request struct, so just go and send the rest of the pages in the
 	 * request.
 	 */
+	sent = nsock->sent;
 	if (sent) {
 		if (sent >= sizeof(request)) {
 			skip = sent - sizeof(request);
@@ -1059,9 +1078,10 @@ static int find_fallback(struct nbd_device *nbd, int index)
 {
 	struct nbd_config *config = nbd->config;
 	int new_index = -1;
-	struct nbd_sock *nsock = config->socks[index];
-	int fallback = nsock->fallback_index;
-	int i;
+	struct nbd_sock *nsock;
+	struct nbd_sock *fallback_nsock;
+	unsigned long i;
+	int fallback;
 
 	if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
 		return new_index;
@@ -1069,12 +1089,19 @@ static int find_fallback(struct nbd_device *nbd, int index)
 	if (config->num_connections <= 1)
 		goto no_fallback;
 
-	if (fallback >= 0 && fallback < config->num_connections &&
-	    !config->socks[fallback]->dead)
-		return fallback;
+	nsock = xa_load(&config->socks, index);
+	if (unlikely(!nsock))
+		goto no_fallback;
+
+	fallback = nsock->fallback_index;
+	if (fallback >= 0 && fallback < config->num_connections) {
+		fallback_nsock = xa_load(&config->socks, fallback);
+		if (fallback_nsock && !fallback_nsock->dead)
+			return fallback;
+	}
 
-	for (i = 0; i < config->num_connections; i++) {
-		if (i != index && !config->socks[i]->dead) {
+	xa_for_each(&config->socks, i, fallback_nsock) {
+		if (i != index && !fallback_nsock->dead) {
 			new_index = i;
 			break;
 		}
@@ -1130,7 +1157,14 @@ static blk_status_t nbd_handle_cmd(struct nbd_cmd *cmd, int index)
 	}
 	cmd->status = BLK_STS_OK;
 again:
-	nsock = config->socks[index];
+	nsock = xa_load(&config->socks, index);
+	if (unlikely(!nsock)) {
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Attempted send on invalid socket\n");
+		nbd_config_put(nbd);
+		return BLK_STS_IOERR;
+	}
+
 	mutex_lock(&nsock->tx_lock);
 	if (nsock->dead) {
 		int old_index = index;
@@ -1270,9 +1304,9 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 {
 	struct nbd_config *config = nbd->config;
 	struct socket *sock;
-	struct nbd_sock **socks;
 	struct nbd_sock *nsock;
 	unsigned int memflags;
+	unsigned int index;
 	int err;
 
 	/* Arg will be cast to int, check it to avoid overflow */
@@ -1308,16 +1342,6 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 		goto put_socket;
 	}
 
-	socks = krealloc(config->socks, (config->num_connections + 1) *
-			 sizeof(struct nbd_sock *), GFP_KERNEL);
-	if (!socks) {
-		kfree(nsock);
-		err = -ENOMEM;
-		goto put_socket;
-	}
-
-	config->socks = socks;
-
 	nsock->fallback_index = -1;
 	nsock->dead = false;
 	mutex_init(&nsock->tx_lock);
@@ -1326,7 +1350,14 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 	nsock->sent = 0;
 	nsock->cookie = 0;
 	INIT_WORK(&nsock->work, nbd_pending_cmd_work);
-	socks[config->num_connections++] = nsock;
+
+	err = xa_alloc(&config->socks, &index, nsock, xa_limit_32b, GFP_KERNEL);
+	if (err < 0) {
+		kfree(nsock);
+		goto put_socket;
+	}
+
+	config->num_connections++;
 	atomic_inc(&config->live_connections);
 	blk_mq_unfreeze_queue(nbd->disk->queue, memflags);
 
@@ -1343,7 +1374,8 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
 	struct nbd_config *config = nbd->config;
 	struct socket *sock, *old;
 	struct recv_thread_args *args;
-	int i;
+	struct nbd_sock *nsock;
+	unsigned long i;
 	int err;
 
 	sock = nbd_get_socket(nbd, arg, &err);
@@ -1356,9 +1388,7 @@ static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
 		return -ENOMEM;
 	}
 
-	for (i = 0; i < config->num_connections; i++) {
-		struct nbd_sock *nsock = config->socks[i];
-
+	xa_for_each(&config->socks, i, nsock) {
 		if (!nsock->dead)
 			continue;
 
@@ -1424,10 +1454,11 @@ static void send_disconnects(struct nbd_device *nbd)
 	};
 	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
 	struct iov_iter from;
-	int i, ret;
+	struct nbd_sock *nsock;
+	unsigned long i;
+	int ret;
 
-	for (i = 0; i < config->num_connections; i++) {
-		struct nbd_sock *nsock = config->socks[i];
+	xa_for_each(&config->socks, i, nsock) {
 
 		iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request));
 		mutex_lock(&nsock->tx_lock);
@@ -1462,6 +1493,9 @@ static void nbd_config_put(struct nbd_device *nbd)
 	if (refcount_dec_and_mutex_lock(&nbd->config_refs,
 					&nbd->config_lock)) {
 		struct nbd_config *config = nbd->config;
+		struct nbd_sock *nsock;
+		unsigned long i;
+
 		nbd_dev_dbg_close(nbd);
 		invalidate_disk(nbd->disk);
 		if (nbd->config->bytesize)
@@ -1477,14 +1511,15 @@ static void nbd_config_put(struct nbd_device *nbd)
 			nbd->backend = NULL;
 		}
 		nbd_clear_sock(nbd);
+
 		if (config->num_connections) {
-			int i;
-			for (i = 0; i < config->num_connections; i++) {
-				sockfd_put(config->socks[i]->sock);
-				kfree(config->socks[i]);
+			xa_for_each(&config->socks, i, nsock) {
+				sockfd_put(nsock->sock);
+				kfree(nsock);
 			}
-			kfree(config->socks);
 		}
+		xa_destroy(&config->socks);
+
 		kfree(nbd->config);
 		nbd->config = NULL;
 
@@ -1500,11 +1535,13 @@ static int nbd_start_device(struct nbd_device *nbd)
 {
 	struct nbd_config *config = nbd->config;
 	int num_connections = config->num_connections;
-	int error = 0, i;
+	int error = 0;
+	unsigned long i;
+	struct nbd_sock *nsock;
 
 	if (nbd->pid)
 		return -EBUSY;
-	if (!config->socks)
+	if (xa_empty(&config->socks))
 		return -EINVAL;
 	if (num_connections > 1 &&
 	    !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
@@ -1535,7 +1572,7 @@ static int nbd_start_device(struct nbd_device *nbd)
 	set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);
 
 	nbd_dev_dbg_init(nbd);
-	for (i = 0; i < num_connections; i++) {
+	xa_for_each(&config->socks, i, nsock) {
 		struct recv_thread_args *args;
 
 		args = kzalloc_obj(*args);
@@ -1553,15 +1590,14 @@ static int nbd_start_device(struct nbd_device *nbd)
 				flush_workqueue(nbd->recv_workq);
 			return -ENOMEM;
 		}
-		sk_set_memalloc(config->socks[i]->sock->sk);
+		sk_set_memalloc(nsock->sock->sk);
 		if (nbd->tag_set.timeout)
-			config->socks[i]->sock->sk->sk_sndtimeo =
-				nbd->tag_set.timeout;
+			nsock->sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
 		atomic_inc(&config->recv_threads);
 		refcount_inc(&nbd->config_refs);
 		INIT_WORK(&args->work, recv_work);
 		args->nbd = nbd;
-		args->nsock = config->socks[i];
+		args->nsock = nsock;
 		args->index = i;
 		queue_work(nbd->recv_workq, &args->work);
 	}
@@ -1711,6 +1747,7 @@ static int nbd_alloc_and_init_config(struct nbd_device *nbd)
 		return -ENOMEM;
 	}
 
+	xa_init_flags(&config->socks, XA_FLAGS_ALLOC);
 	atomic_set(&config->recv_threads, 0);
 	init_waitqueue_head(&config->recv_wq);
 	init_waitqueue_head(&config->conn_wait);
-- 
2.52.0


^ permalink raw reply related

* [PATCH v2 0/5] nbd: eliminate queue freeze/unfreeze overhead in connection setup
From: Yang Erkun @ 2026-06-25  8:44 UTC (permalink / raw)
  To: josef, axboe, hch, yukuai
  Cc: yi.zhang, chengzhihao1, echo.chenlin, leo.lilong, wangkefeng.wang,
	linux-block, nbd

This series eliminates the blk_mq_freeze_queue()/unfreeze_queue()
overhead that currently occurs during nbd connection setup.  On
multi-core systems, each freeze cycle involves waiting for percpu_ref
to drain, and blk_mq_update_nr_hw_queues() further adds RCU grace
period waits (synchronize_rcu/srcu for elevator switch, kfree_rcu on
the old hctx array).  These delays are significant when nbd devices
must come online quickly.

The series proceeds in three stages:

1. Cleanups that simplify the sock management code and remove dead
   logic (patches 1-3).

2. A structural change that replaces the krealloc-based socks pointer
   array with an xarray (patch 2), which is the key prerequisite for
   removing the queue freeze from nbd_add_socket().  With xarray,
   concurrent readers via xa_load() never observe a partially
   initialized socket, and xa_store() is safe under RCU without
   freezing the queue.

3. Two patches that target the remaining freeze/unfreeze points:
   - Patch 4 removes the freeze from nbd_add_socket(), since the
     xarray no longer requires it.  This alone reduces connection
     setup time from ~4.5s to ~0.26s with 256 connections (-C 256).
   - Patch 5 (new) targets nbd_start_device(), which still calls
     blk_mq_update_nr_hw_queues() when nr_hw_queues differs from
     num_connections.  For devices created via the netlink connect
     path, the connection count is known upfront from NBD_ATTR_SOCKETS,
     so we can set nr_hw_queues correctly at device creation time and
     skip the runtime update entirely.

The ioctl path (NBD_SET_SOCK + NBD_DO_IT) remains fully functional:
pre-created devices with nbds_max>0 default to nr_hw_queues=1, and
nbd_start_device() still calls blk_mq_update_nr_hw_queues() when the
hardware queue count needs adjustment.

v1->v2:
1. rewrite cover letter
2. add patch 5

Long Li (4):
  nbd: simplify find_fallback() by removing redundant logic
  nbd: replace socks pointer array with xarray
  nbd: remove redundant num_connections boundary checks
  nbd: remove queue freeze in nbd_add_socket

Yang Erkun (1):
  nbd: set nr_hw_queues at device creation to skip queue freeze

 drivers/block/nbd.c | 236 ++++++++++++++++++++++++++------------------
 1 file changed, 139 insertions(+), 97 deletions(-)

-- 
2.52.0


^ permalink raw reply

* [PATCH v2 4/5] nbd: remove queue freeze in nbd_add_socket
From: Yang Erkun @ 2026-06-25  8:44 UTC (permalink / raw)
  To: josef, axboe, hch, yukuai
  Cc: yi.zhang, chengzhihao1, echo.chenlin, leo.lilong, wangkefeng.wang,
	linux-block, nbd
In-Reply-To: <20260625084458.4171890-1-yangerkun@huawei.com>

From: Long Li <leo.lilong@huawei.com>

The queue freeze was originally needed to prevent concurrent requests
from accessing config->socks while the backing array was being
reallocated. Since config->socks is now an xarray, insertions are
safe under RCU without freezing the queue.

This significantly reduces connection setup time when using a large
number of connections (-C 256):

  before: real 4.510s
  after:  real 0.263s

Signed-off-by: Long Li <leo.lilong@huawei.com>
---
 drivers/block/nbd.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 409a655c40f0..953146c85f17 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -1299,7 +1299,6 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 	struct nbd_config *config = nbd->config;
 	struct socket *sock;
 	struct nbd_sock *nsock;
-	unsigned int memflags;
 	unsigned int index;
 	int err;
 
@@ -1311,12 +1310,6 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 		return err;
 	nbd_reclassify_socket(sock);
 
-	/*
-	 * We need to make sure we don't get any errant requests while we're
-	 * reallocating the ->socks array.
-	 */
-	memflags = blk_mq_freeze_queue(nbd->disk->queue);
-
 	if (!netlink && !nbd->task_setup &&
 	    !test_bit(NBD_RT_BOUND, &config->runtime_flags))
 		nbd->task_setup = current;
@@ -1353,12 +1346,10 @@ static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
 
 	config->num_connections++;
 	atomic_inc(&config->live_connections);
-	blk_mq_unfreeze_queue(nbd->disk->queue, memflags);
 
 	return 0;
 
 put_socket:
-	blk_mq_unfreeze_queue(nbd->disk->queue, memflags);
 	sockfd_put(sock);
 	return err;
 }
-- 
2.52.0


^ permalink raw reply related

* Re: [PATCH] block: partitions: Use seq_buf_putc() in cmdline_partition()
From: Andy Shevchenko @ 2026-06-25  8:18 UTC (permalink / raw)
  To: Markus Elfring
  Cc: linux-block, Jens Axboe, Josh Law, Kees Cook, LKML,
	kernel-janitors, Woradorn Laodhanadhaworn
In-Reply-To: <59dfd2ef-2fda-4dd0-a288-52c35613e778@web.de>

On Thu, Jun 25, 2026 at 10:08:01AM +0200, Markus Elfring wrote:
> 
> A single line break should be put into a sequence buffer.
> Thus use the corresponding function “seq_buf_putc”.

“seq_buf_putc()”.

> The source code was transformed by using the Coccinelle software.

...

>  	cmdline_parts_set(parts, disk_size, state);
>  	cmdline_parts_verifier(1, state);

> -
> -	seq_buf_puts(&state->pp_buf, "\n");
> -
> +	seq_buf_putc(&state->pp_buf, '\n');

Why did you remove blank lines?

>  	return 1;

-- 
With Best Regards,
Andy Shevchenko



^ permalink raw reply

* [PATCH] block: partitions: Use seq_buf_putc() in cmdline_partition()
From: Markus Elfring @ 2026-06-25  8:08 UTC (permalink / raw)
  To: linux-block, Jens Axboe, Josh Law, Kees Cook
  Cc: LKML, kernel-janitors, Andy Shevchenko, Woradorn Laodhanadhaworn

From: Markus Elfring <elfring@users.sourceforge.net>
Date: Thu, 25 Jun 2026 09:50:33 +0200

A single line break should be put into a sequence buffer.
Thus use the corresponding function “seq_buf_putc”.

The source code was transformed by using the Coccinelle software.

Signed-off-by: Markus Elfring <elfring@users.sourceforge.net>
---

All affected source code places could be adjusted at once
if the change acceptance would evolve accordingly.

18 source files are left over for similar development considerations.


 block/partitions/cmdline.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c
index 4fd52ed154b4..ce7488b3d2db 100644
--- a/block/partitions/cmdline.c
+++ b/block/partitions/cmdline.c
@@ -376,8 +376,6 @@ int cmdline_partition(struct parsed_partitions *state)
 
 	cmdline_parts_set(parts, disk_size, state);
 	cmdline_parts_verifier(1, state);
-
-	seq_buf_puts(&state->pp_buf, "\n");
-
+	seq_buf_putc(&state->pp_buf, '\n');
 	return 1;
 }
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH] iomap: Remove FGP_NOFS from iomap_get_folio()
From: Christian Brauner @ 2026-06-25  7:43 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: Darrick J. Wong, Jens Axboe, Namjae Jeon, Sungjong Seo,
	Yuezhang Mo, Miklos Szeredi, Andreas Gruenbacher, Hyunchul Lee,
	Konstantin Komarov, Carlos Maiolino, Damien Le Moal, Naohiro Aota,
	Johannes Thumshirn, linux-xfs, linux-fsdevel, linux-block,
	fuse-devel, gfs2, ntfs3
In-Reply-To: <20260624174228.2015893-1-willy@infradead.org>

On Wed, 24 Jun 2026 18:42:26 +0100, Matthew Wilcox (Oracle) wrote:
> iomap: Remove FGP_NOFS from iomap_get_folio()

Moving into -next so we see fallout...

---

Applied to the vfs-7.3.iomap branch of the vfs/vfs.git tree.
Patches in the vfs-7.3.iomap branch should appear in linux-next soon.

Please report any outstanding bugs that were missed during review in a
new review to the original patch series allowing us to drop it.

It's encouraged to provide Acked-bys and Reviewed-bys even though the
patch has now been applied. If possible patch trailers will be updated.

Note that commit hashes shown below are subject to change due to rebase,
trailer updates or similar. If in doubt, please check the listed branch.

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs.git
branch: vfs-7.3.iomap

[1/1] iomap: Remove FGP_NOFS from iomap_get_folio()
      https://git.kernel.org/vfs/vfs/c/ec17795fb797


^ permalink raw reply

* Re: [PATCH v2] block: serialize elevator changes for the same queue using a writer lock
From: Shin'ichiro Kawasaki @ 2026-06-25  7:05 UTC (permalink / raw)
  To: Ming Lei; +Cc: linux-block, Jens Axboe, Nilay Shroff
In-Reply-To: <ajvz98mmlJEaTNxd@fedora>

On Jun 24, 2026 / 10:12, Ming Lei wrote:
> On Tue, Jun 23, 2026 at 10:32:38AM +0900, Shin'ichiro Kawasaki wrote:
> > When elevator_change() is called concurrently for the same queue, the
> > elevator_change_done() function runs concurrently as well. This function
> > adds or deletes kobjects for the debugfs entry of the queue. Then the
> > concurrent calls cause memory corruption of the kobjects and result in a
> > process hang. The core part of the elevator switch is protected by queue
> > freeze and q->elevator_lock. However, since the commit 559dc11143eb
> > ("block: move elv_register[unregister]_queue out of elevator_lock"), the
> > elevator_change_done() is not serialized. Hence the memory corruption
> > and the hang.
> > 
> > The failures are observed when udev-worker writes to a sysfs
> > queue/scheduler attribute file while the blktests test case block/005
> > writes to the same attribute file. The failure also can be recreated by
> > running two processes that write to the same queue/scheduler file
> > concurrently. The failure is observed since another commit 370ac285f23a
> > ("block: avoid cpu_hotplug_lock depedency on freeze_lock"). This commit
> > changed the behavior of queue freeze and it unveiled the failure.
> > 
> > Fix the failure by changing elv_iosched_store() to acquire
> > update_nr_hwq_lock as the writer lock instead of the reader lock. This
> > serializes the whole elevator switch steps, including the
> > elevator_change_done() call.
> > 
> > Fixes: 559dc11143eb ("block: move elv_register[unregister]_queue out of elevator_lock")
> > Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki@wdc.com>
> > ---
> > I observed that the blktests test case block/005 hung on a specific
> > server hardware using a specific HDD as a block device. During the test
> > case run, the kernel reported KASAN null-ptr-deref and slab-use-after-
> > free errors. The failure happened when a sysfs queue/scheduler attribute
> > file is written concurrently. I reported the failure and shared a
> > candidate fix patch as RFC [1]. Based on the comments and discussion on
> > the RFC patch, I propose this v2 patch that avoids introducing a new
> > lock. My thanks go to Ming and Nilay for the discussion.
> > 
> > Please refer to [1] for details of the failure. Also, I created a
> > blktests test case that recreates the hang [2], which I used to test the
> > fix.
> > 
> > * Changes from RFC v1
> > - Instead of adding a new mutex to struct request_queue, replace the
> >   reader lock on update_nr_hwq_lock with the writer lock in
> >   elv_iosched_store().
> > 
> > [1] https://lore.kernel.org/linux-block/20260611074200.474676-1-shinichiro.kawasaki@wdc.com/
> > [2] https://github.com/kawasaki/blktests/commit/8e80b3ccc0bbbe3f209d00eacd138d020de97fc6
> > 
> >  block/elevator.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> > 
> > diff --git a/block/elevator.c b/block/elevator.c
> > index 3bcd37c2aa34..b03185a217ff 100644
> > --- a/block/elevator.c
> > +++ b/block/elevator.c
> > @@ -813,7 +813,7 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
> >  	 *   update_nr_hwq_lock -> kn->active (via del_gendisk -> kobject_del)
> >  	 *   kn->active -> update_nr_hwq_lock (via this sysfs write path)
> >  	 */
> > -	if (!down_read_trylock(&set->update_nr_hwq_lock)) {
> > +	if (!down_write_trylock(&set->update_nr_hwq_lock)) {
> 
> I'd suggest to document why using write_trylock above, such as serializing
> 2-stage elevator switch, anyway this patch looks good as bug fix:
> 
> Reviewed-by: Ming Lei <tom.leiming@gmail.com>

Thanks for the comment. As to the suggested documentation, I think we can add
the block comment as follows. I will prepare the v3 patch tomorrow to fold-in
the comment.

diff --git a/block/elevator.c b/block/elevator.c
index b03185a217ff..2161b6eea680 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -812,6 +812,11 @@ ssize_t elv_iosched_store(struct gendisk *disk, const char *buf,
 	 * reference during concurrent disk deletion:
 	 *   update_nr_hwq_lock -> kn->active (via del_gendisk -> kobject_del)
 	 *   kn->active -> update_nr_hwq_lock (via this sysfs write path)
+	 *
+	 * Use the writer lock instead of the reader lock of update_nr_hwq_lock
+	 * to serialize the two-stage elevator switch steps in
+	 * elevator_change(): the core switch step under the elevator lock and
+	 * the elevator_change_done() step outside the elevator lock.
 	 */
 	if (!down_write_trylock(&set->update_nr_hwq_lock)) {
 		ret = -EBUSY;

^ permalink raw reply related

* Re: [PATCH 10/16] fs/buffer: Remove fs-layer decryption code
From: Christian Brauner @ 2026-06-25  7:01 UTC (permalink / raw)
  To: Eric Biggers
  Cc: linux-fscrypt, linux-fsdevel, linux-ext4, linux-f2fs-devel,
	linux-block, Christoph Hellwig, Theodore Ts'o, Andreas Dilger,
	Baokun Li, Jan Kara, Ojaswin Mujoo, Ritesh Harjani, Zhang Yi,
	Jaegeuk Kim, Chao Yu
In-Reply-To: <20260624050334.124606-11-ebiggers@kernel.org>

On 2026-06-23 22:03 -0700, Eric Biggers wrote:
> Now that fscrypt's file contents en/decryption is always implemented
> using blk-crypto when the filesystem is block-based, the fs-layer
> decryption code in fs/buffer.c is unused code.  Remove it.
> 
> Signed-off-by: Eric Biggers <ebiggers@kernel.org>
> ---

Reviewed-by: Christian Brauner (Amutable) <brauner@kernel.org>


^ permalink raw reply

* Re: [PATCH blktests] block/044: basic block error injection sanity test
From: Shin'ichiro Kawasaki @ 2026-06-25  6:36 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-block
In-Reply-To: <20260622160808.1552568-1-hch@lst.de>

[-- Attachment #1: Type: text/plain, Size: 3966 bytes --]

Hi Christoph, thanks for the patch. I ran the test case with block/for-next
kernel branch tip and confirmed that it is working as expected.

Please find my comments in line. FYI, I atttach the patch which reflects my
comments. If you are fine with the changes, please let me know so that I can
fold in the change and apply this patch.

On Jun 22, 2026 / 18:08, Christoph Hellwig wrote:
> Test the basic block layer error injection functionality.
> 
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  tests/block/044     | 71 +++++++++++++++++++++++++++++++++++++++++++++
>  tests/block/044.out |  9 ++++++
>  2 files changed, 80 insertions(+)
>  create mode 100755 tests/block/044
>  create mode 100644 tests/block/044.out
> 
> diff --git a/tests/block/044 b/tests/block/044
> new file mode 100755
> index 000000000000..8baf9fa59c68
> --- /dev/null
> +++ b/tests/block/044
> @@ -0,0 +1,71 @@
> +#!/bin/bash
> +# SPDX-License-Identifier: GPL-2.0

Nit: Majority of the blktests test cases have GPL-3.0+. If you do not mind,
I suggest GPL-3.0+.

> +# Copyright (c) 2026 Christoph Hellwig.
> +#
> +# Basic block error injection test.
> +
> +. tests/block/rc
> +. common/scsi_debug
> +
> +DESCRIPTION="basic block error injection test"
> +QUICK=1
> +
> +requires()
> +{	

Nit: a stray tab in the line above.

I suggest to add the line below.

       _have_kernel_option BLK_ERROR_INJECTION

This way, we can confirm the kernel has the required changes and
the dependent feature is enabled.

> +	_have_loadable_scsi_debug
> +	_have_program xfs_io
> +}
> +
> +# load and remove scsi_debug once to test the static_key bug in the
> +# initial commit
> +_test_load_unload()
> +{
> +	if ! _init_scsi_debug dev_size_mb=500; then
> +		return 1
> +	fi
> +
> +	local dev=${SCSI_DEBUG_DEVICES[0]}
> +	local debugfs_file="/sys/kernel/debug/block/$dev/error_injection"
> +	if [[ ! -f "${debugfs_file}" ]]; then
> +		SKIP_REASONS+=("error injection not supported")
> +		_exit_scsi_debug
> +		return 1
> +	fi

With BLK_ERROR_INJECTION check in requires(), 7 lines above will not be
required. This will simplify this bash function.

> +	echo "Testing unload without rules"
> +	_exit_scsi_debug
> +}
> +
> +_test_rules()
> +{
> +	if ! _init_scsi_debug dev_size_mb=500; then
> +		return 1
> +	fi
> +
> +	local dev=${SCSI_DEBUG_DEVICES[0]}
> +	local debugfs_file="/sys/kernel/debug/block/$dev/error_injection"
> +
> +	echo "Testing valid rules"
> +	echo "add,op=WRITE,status=RESOURCE,start=0,nr_sectors=8" > $debugfs_file
> +	echo "add,op=READ,status=IOERR,start=16,nr_sectors=8" > $debugfs_file
> +	xfs_io -d -c 'pwrite -q 0 4096' /dev/$dev
> +	xfs_io -d -c 'pread -q 0 4096' /dev/$dev
> +	xfs_io -d -c 'pwrite -q 4096 4096' /dev/$dev
> +	xfs_io -d -c 'pread -q 8192 8192' /dev/$dev
> +
> +	echo "Testing invalid rules"
> +	echo "op=READ,status=IOERR" > $debugfs_file
> +	echo "add,op=READ,status=EIO,start=32" > $debugfs_file

$debugfs_file and $dev in this function require double quotation. Otherwise,
shellcheck will warn about them.

$ make check-parallel
Running shellcheck with 4 parallel jobs...
tests/block/044:48:61: note: Double quote to prevent globbing and word splitting. [SC2086]
tests/block/044:49:58: note: Double quote to prevent globbing and word splitting. [SC2086]
tests/block/044:50:39: note: Double quote to prevent globbing and word splitting. [SC2086]
tests/block/044:51:38: note: Double quote to prevent globbing and word splitting. [SC2086]
tests/block/044:52:42: note: Double quote to prevent globbing and word splitting. [SC2086]
tests/block/044:53:41: note: Double quote to prevent globbing and word splitting. [SC2086]
tests/block/044:56:32: note: Double quote to prevent globbing and word splitting. [SC2086]
tests/block/044:57:43: note: Double quote to prevent globbing and word splitting. [SC2086]

> +	_exit_scsi_debug
> +}
> +
> +test()
> +{
> +	echo "Running ${TEST_NAME}"
> +
> +	local ret
> +

The local variable declaration is not required.



[-- Attachment #2: patch --]
[-- Type: text/plain, Size: 1885 bytes --]

diff --git a/tests/block/044 b/tests/block/044
index 8baf9fa..d31aa4c 100755
--- a/tests/block/044
+++ b/tests/block/044
@@ -11,7 +11,8 @@ DESCRIPTION="basic block error injection test"
 QUICK=1
 
 requires()
-{	
+{
+	_have_kernel_option BLK_ERROR_INJECTION
 	_have_loadable_scsi_debug
 	_have_program xfs_io
 }
@@ -23,14 +24,6 @@ _test_load_unload()
 	if ! _init_scsi_debug dev_size_mb=500; then
 		return 1
 	fi
-
-	local dev=${SCSI_DEBUG_DEVICES[0]}
-	local debugfs_file="/sys/kernel/debug/block/$dev/error_injection"
-	if [[ ! -f "${debugfs_file}" ]]; then
-		SKIP_REASONS+=("error injection not supported")
-		_exit_scsi_debug
-		return 1
-	fi
 	echo "Testing unload without rules"
 	_exit_scsi_debug
 }
@@ -45,16 +38,16 @@ _test_rules()
 	local debugfs_file="/sys/kernel/debug/block/$dev/error_injection"
 
 	echo "Testing valid rules"
-	echo "add,op=WRITE,status=RESOURCE,start=0,nr_sectors=8" > $debugfs_file
-	echo "add,op=READ,status=IOERR,start=16,nr_sectors=8" > $debugfs_file
-	xfs_io -d -c 'pwrite -q 0 4096' /dev/$dev
-	xfs_io -d -c 'pread -q 0 4096' /dev/$dev
-	xfs_io -d -c 'pwrite -q 4096 4096' /dev/$dev
-	xfs_io -d -c 'pread -q 8192 8192' /dev/$dev
+	echo "add,op=WRITE,status=RESOURCE,start=0,nr_sectors=8" > "$debugfs_file"
+	echo "add,op=READ,status=IOERR,start=16,nr_sectors=8" > "$debugfs_file"
+	xfs_io -d -c 'pwrite -q 0 4096' /dev/"$dev"
+	xfs_io -d -c 'pread -q 0 4096' /dev/"$dev"
+	xfs_io -d -c 'pwrite -q 4096 4096' /dev/"$dev"
+	xfs_io -d -c 'pread -q 8192 8192' /dev/"$dev"
 
 	echo "Testing invalid rules"
-	echo "op=READ,status=IOERR" > $debugfs_file
-	echo "add,op=READ,status=EIO,start=32" > $debugfs_file
+	echo "op=READ,status=IOERR" > "$debugfs_file"
+	echo "add,op=READ,status=EIO,start=32" > "$debugfs_file"
 	_exit_scsi_debug
 }
 
@@ -62,8 +55,6 @@ test()
 {
 	echo "Running ${TEST_NAME}"
 
-	local ret
-
 	_test_load_unload
 	_test_rules
 

^ permalink raw reply related

* [PATCH] block: avoid potential deadlock on zone revalidation failure
From: Damien Le Moal @ 2026-06-25  6:28 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: Christoph Hellwig

If revalidating the zones of a zoned block device with
blk_revalidate_disk_zones() fails during a SCSI disk rescan, the following
lockdep splat is thrown:

[  347.251859] [  T11230] sda: failed to revalidate zones

[  347.261380] [  T11230] ======================================================
[  347.263882] [  T11230] WARNING: possible circular locking dependency detected
[  347.266353] [  T11230] 7.1.0+ #1194 Not tainted
[  347.268052] [  T11230] ------------------------------------------------------
[  347.270537] [  T11230] tcsh/11230 is trying to acquire lock:
[  347.272555] [  T11230] ffffffff8f91d400 (wq_pool_mutex){+.+.}-{4:4}, at: destroy_workqueue+0x15d/0x8d0
[  347.275914] [  T11230]
                          but task is already holding lock:
[  347.278646] [  T11230] ffff88812fa1bcc0 (&q->q_usage_counter(io)#5){++++}-{0:0}, at: blk_mq_freeze_queue_nomemsave+0x16/0x30
[  347.282503] [  T11230]
                          which lock already depends on the new lock.

[  347.286239] [  T11230]
                          the existing dependency chain (in reverse order) is:
[  347.289408] [  T11230]
                          -> #2 (&q->q_usage_counter(io)#5){++++}-{0:0}:
[  347.292437] [  T11230]        blk_alloc_queue+0x5ca/0x750
[  347.294379] [  T11230]        blk_mq_alloc_queue+0x14c/0x240
[  347.296375] [  T11230]        scsi_alloc_sdev+0x871/0xd10 [scsi_mod]
[  347.298619] [  T11230]        scsi_probe_and_add_lun+0x600/0xc50 [scsi_mod]
[  347.301056] [  T11230]        __scsi_scan_target+0x187/0x3b0 [scsi_mod]
[  347.303385] [  T11230]        scsi_scan_channel+0xf2/0x180 [scsi_mod]
[  347.305651] [  T11230]        scsi_scan_host_selected+0x20b/0x2d0 [scsi_mod]
[  347.308119] [  T11230]        do_scan_async+0x42/0x420 [scsi_mod]
[  347.310276] [  T11230]        async_run_entry_fn+0x94/0x5a0
[  347.312284] [  T11230]        process_one_work+0x8da/0x1690
[  347.314287] [  T11230]        worker_thread+0x5fe/0x1010
[  347.316216] [  T11230]        kthread+0x358/0x450
[  347.317675] [  T11230]        ret_from_fork+0x5b9/0x8e0
[  347.319181] [  T11230]        ret_from_fork_asm+0x11/0x20
[  347.320778] [  T11230]
                          -> #1 (fs_reclaim){+.+.}-{0:0}:
[  347.322890] [  T11230]        fs_reclaim_acquire+0xd5/0x120
[  347.324464] [  T11230]        __kmalloc_cache_node_noprof+0x39/0x620
[  347.326223] [  T11230]        init_rescuer+0x19b/0x560
[  347.327697] [  T11230]        workqueue_init+0x33b/0x6a0
[  347.329224] [  T11230]        kernel_init_freeable+0x2eb/0x600
[  347.330881] [  T11230]        kernel_init+0x1c/0x140
[  347.332334] [  T11230]        ret_from_fork+0x5b9/0x8e0
[  347.333847] [  T11230]        ret_from_fork_asm+0x11/0x20
[  347.335360] [  T11230]
                          -> #0 (wq_pool_mutex){+.+.}-{4:4}:
[  347.337510] [  T11230]        __lock_acquire+0xdea/0x2260
[  347.339030] [  T11230]        lock_acquire+0x187/0x2f0
[  347.340495] [  T11230]        __mutex_lock+0x1ab/0x2600
[  347.341464] [  T11230]        destroy_workqueue+0x15d/0x8d0
[  347.342485] [  T11230]        disk_free_zone_resources+0xd5/0x560
[  347.343577] [  T11230]        blk_revalidate_disk_zones+0x620/0xac7
[  347.344723] [  T11230]        sd_zbc_revalidate_zones+0x1dd/0x790 [sd_mod]
[  347.345938] [  T11230]        sd_revalidate_disk+0xc66/0x8e60 [sd_mod]
[  347.347112] [  T11230]        scsi_rescan_device+0x1f9/0x310 [scsi_mod]
[  347.348318] [  T11230]        store_rescan_field+0x19/0x20 [scsi_mod]
[  347.349507] [  T11230]        kernfs_fop_write_iter+0x3d2/0x5e0
[  347.350565] [  T11230]        vfs_write+0x469/0x1000
[  347.351484] [  T11230]        ksys_write+0x116/0x250
[  347.352403] [  T11230]        do_syscall_64+0xf0/0x6e0
[  347.353361] [  T11230]        entry_SYSCALL_64_after_hwframe+0x4b/0x53
[  347.354533] [  T11230]
                          other info that might help us debug this:

[  347.356432] [  T11230] Chain exists of:
                            wq_pool_mutex --> fs_reclaim --> &q->q_usage_counter(io)#5

[  347.358919] [  T11230]  Possible unsafe locking scenario:

[  347.360307] [  T11230]        CPU0                    CPU1
[  347.361327] [  T11230]        ----                    ----
[  347.362340] [  T11230]   lock(&q->q_usage_counter(io)#5);
[  347.363344] [  T11230]                                lock(fs_reclaim);
[  347.364526] [  T11230]                                lock(&q->q_usage_counter(io)#5);
[  347.365968] [  T11230]   lock(wq_pool_mutex);
[  347.366811] [  T11230]
                           *** DEADLOCK ***

This happens because SCSI disk rescan is executed from a work context
and a failure of blk_revalidate_disk_zones() causes a call to
disk_free_zone_resources() which will free the disk zone write plug
workqueue.

Avoid this by delaying the destruction of the disk zone write plug
workqueue to disk_release(). Do this by introducing the function
disk_release_zone_resources() and using this new function from
disk_release(). This new function calls disk_free_zone_resources() and
destroys the zone write plugs workqueue, thus allowing to remove the
call to destroy_workqueue() from disk_free_zone_resources().
disk_alloc_zone_resources() is modified to not create the disk zone
write plug work queue if it already exists.

Fixes: a8f59e5a5dea ("block: use a per disk workqueue for zone write plugging")
Cc: stable@vger.kernek.org
Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 block/blk-zoned.c | 40 +++++++++++++++++++++++++---------------
 block/blk.h       |  4 ++--
 block/genhd.c     |  2 +-
 3 files changed, 28 insertions(+), 18 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index bea817f3de56..133600ebe05c 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -1923,11 +1923,20 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
 	if (!disk->zone_wplugs_pool)
 		goto free_hash;
 
-	disk->zone_wplugs_wq =
-		alloc_workqueue("%s_zwplugs", WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU,
-				pool_size, disk->disk_name);
-	if (!disk->zone_wplugs_wq)
-		goto destroy_pool;
+	/*
+	 * We may already have a zone write plug workqueue as this function may
+	 * be called after disk_free_zone_resources(), which does not destroy
+	 * the workqueue (the zone write plugs workqueue is destroyed at
+	 * disk_release() time).
+	 */
+	if (!disk->zone_wplugs_wq) {
+		disk->zone_wplugs_wq =
+			alloc_workqueue("%s_zwplugs",
+					WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_PERCPU,
+					pool_size, disk->disk_name);
+		if (!disk->zone_wplugs_wq)
+			goto destroy_pool;
+	}
 
 	disk->zone_wplugs_worker =
 		kthread_create(disk_zone_wplugs_worker, disk,
@@ -1935,15 +1944,12 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
 	if (IS_ERR(disk->zone_wplugs_worker)) {
 		ret = PTR_ERR(disk->zone_wplugs_worker);
 		disk->zone_wplugs_worker = NULL;
-		goto destroy_wq;
+		goto destroy_pool;
 	}
 	wake_up_process(disk->zone_wplugs_worker);
 
 	return 0;
 
-destroy_wq:
-	destroy_workqueue(disk->zone_wplugs_wq);
-	disk->zone_wplugs_wq = NULL;
 destroy_pool:
 	mempool_destroy(disk->zone_wplugs_pool);
 	disk->zone_wplugs_pool = NULL;
@@ -1999,7 +2005,7 @@ static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
 	kfree_rcu_mightsleep(zones_cond);
 }
 
-void disk_free_zone_resources(struct gendisk *disk)
+static void disk_free_zone_resources(struct gendisk *disk)
 {
 	if (disk->zone_wplugs_worker) {
 		kthread_stop(disk->zone_wplugs_worker);
@@ -2007,11 +2013,6 @@ void disk_free_zone_resources(struct gendisk *disk)
 	}
 	WARN_ON_ONCE(!list_empty(&disk->zone_wplugs_list));
 
-	if (disk->zone_wplugs_wq) {
-		destroy_workqueue(disk->zone_wplugs_wq);
-		disk->zone_wplugs_wq = NULL;
-	}
-
 	disk_destroy_zone_wplugs_hash_table(disk);
 
 	disk_set_zones_cond_array(disk, NULL);
@@ -2020,6 +2021,15 @@ void disk_free_zone_resources(struct gendisk *disk)
 	disk->nr_zones = 0;
 }
 
+void disk_release_zone_resources(struct gendisk *disk)
+{
+	disk_free_zone_resources(disk);
+	if (disk->zone_wplugs_wq) {
+		destroy_workqueue(disk->zone_wplugs_wq);
+		disk->zone_wplugs_wq = NULL;
+	}
+}
+
 struct blk_revalidate_zone_args {
 	struct gendisk	*disk;
 	u8		*zones_cond;
diff --git a/block/blk.h b/block/blk.h
index 25af8ac5ef0f..fb95d3c58950 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -528,7 +528,7 @@ static inline void ioc_clear_queue(struct request_queue *q)
 
 #ifdef CONFIG_BLK_DEV_ZONED
 void disk_init_zone_resources(struct gendisk *disk);
-void disk_free_zone_resources(struct gendisk *disk);
+void disk_release_zone_resources(struct gendisk *disk);
 static inline bool bio_zone_write_plugging(struct bio *bio)
 {
 	return bio_flagged(bio, BIO_ZONE_WRITE_PLUGGING);
@@ -581,7 +581,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
 static inline void disk_init_zone_resources(struct gendisk *disk)
 {
 }
-static inline void disk_free_zone_resources(struct gendisk *disk)
+static inline void disk_release_zone_resources(struct gendisk *disk)
 {
 }
 static inline bool bio_zone_write_plugging(struct bio *bio)
diff --git a/block/genhd.c b/block/genhd.c
index f84b6a355b57..30ac0ffe6517 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1300,7 +1300,7 @@ static void disk_release(struct device *dev)
 
 	disk_release_events(disk);
 	kfree(disk->random);
-	disk_free_zone_resources(disk);
+	disk_release_zone_resources(disk);
 	xa_destroy(&disk->part_tbl);
 
 	kobject_put(&disk->queue_kobj);
-- 
2.54.0


^ permalink raw reply related

* Re: [PATCH v3 5/5] block: validate user space vectors during extraction
From: Hannes Reinecke @ 2026-06-25  6:28 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch, stable
In-Reply-To: <20260624170905.3972095-6-kbusch@meta.com>

On 6/24/26 7:09 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The bio-based drivers don't necessarily check the alignment split, and
> stacking block drivers don't always handle a misalignment detected after
> submitting the bio. Validate user vectors against the device's
> dma_alignment as the bio is built from the iov_iter, rejecting
> misaligned early with -EINVAL.
> 
> Cc: stable@vger.kernel.org
> Fixes: 5ff3f74e145a ("block: simplify direct io validity check")
> Fixes: 7eac33186957 ("iomap: simplify direct io validity check")
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   block/bio.c          | 56 +++++++++++++++++++++++++++++++++++++++++---
>   block/blk-map.c      |  2 +-
>   block/fops.c         |  2 +-
>   fs/iomap/direct-io.c |  1 +
>   include/linux/bio.h  |  2 +-
>   include/linux/uio.h  | 10 +++++++-
>   lib/iov_iter.c       |  9 ++++++-
>   7 files changed, 74 insertions(+), 8 deletions(-)
> 
Reviewed-by: Hannes Reinecke <hare@kernel.org>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich

^ permalink raw reply

* Re: [PATCH v3 4/5] zloop: set dma_alignment from the backing files for direct I/O
From: Hannes Reinecke @ 2026-06-25  6:26 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
In-Reply-To: <20260624170905.3972095-5-kbusch@meta.com>

On 6/24/26 7:09 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Direct I/O user pages are forwarded to the backing files unchanged, so
> the backing's DMA alignment requirement applies to them. Track the
> backing file's dio_mem_align and advertise it as the zloop device's
> dma_alignment if it is larger than the default so we advertise proper
> limits and misaligned I/O is rejected early instead of being dispatched
> to the backend.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   drivers/block/zloop.c | 35 +++++++++++++++++++++++++----------
>   1 file changed, 25 insertions(+), 10 deletions(-)
> 
Reviewed-by: Hannes Reinecke <hare@kernel.org>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich

^ permalink raw reply

* Re: [PATCH v3 3/5] loop: set dma_alignment from the backing file for direct I/O
From: Hannes Reinecke @ 2026-06-25  6:25 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
In-Reply-To: <20260624170905.3972095-4-kbusch@meta.com>

On 6/24/26 7:09 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Direct I/O user pages are forwarded to the backing file unchanged, so
> the backing's DMA alignment requirement applies to them. Track the
> backing file's dio_mem_align and advertise it as the loop device's
> dma_alignment if it is larger than the default so we advertise proper
> limits and misaligned I/O is rejected early instead of being dispatched
> to the backend.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   drivers/block/loop.c | 46 ++++++++++++++++++++++++++++++++++++--------
>   1 file changed, 38 insertions(+), 8 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 310de0463beb1..5fe61d542f8b7 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -54,6 +54,7 @@ struct loop_device {
>   
>   	struct file	*lo_backing_file;
>   	unsigned int	lo_min_dio_size;
> +	unsigned int	lo_dio_mem_align;
>   	struct block_device *lo_device;
>   
>   	gfp_t		old_gfp_mask;
> @@ -447,26 +448,37 @@ static void loop_reread_partitions(struct loop_device *lo)
>   			__func__, lo->lo_number, lo->lo_file_name, rc);
>   }
>   
> -static unsigned int loop_query_min_dio_size(struct loop_device *lo)
> +static void loop_update_dio_alignment(struct loop_device *lo)
>   {
>   	struct file *file = lo->lo_backing_file;
>   	struct block_device *sb_bdev = file->f_mapping->host->i_sb->s_bdev;
>   	struct kstat st;
>   
>   	/*
> -	 * Use the minimal dio alignment of the file system if provided.
> +	 * Use the dio alignment of the file system if provided.  The incomoing
> +	 * request's bio_vec is forwarded to the backing file unchanged, so its
> +	 * required memory alignment becomes the device's dma_alignment when
> +	 * used for direct-io.
>   	 */
>   	if (!vfs_getattr(&file->f_path, &st, STATX_DIOALIGN, 0) &&
> -	    (st.result_mask & STATX_DIOALIGN))
> -		return st.dio_offset_align;
> +	    (st.result_mask & STATX_DIOALIGN)) {
> +		lo->lo_min_dio_size = st.dio_offset_align;
> +		lo->lo_dio_mem_align = st.dio_mem_align - 1;
> +		return;
> +	}
>   
>   	/*
>   	 * In a perfect world this wouldn't be needed, but as of Linux 6.13 only
>   	 * a handful of file systems support the STATX_DIOALIGN flag.
>   	 */
> -	if (sb_bdev)
> -		return bdev_logical_block_size(sb_bdev);
> -	return SECTOR_SIZE;
> +	if (sb_bdev) {
> +		lo->lo_min_dio_size = bdev_logical_block_size(sb_bdev);
> +		lo->lo_dio_mem_align = bdev_dma_alignment(sb_bdev);
> +		return;
> +	}
> +
> +	lo->lo_min_dio_size = SECTOR_SIZE;
> +	lo->lo_dio_mem_align = SECTOR_SIZE - 1;
>   }
>   
>   static inline int is_loop_device(struct file *file)
> @@ -509,7 +521,7 @@ static void loop_assign_backing_file(struct loop_device *lo, struct file *file)
>   			lo->old_gfp_mask & ~(__GFP_IO | __GFP_FS));
>   	if (lo->lo_backing_file->f_flags & O_DIRECT)
>   		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
> -	lo->lo_min_dio_size = loop_query_min_dio_size(lo);
> +	loop_update_dio_alignment(lo);
>   }
>   
>   static int loop_check_backing_file(struct file *file)
> @@ -940,6 +952,19 @@ static unsigned int loop_default_blocksize(struct loop_device *lo)
>   	return SECTOR_SIZE;
>   }
>   
> +static void loop_set_dma_limit(struct loop_device *lo, struct queue_limits *lim)
> +{
> +	/*
> +	 * Direct I/O forwards the user pages to the backing file unchanged, so
> +	 * track the backing's DMA alignment requirement as the mode is toggled.
> +	 */
> +	if (lo->lo_flags & LO_FLAGS_DIRECT_IO)
> +		lim->dma_alignment = max_t(unsigned int, lo->lo_dio_mem_align,
> +					   SECTOR_SIZE - 1);
> +	else
> +		lim->dma_alignment = SECTOR_SIZE - 1;

Nit: we never set this previously, so we could drop this line.

> +}
> +
>   static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
>   		unsigned int bsize)
>   {
> @@ -961,6 +986,7 @@ static void loop_update_limits(struct loop_device *lo, struct queue_limits *lim,
>   	lim->logical_block_size = bsize;
>   	lim->physical_block_size = bsize;
>   	lim->io_min = bsize;
> +	loop_set_dma_limit(lo, lim);
>   	lim->features &= ~(BLK_FEAT_WRITE_CACHE | BLK_FEAT_ROTATIONAL);
>   	if (file->f_op->fsync && !(lo->lo_flags & LO_FLAGS_READ_ONLY))
>   		lim->features |= BLK_FEAT_WRITE_CACHE;
> @@ -1416,6 +1442,7 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
>   {
>   	bool use_dio = !!arg;
>   	unsigned int memflags;
> +	struct queue_limits lim;
>   
>   	if (lo->lo_state != Lo_bound)
>   		return -ENXIO;
> @@ -1434,6 +1461,9 @@ static int loop_set_dio(struct loop_device *lo, unsigned long arg)
>   		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
>   	else
>   		lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
> +	lim = queue_limits_start_update(lo->lo_queue);
> +	loop_set_dma_limit(lo, &lim);
> +	queue_limits_commit_update(lo->lo_queue, &lim);
>   	blk_mq_unfreeze_queue(lo->lo_queue, memflags);
>   	return 0;
>   }

Otherwise looks good.

Reviewed-by: Hannes Reinecke <hare@kernel.org>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich

^ permalink raw reply

* Re: [PATCH v3 2/5] block: fix dio leak on metadata mapping error
From: Hannes Reinecke @ 2026-06-25  6:21 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
In-Reply-To: <20260624170905.3972095-3-kbusch@meta.com>

On 6/24/26 7:09 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> A failed integrity mapping holds a dio reference, so we need to go
> through the full bio ending in case there were previously submitted
> bio's in the sequence.
> 
> Fixes: 2729a60bbfb92 ("block: don't silently ignore metadata for sync read/write")
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   block/fops.c | 6 ++++--
>   1 file changed, 4 insertions(+), 2 deletions(-)
> 
Reviewed-by: Hannes Reinecke <hare@kernel.org>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich

^ permalink raw reply

* Re: [PATCH v3 1/5] block: use blkdev_iov_iter_get_pages status for errors
From: Hannes Reinecke @ 2026-06-25  6:20 UTC (permalink / raw)
  To: Keith Busch, linux-block, linux-fsdevel
  Cc: dm-devel, hch, axboe, brauner, djwong, viro, Keith Busch
In-Reply-To: <20260624170905.3972095-2-kbusch@meta.com>

On 6/24/26 7:09 PM, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> blkdev_iov_iter_get_pages() can return various error values, including
> EIO, EFAULT, and ENOMEM. Set the actual reported status so user space
> can know a little more on why an operation failed.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   block/fops.c | 2 +-
>   1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/block/fops.c b/block/fops.c
> index 15783a6180dec..0827bb884d473 100644
> --- a/block/fops.c
> +++ b/block/fops.c
> @@ -218,7 +218,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
>   
>   		ret = blkdev_iov_iter_get_pages(bio, iter, bdev);
>   		if (unlikely(ret)) {
> -			bio_endio_status(bio, BLK_STS_IOERR);
> +			bio_endio_status(bio, errno_to_blk_status(ret));
>   			break;
>   		}
>   		if (iocb->ki_flags & IOCB_NOWAIT) {

Reviewed-by: Hannes Reinecke <hare@kernel.org>

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich

^ permalink raw reply

* [PATCH] drbd: Fix potential NULL pointer dereference in _drbd_set_state()
From: Ваторопин Андрей @ 2026-06-25  5:03 UTC (permalink / raw)
  To: Philipp Reisner
  Cc: Ваторопин Андрей,
	Lars Ellenberg, Christoph Böhmwalder, Jens Axboe,
	Andreas Gruenbacher, drbd-dev@lists.linbit.com,
	linux-block@vger.kernel.org, linux-kernel@vger.kernel.org,
	lvc-project@linuxtesting.org, stable@vger.kernel.org

From: Andrey Vatoropin <a.vatoropin@crpt.ru>

The connection pointer receives a value in the _drbd_set_state()
function, including through a call to the first_peer_device() function.
This function returns a pointer to a list element. If the list is empty, it
returns a NULL pointer, which is later assigned to the connection
pointer. Subsequently, this pointer will be dereferenced.

Add a NULL check for the connection pointer to avoid dereferencing an
invalid pointer.

Found by Linux Verification Center (linuxtesting.org) with SVACE.
       
Fixes: a6b32bc3cebd ("drbd: Introduce "peer_device" object between "device" and "connection"")
Cc: stable@vger.kernel.org
Signed-off-by: Andrey Vatoropin <a.vatoropin@crpt.ru>
---
 drivers/block/drbd/drbd_state.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index adcba7f1d8ea..ea982d48017e 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1281,6 +1281,11 @@ _drbd_set_state(struct drbd_device *device, union drbd_state ns,
 	if (rv < SS_SUCCESS)
 		return rv;
 
+	if (!connection) {
+		drbd_err(device, "No connection to peer, aborting!\n");
+		return SS_ALREADY_STANDALONE;
+	}
+
 	if (!(flags & CS_HARD)) {
 		/*  pre-state-change checks ; only look at ns  */
 		/* See drbd_state_sw_errors in drbd_strings.c */
-- 
2.43.0

^ permalink raw reply related

* Re: [PATCH 2/2] block: handle REQ_OP_ZONE_APPEND in __bio_integrity_action
From: Caleb Sander Mateos @ 2026-06-25  4:07 UTC (permalink / raw)
  To: Martin K. Petersen; +Cc: Christoph Hellwig, Jens Axboe, linux-block
In-Reply-To: <yq1cxxfpfiw.fsf@ca-mkp.ca.oracle.com>

On Wed, Jun 24, 2026 at 7:29 PM Martin K. Petersen
<martin.petersen@oracle.com> wrote:
>
>
> Caleb,
>
> > Right, I don't mean partitions of zoned devices, but block devices in
> > general. I was just trying to understand why the remapping
> > infrastructure exists in the first place. Seems like we can't remove
> > it entirely, but we can at least ensure the ref tag seeds are correct
> > if it's skipped for a non-partitioned device.
>
> The entity attaching the PI to the I/O decides what the seed value
> should be.
>
> If you are an application preparing PI, you don't know which LBA a write
> is eventually going to end up at. You just know you are writing block 10
> inside your file. So you generate PI starting with a reference tag value
> of 10 because that is what makes sense to you. And thus you set the seed
> to 10 to tell the remapping code which initial reference tag value to
> expect in the prepared PI buffer.

Except currently the ref tag seed is set to sector 80 rather than block 10 :)

>
> Once the write hits the bottom of the stack, we know which initial
> reference tag the hardware expects. So we remap the reference tags in
> the PI buffer from whatever made sense to the application to whatever
> the hardware requires. I.e. an initial value of the lower 32 bits of the
> LBA for T10 PI Type 1, incremented by 1 for each subsequent protection
> interval.

Right, I understand this now. But as Christoph points out,
blk_integrity_prepare() is only called for REQ_OP_WRITE, not
REQ_OP_ZONE_APPEND. I don't think remapping is even possible for zone
appends, as the final LBA isn't known until the device completes the
operation. So I think we need to initialize the ref tag seed in units
of integrity intervals rather than sectors to ensure reading back the
data at an offset uses a ref tag for each block consistent with that
used in the zone append.

>
> For reads, it's the same thing. The application wants to read starting
> at block offset 10 inside the file so it sets the seed value to 10. At
> the bottom of the stack we know how to interpret the PI returned by the
> hardware. So we validate that the reference tags received from the
> controller match the lower 32 bits of the LBA or whatever the correct PI
> format is. And then we remap the reference tags in the received PI
> buffer, setting the reference tag to the requested value of 10 for the
> first block, and then incrementing by 1 for each subsequent protection
> interval.
>
> This allows the application to validate the received reference tags
> without ever knowing anything about which start LBA the I/O happened to
> come from.
>
> Both DIX and NVMe also allow the hardware to perform the remapping so
> the software remapping step can be skipped altogether. Christoph and I
> briefly talked about that last week. We currently don't take advantage
> of that capability in the NVMe driver.

As far as I'm aware, the SCSI driver doesn't use the ref tag seed
either. I see calls to t10_pi_ref_tag() and scsi_prot_ref_tag(), but
bip_get_seed() is only called inside the remapping code. Would indeed
be nice to take advantage of the hardware's ability to accept a ref
tag seed and skip the software remapping. It seems like initializing
the ref tag seed in units of integrity intervals rather than sectors
would be a pre-requisite to that, though. Let me know if I'm
misunderstanding something.

Best,
Caleb

^ permalink raw reply

* Re: [PATCH v2] scsi: bsg: read io_uring command fields once
From: Yang Xiuwei @ 2026-06-25  3:25 UTC (permalink / raw)
  To: James E . J . Bottomley, Martin K . Petersen
  Cc: Yang Xiuwei, Rahul Chandelkar, Jens Axboe, FUJITA Tomonori,
	linux-scsi, linux-block, io-uring, Bart Van Assche,
	Caleb Sander Mateos
In-Reply-To: <20260527191817.142769-1-rc@rexion.ai>

Hi James, Martin,

Friendly ping on v2 — anything else needed before pick-up?

Thanks,
Yang Xiuwei


^ permalink raw reply

* Re: [PATCH v3 1/7] list: Add mutable iterator variants
From: Kaitao Cheng @ 2026-06-25  3:01 UTC (permalink / raw)
  To: David Laight, Christian König, Jani Nikula,
	David Hildenbrand (Arm), Alexei Starovoitov
  Cc: Andrew Morton, David Hildenbrand, Jens Axboe, Tejun Heo,
	Alexander Viro, Christian Brauner, Daniel Borkmann,
	Andrii Nakryiko, Johannes Weiner, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Namhyung Kim, Thomas Gleixner,
	Juri Lelli, Vincent Guittot, Paul Moore, Andy Shevchenko,
	Paul E. McKenney, Shakeel Butt, David Howells, Simona Vetter,
	Randy Dunlap, Luca Ceresoli, Philipp Stanner, linux-block,
	linux-kernel, cgroups, linux-ntfs-dev, linux-fsdevel, io-uring,
	audit, bpf, netdev, dri-devel, linux-perf-users,
	linux-trace-kernel, kexec, live-patching, linux-modules,
	linux-crypto, linux-pm, rcu, sched-ext, linux-mm, virtualization,
	damon, llvm, Kaitao Cheng, Muchun Song
In-Reply-To: <20260624152324.3def88ce@pumpkin>

在 2026/6/24 22:23, David Laight 写道:
> On Wed, 24 Jun 2026 15:23:47 +0200
> Christian König <christian.koenig@amd.com> wrote:
>> On 6/24/26 15:14, Kaitao Cheng wrote:
>>> 在 2026/6/22 16:42, David Laight 写道:  
>>>> On Mon, 22 Jun 2026 12:05:31 +0800
>>>> Kaitao Cheng <kaitao.cheng@linux.dev> wrote:
>>>>  
>>>>> From: Kaitao Cheng <chengkaitao@kylinos.cn>
>>>>>
>>>>> The list_for_each*_safe() helpers are used when the loop body may
>>>>> remove the current entry.  Their API exposes the temporary cursor at
>>>>> every call site, even though most users only need it for the iterator
>>>>> implementation and never reference it in the loop body.
>>>>>
>>>>> Add *_mutable() variants for list and hlist iteration.  The new helpers
>>>>> support both forms: callers may keep passing an explicit temporary cursor
>>>>> when they need to inspect or reset it, or omit it and let the helper use
>>>>> a unique internal cursor.  
>>>>
>>>> I'm not really sure 'mutable' means anything either.
>>>> It is possible to make it valid for the loop body (or even other threads)
>>>> to delete arbitrary list items - but that needs significant extra overheads.
>>>>
>>>> It might be worth doing something that doesn't need the extra variable,
>>>> but there is little point doing all the churn just to rename things.
>>>>  
>>>>>
>>>>> This makes call sites that only mutate the list through the current entry
>>>>> less noisy, while keeping the existing *_safe() helpers available for
>>>>> compatibility.
>>>>>
>>>>> Signed-off-by: Kaitao Cheng <chengkaitao@kylinos.cn>
>>>>> ---
>>>>>  include/linux/list.h | 269 +++++++++++++++++++++++++++++++++++++------
>>>>>  1 file changed, 231 insertions(+), 38 deletions(-)
>>>>>
>>>>> diff --git a/include/linux/list.h b/include/linux/list.h
>>>>> index 09d979976b3b..1081def7cea9 100644
>>>>> --- a/include/linux/list.h
>>>>> +++ b/include/linux/list.h
>>>>> @@ -7,6 +7,7 @@
>>>>>  #include <linux/stddef.h>
>>>>>  #include <linux/poison.h>
>>>>>  #include <linux/const.h>
>>>>> +#include <linux/args.h>
>>>>>  
>>>>>  #include <asm/barrier.h>
>>>>>  
>>>>> @@ -763,28 +764,72 @@ static inline void list_splice_tail_init(struct list_head *list,
>>>>>  #define list_for_each_prev(pos, head) \
>>>>>  	for (pos = (head)->prev; !list_is_head(pos, (head)); pos = pos->prev)
>>>>>  
>>>>> -/**
>>>>> - * list_for_each_safe - iterate over a list safe against removal of list entry
>>>>> - * @pos:	the &struct list_head to use as a loop cursor.
>>>>> - * @n:		another &struct list_head to use as temporary storage
>>>>> - * @head:	the head for your list.
>>>>> +/*
>>>>> + * list_for_each_safe is an old interface, use list_for_each_mutable instead.
>>>>>   */
>>>>>  #define list_for_each_safe(pos, n, head) \
>>>>>  	for (pos = (head)->next, n = pos->next; \
>>>>>  	     !list_is_head(pos, (head)); \
>>>>>  	     pos = n, n = pos->next)
>>>>>  
>>>>> +#define __list_for_each_mutable_internal(pos, tmp, head)		\
>>>>> +	for (typeof(pos) tmp = (pos = (head)->next)->next;		\  
>>>>
>>>> Use auto
>>>>  
>>>>> +	     !list_is_head(pos, (head));				\
>>>>> +	     pos = tmp, tmp = pos->next)
>>>>> +
>>>>> +#define __list_for_each_mutable1(pos, head)				\
>>>>> +	__list_for_each_mutable_internal(pos, __UNIQUE_ID(next), head)
>>>>> +
>>>>> +#define __list_for_each_mutable2(pos, next, head)			\
>>>>> +	list_for_each_safe(pos, next, head)
>>>>> +
>>>>>  /**
>>>>> - * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry
>>>>> + * list_for_each_mutable - iterate over a list safe against entry removal
>>>>>   * @pos:	the &struct list_head to use as a loop cursor.
>>>>> - * @n:		another &struct list_head to use as temporary storage
>>>>> - * @head:	the head for your list.
>>>>> + * @...:	either (head) or (next, head)
>>>>> + *
>>>>> + * next:	another &struct list_head to use as optional temporary storage.
>>>>> + *		The temporary cursor is internal unless explicitly supplied by
>>>>> + *		the caller.
>>>>> + * head:	the head for your list.
>>>>> + */
>>>>> +#define list_for_each_mutable(pos, ...)					\
>>>>> +	CONCATENATE(__list_for_each_mutable, COUNT_ARGS(__VA_ARGS__))	\
>>>>> +		(pos, __VA_ARGS__)  
>>>>
>>>> The variable argument count logic really just slows down compilation.
>>>> Maybe there aren't enough copies of this code to make that significant.
>>>> But just because you can do it doesn't mean it is a gooD idea.
>>>> I'm also not sure it really adds anything to the readability.
>>>>
>>>> And, it you are going to make the middle argument optional there is
>>>> no need to change the macro name.  
>>>
>>> Christian König and Jani Nikula also disagree with the variadic-argument
>>> implementation approach. If we abandon that method, it means we will
>>> inevitably need to add some new macros. If mutable is not a good name,
>>> suggestions for better alternatives would be welcome; coming up with a
>>> suitable name is indeed rather tricky.  
>>
>> I don't think you need to add a new macro for the specific use case that people want to modify the next element of the iteration.
>>
>> If I remember your numbers correctly that is a really corner case and keeping using the existing *_safe() macros for that sounds perfectly fine to me.
> 
> IIRC currently you have a choice of either:
> 	define               Item that can't be deleted
> 	list_for_each()	     The current item.
> 	list_for_each_safe() The next item.
> There is also likely to be code that updates the variables to allow
> for other scenarios.
> 
> Note that if increase a reference count and release a lock then list_for_each()
> is likely safer than list_for_each_safe() :-)
> 
> list.h has 9 variants of the 'safe' loop.
> The bloat of another 9 is getting excessive.
> 
> It has to be said that this is one of my least favourite type of list...

Hi Christian König, David Laight, Jani Nikula, David Hildenbrand,
Andy Shevchenko, Alexei Starovoitov

For ease of discussion, I need to summarize the currently possible
approaches and briefly describe their respective pros and cons,
using the list_for_each_entry* interfaces as examples.

1. Add list_for_each_entry_mutable, while keeping list_for_each_entry
and list_for_each_entry_safe unchanged. list_for_each_entry_mutable
would be used specifically for safe deletion scenarios that do not
need to expose the temporary cursor externally. The code can refer to
the v1 version.

Pros: Does not depend on immediate per-subsystem adaptation and can be
      merged directly.
Cons: Requires adding a whole set of mutable interfaces, which makes the
      code somewhat redundant.

2. Directly optimize away the temporary cursor in list_for_each_entry_safe
and define it inside the loop instead, changing the interface from four
arguments to three.

Pros: Does not add redundant interfaces.
Cons: (1) Users need to manually update special cases that use the
      traversal variable of list_for_each_entry_safe, the new
      list_for_each_entry_safe would no longer apply there and would
      need to be open-coded.
      (2) Because the macro arguments changes, all list_for_each_entry_safe
      callers would need to be modified and merged together, making it
      difficult to merge such a large amount of code at once.

3. Use a variadic macro approach to optimize list_for_each_entry_safe,
so that it supports both three and four arguments.

Pros: (1) Does not add redundant interfaces.
      (2) Does not depend on immediate per-subsystem adaptation and can
      be merged directly.
Cons: (1) Increases compile time.
      (2) Makes the interface harder for users to use.

4. Optimize list_for_each_entry by defining the temporary cursor internally,
making it compatible with the functionality of list_for_each_entry_safe.
The code can refer to the v2 version.

Pros: (1) Does not add redundant interfaces.
      (2) The number of externally visible arguments of list_for_each_entry
      remains unchanged, still three.
Cons: (1) list_for_each_entry and list_for_each_entry_safe would be merged
      into one, and list_for_each_entry_safe would gradually be deprecated.
      (2) Users need to manually update special cases that use the traversal
      variable of list_for_each_entry, the new list_for_each_entry would no
      longer apply there and would need to be open-coded. There are 15 such
      cases in total.

5. Use a variadic macro approach to optimize list_for_each_entry, so that
it supports both three and four arguments.

Pros: (1) Does not add redundant interfaces.
      (2) Does not depend on immediate per-subsystem adaptation and can be
      merged directly.
Cons: (1) Increases compile time.
      (2) list_for_each_entry and list_for_each_entry_safe would be merged
      into one, and list_for_each_entry_safe would gradually be deprecated.

6. Make no changes, keep the current logic unchanged, and close the current
email discussion.


Which of the six solutions above do people prefer?

-- 
Thanks
Kaitao Cheng


^ permalink raw reply

* [PATCH v3 4/4] blk-cgroup: factor policy pd teardown loop into helper
From: Yu Kuai @ 2026-06-25  2:57 UTC (permalink / raw)
  To: Jens Axboe, Tejun Heo
  Cc: Josef Bacik, Zheng Qixing, Christoph Hellwig, Tang Yizhou,
	Yu Kuai, cgroups, linux-block, linux-kernel
In-Reply-To: <20260625025739.2459651-1-yukuai@kernel.org>

From: Zheng Qixing <zhengqixing@huawei.com>

Move the teardown sequence which offlines and frees per-policy
blkg_policy_data (pd) into a helper for readability.

No functional change intended.

Signed-off-by: Zheng Qixing <zhengqixing@huawei.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Tang Yizhou <yizhou.tang@shopee.com>
Signed-off-by: Yu Kuai <yukuai@fygo.io>
---
 block/blk-cgroup.c | 57 ++++++++++++++++++++++------------------------
 1 file changed, 27 insertions(+), 30 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index d06915045bc4..0b28420c108f 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1527,10 +1527,35 @@ struct cgroup_subsys io_cgrp_subsys = {
 	.depends_on = 1 << memory_cgrp_id,
 #endif
 };
 EXPORT_SYMBOL_GPL(io_cgrp_subsys);
 
+/*
+ * Tear down per-blkg policy data for @pol on @q.
+ */
+static void blkcg_policy_teardown_pds(struct request_queue *q,
+				      const struct blkcg_policy *pol)
+{
+	struct blkcg_gq *blkg;
+
+	list_for_each_entry(blkg, &q->blkg_list, q_node) {
+		struct blkcg *blkcg = blkg->blkcg;
+		struct blkg_policy_data *pd;
+
+		spin_lock(&blkcg->lock);
+		pd = blkg->pd[pol->plid];
+		if (pd) {
+			if (pd->online && pol->pd_offline_fn)
+				pol->pd_offline_fn(pd);
+			pd->online = false;
+			pol->pd_free_fn(pd);
+			WRITE_ONCE(blkg->pd[pol->plid], NULL);
+		}
+		spin_unlock(&blkcg->lock);
+	}
+}
+
 /**
  * blkcg_activate_policy - activate a blkcg policy on a gendisk
  * @disk: gendisk of interest
  * @pol: blkcg policy to activate
  *
@@ -1642,25 +1667,11 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
 	return ret;
 
 enomem:
 	/* alloc failed, take down everything */
 	spin_lock_irq(&q->queue_lock);
-	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		struct blkcg *blkcg = blkg->blkcg;
-		struct blkg_policy_data *pd;
-
-		spin_lock(&blkcg->lock);
-		pd = blkg->pd[pol->plid];
-		if (pd) {
-			if (pd->online && pol->pd_offline_fn)
-				pol->pd_offline_fn(pd);
-			pd->online = false;
-			pol->pd_free_fn(pd);
-			WRITE_ONCE(blkg->pd[pol->plid], NULL);
-		}
-		spin_unlock(&blkcg->lock);
-	}
+	blkcg_policy_teardown_pds(q, pol);
 	spin_unlock_irq(&q->queue_lock);
 	ret = -ENOMEM;
 	goto out;
 }
 EXPORT_SYMBOL_GPL(blkcg_activate_policy);
@@ -1675,11 +1686,10 @@ EXPORT_SYMBOL_GPL(blkcg_activate_policy);
  */
 void blkcg_deactivate_policy(struct gendisk *disk,
 			     const struct blkcg_policy *pol)
 {
 	struct request_queue *q = disk->queue;
-	struct blkcg_gq *blkg;
 	unsigned int memflags;
 
 	if (!blkcg_policy_enabled(q, pol))
 		return;
 
@@ -1688,24 +1698,11 @@ void blkcg_deactivate_policy(struct gendisk *disk,
 
 	mutex_lock(&q->blkcg_mutex);
 	spin_lock_irq(&q->queue_lock);
 
 	__clear_bit(pol->plid, q->blkcg_pols);
-
-	list_for_each_entry(blkg, &q->blkg_list, q_node) {
-		struct blkcg *blkcg = blkg->blkcg;
-
-		spin_lock(&blkcg->lock);
-		if (blkg->pd[pol->plid]) {
-			if (blkg->pd[pol->plid]->online && pol->pd_offline_fn)
-				pol->pd_offline_fn(blkg->pd[pol->plid]);
-			pol->pd_free_fn(blkg->pd[pol->plid]);
-			blkg->pd[pol->plid] = NULL;
-		}
-		spin_unlock(&blkcg->lock);
-	}
-
+	blkcg_policy_teardown_pds(q, pol);
 	spin_unlock_irq(&q->queue_lock);
 	mutex_unlock(&q->blkcg_mutex);
 
 	if (queue_is_mq(q))
 		blk_mq_unfreeze_queue(q, memflags);
-- 
2.51.0


^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox