Linux block layer
 help / color / mirror / Atom feed
* [PATCH] block: Split bios in LBA order
@ 2025-05-12 22:56 Bart Van Assche
  2025-05-13  6:44 ` Christoph Hellwig
  0 siblings, 1 reply; 5+ messages in thread
From: Bart Van Assche @ 2025-05-12 22:56 UTC (permalink / raw)
  To: Jens Axboe
  Cc: linux-block, Christoph Hellwig, Bart Van Assche, Damien Le Moal,
	Ming Lei

The block layer submits bio fragments in opposite LBA order. Fix this as
follows:
- Introduce a new function bio_split_to_limits_and_submit() that has the
  same behavior as the existing bio_split_to_limits() function. This
  involves splitting a bio and submitting the fragment with the highest
  LBA by calling submit_bio_noacct().
- Use the new function bio_split_to_limits_and_submit() in all drivers
  that are fine with submitting split bios in opposite LBA order.
- Modify bio_split_to_limits() such that it returns two bio pointers
  instead of submitting one of the two bio fragments in case a bio has
  been split.
- Modify blk_mq_submit_bio() and dm_split_and_process_bio() such that
  bio fragments are submitted in LBA order.

This patch fixes unaligned write errors that I encountered with F2FS
submitting zoned writes to a dm driver stacked on top of a zoned UFS
device.

Cc: Christoph Hellwig <hch@lst.de>
Cc: Damien Le Moal <dlemoal@kernel.org>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Bart Van Assche <bvanassche@acm.org>
---
 block/blk-merge.c             | 81 +++++++++++++++++++++++++----------
 block/blk-mq.c                | 27 ++++++++----
 block/blk.h                   | 26 ++++++-----
 drivers/block/drbd/drbd_req.c |  2 +-
 drivers/block/pktcdvd.c       |  2 +-
 drivers/md/dm.c               | 14 ++++--
 drivers/md/md.c               |  2 +-
 drivers/nvme/host/multipath.c |  2 +-
 include/linux/blkdev.h        |  3 +-
 9 files changed, 108 insertions(+), 51 deletions(-)

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 782308b73b53..1076e5e1fa9c 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -105,8 +105,15 @@ static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
 	return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
 }
 
-static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
+/*
+ * Split a bio and prepare the bio that has been split off for submission.
+ * Returns %NULL upon error or the prefix bio upon success and stores a pointer
+ * to the suffix in *@bio_ptr.
+ */
+static struct bio *bio_submit_split(struct bio **bio_ptr, int split_sectors)
 {
+	struct bio *bio = *bio_ptr;
+
 	if (unlikely(split_sectors < 0))
 		goto error;
 
@@ -124,8 +131,9 @@ static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
 		bio_chain(split, bio);
 		trace_block_split(split, bio->bi_iter.bi_sector);
 		WARN_ON_ONCE(bio_zone_write_plugging(bio));
-		submit_bio_noacct(bio);
 		return split;
+	} else {
+		*bio_ptr = NULL;
 	}
 
 	return bio;
@@ -135,7 +143,7 @@ static struct bio *bio_submit_split(struct bio *bio, int split_sectors)
 	return NULL;
 }
 
-struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
+struct bio *bio_split_discard(struct bio **bio, const struct queue_limits *lim,
 		unsigned *nsegs)
 {
 	unsigned int max_discard_sectors, granularity;
@@ -149,11 +157,13 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
 	max_discard_sectors =
 		min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
 	max_discard_sectors -= max_discard_sectors % granularity;
-	if (unlikely(!max_discard_sectors))
-		return bio;
+	if (unlikely(!max_discard_sectors) ||
+	    bio_sectors(*bio) <= max_discard_sectors) {
+		struct bio *orig_bio = *bio;
 
-	if (bio_sectors(bio) <= max_discard_sectors)
-		return bio;
+		*bio = NULL;
+		return orig_bio;
+	}
 
 	split_sectors = max_discard_sectors;
 
@@ -161,7 +171,7 @@ struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
 	 * If the next starting sector would be misaligned, stop the discard at
 	 * the previous aligned sector.
 	 */
-	tmp = bio->bi_iter.bi_sector + split_sectors -
+	tmp = (*bio)->bi_iter.bi_sector + split_sectors -
 		((lim->discard_alignment >> 9) % granularity);
 	tmp = sector_div(tmp, granularity);
 
@@ -374,12 +384,12 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
 }
 EXPORT_SYMBOL_GPL(bio_split_rw_at);
 
-struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
+struct bio *bio_split_rw(struct bio **bio, const struct queue_limits *lim,
 		unsigned *nr_segs)
 {
 	return bio_submit_split(bio,
-		bio_split_rw_at(bio, lim, nr_segs,
-			get_max_io_size(bio, lim) << SECTOR_SHIFT));
+		bio_split_rw_at(*bio, lim, nr_segs,
+			get_max_io_size(*bio, lim) << SECTOR_SHIFT));
 }
 
 /*
@@ -389,22 +399,22 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
  * a good sanity check that the submitter built the bio correctly is nice to
  * have as well.
  */
-struct bio *bio_split_zone_append(struct bio *bio,
+struct bio *bio_split_zone_append(struct bio **bio,
 		const struct queue_limits *lim, unsigned *nr_segs)
 {
 	int split_sectors;
 
-	split_sectors = bio_split_rw_at(bio, lim, nr_segs,
+	split_sectors = bio_split_rw_at(*bio, lim, nr_segs,
 			lim->max_zone_append_sectors << SECTOR_SHIFT);
 	if (WARN_ON_ONCE(split_sectors > 0))
 		split_sectors = -EINVAL;
 	return bio_submit_split(bio, split_sectors);
 }
 
-struct bio *bio_split_write_zeroes(struct bio *bio,
+struct bio *bio_split_write_zeroes(struct bio **bio,
 		const struct queue_limits *lim, unsigned *nsegs)
 {
-	unsigned int max_sectors = get_max_io_size(bio, lim);
+	unsigned int max_sectors = get_max_io_size(*bio, lim);
 
 	*nsegs = 0;
 
@@ -414,15 +424,38 @@ struct bio *bio_split_write_zeroes(struct bio *bio,
 	 * I/O completion handler, and we can race and see this.  Splitting to a
 	 * zero limit obviously doesn't make sense, so band-aid it here.
 	 */
-	if (!max_sectors)
-		return bio;
-	if (bio_sectors(bio) <= max_sectors)
-		return bio;
+	if (!max_sectors || bio_sectors(*bio) <= max_sectors) {
+		struct bio *orig_bio = *bio;
+
+		*bio = NULL;
+		return orig_bio;
+	}
 	return bio_submit_split(bio, max_sectors);
 }
 
 /**
  * bio_split_to_limits - split a bio to fit the queue limits
+ * @bio:     pointer to the bio to be split
+ *
+ * Check if *@bio needs splitting based on the queue limits of (*@bio)->bi_bdev,
+ * and if so split off a bio fitting the limits from the beginning of *@bio and
+ * return it.  *@bio is shortened to the remainder if the bio has been split.
+ * *@bio is cleared if splitting is not required.
+ *
+ * The split bio is allocated from @q->bio_split, which is provided by the
+ * block layer.
+ */
+struct bio *bio_split_to_limits(struct bio **bio)
+{
+	unsigned int nr_segs;
+
+	return __bio_split_to_limits(bio, bdev_limits((*bio)->bi_bdev),
+				     &nr_segs);
+}
+EXPORT_SYMBOL(bio_split_to_limits);
+
+/**
+ * bio_split_to_limits_and_submit - split a bio to fit the queue limits
  * @bio:     bio to be split
  *
  * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
@@ -432,13 +465,15 @@ struct bio *bio_split_write_zeroes(struct bio *bio,
  * The split bio is allocated from @q->bio_split, which is provided by the
  * block layer.
  */
-struct bio *bio_split_to_limits(struct bio *bio)
+struct bio *bio_split_to_limits_and_submit(struct bio *bio)
 {
-	unsigned int nr_segs;
+	struct bio *split = bio_split_to_limits(&bio);
 
-	return __bio_split_to_limits(bio, bdev_limits(bio->bi_bdev), &nr_segs);
+	if (split && bio)
+		submit_bio_noacct(bio);
+	return split;
 }
-EXPORT_SYMBOL(bio_split_to_limits);
+EXPORT_SYMBOL(bio_split_to_limits_and_submit);
 
 unsigned int blk_recalc_rq_segments(struct request *rq)
 {
diff --git a/block/blk-mq.c b/block/blk-mq.c
index cbc9a9f97a31..d9a84d0282ae 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3122,6 +3122,7 @@ void blk_mq_submit_bio(struct bio *bio)
 	struct blk_plug *plug = current->plug;
 	const int is_sync = op_is_sync(bio->bi_opf);
 	struct blk_mq_hw_ctx *hctx;
+	struct bio *remainder = NULL;
 	unsigned int nr_segs;
 	struct request *rq;
 	blk_status_t ret;
@@ -3169,18 +3170,19 @@ void blk_mq_submit_bio(struct bio *bio)
 		goto queue_exit;
 	}
 
-	bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+	remainder = bio;
+	bio = __bio_split_to_limits(&remainder, &q->limits, &nr_segs);
 	if (!bio)
 		goto queue_exit;
 
 	if (!bio_integrity_prep(bio))
-		goto queue_exit;
+		goto submit_remainder_and_exit;
 
 	if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
-		goto queue_exit;
+		goto submit_remainder_and_exit;
 
 	if (blk_queue_is_zoned(q) && blk_zone_plug_bio(bio, nr_segs))
-		goto queue_exit;
+		goto submit_remainder_and_exit;
 
 new_request:
 	if (rq) {
@@ -3190,7 +3192,7 @@ void blk_mq_submit_bio(struct bio *bio)
 		if (unlikely(!rq)) {
 			if (bio->bi_opf & REQ_NOWAIT)
 				bio_wouldblock_error(bio);
-			goto queue_exit;
+			goto submit_remainder_and_exit;
 		}
 	}
 
@@ -3205,18 +3207,18 @@ void blk_mq_submit_bio(struct bio *bio)
 		bio->bi_status = ret;
 		bio_endio(bio);
 		blk_mq_free_request(rq);
-		return;
+		goto submit_remainder;
 	}
 
 	if (bio_zone_write_plugging(bio))
 		blk_zone_write_plug_init_request(rq);
 
 	if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
-		return;
+		goto submit_remainder;
 
 	if (plug) {
 		blk_add_rq_to_plug(plug, rq);
-		return;
+		goto submit_remainder;
 	}
 
 	hctx = rq->mq_hctx;
@@ -3227,8 +3229,17 @@ void blk_mq_submit_bio(struct bio *bio)
 	} else {
 		blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
 	}
+
+submit_remainder:
+	if (remainder)
+		submit_bio_noacct(remainder);
+
 	return;
 
+submit_remainder_and_exit:
+	if (remainder)
+		submit_bio_noacct(remainder);
+
 queue_exit:
 	/*
 	 * Don't drop the queue reference if we were trying to use a cached
diff --git a/block/blk.h b/block/blk.h
index 665b3d1fb504..31fd9a2208a0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -340,13 +340,13 @@ ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
 ssize_t part_timeout_store(struct device *, struct device_attribute *,
 				const char *, size_t);
 
-struct bio *bio_split_discard(struct bio *bio, const struct queue_limits *lim,
+struct bio *bio_split_discard(struct bio **bio, const struct queue_limits *lim,
 		unsigned *nsegs);
-struct bio *bio_split_write_zeroes(struct bio *bio,
+struct bio *bio_split_write_zeroes(struct bio **bio,
 		const struct queue_limits *lim, unsigned *nsegs);
-struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
+struct bio *bio_split_rw(struct bio **bio, const struct queue_limits *lim,
 		unsigned *nr_segs);
-struct bio *bio_split_zone_append(struct bio *bio,
+struct bio *bio_split_zone_append(struct bio **bio,
 		const struct queue_limits *lim, unsigned *nr_segs);
 
 /*
@@ -370,27 +370,30 @@ static inline bool bio_may_need_split(struct bio *bio,
 
 /**
  * __bio_split_to_limits - split a bio to fit the queue limits
- * @bio:     bio to be split
+ * @bio:     pointer to the bio to be split
  * @lim:     queue limits to split based on
  * @nr_segs: returns the number of segments in the returned bio
  *
  * Check if @bio needs splitting based on the queue limits, and if so split off
  * a bio fitting the limits from the beginning of @bio and return it.  @bio is
- * shortened to the remainder and re-submitted.
+ * shortened to the remainder and stored in *@bio.
  *
  * The split bio is allocated from @q->bio_split, which is provided by the
  * block layer.
  */
-static inline struct bio *__bio_split_to_limits(struct bio *bio,
+static inline struct bio *__bio_split_to_limits(struct bio **bio,
 		const struct queue_limits *lim, unsigned int *nr_segs)
 {
-	switch (bio_op(bio)) {
+	struct bio *orig_bio = *bio;
+
+	switch (bio_op(*bio)) {
 	case REQ_OP_READ:
 	case REQ_OP_WRITE:
-		if (bio_may_need_split(bio, lim))
+		if (bio_may_need_split(*bio, lim))
 			return bio_split_rw(bio, lim, nr_segs);
 		*nr_segs = 1;
-		return bio;
+		*bio = NULL;
+		return orig_bio;
 	case REQ_OP_ZONE_APPEND:
 		return bio_split_zone_append(bio, lim, nr_segs);
 	case REQ_OP_DISCARD:
@@ -401,7 +404,8 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio,
 	default:
 		/* other operations can't be split */
 		*nr_segs = 0;
-		return bio;
+		*bio = NULL;
+		return orig_bio;
 	}
 }
 
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 380e6584a4ee..1076fa616a25 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1612,7 +1612,7 @@ void drbd_submit_bio(struct bio *bio)
 {
 	struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
 
-	bio = bio_split_to_limits(bio);
+	bio = bio_split_to_limits_and_submit(bio);
 	if (!bio)
 		return;
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index d5cc7bd2875c..e165ce11dce2 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2431,7 +2431,7 @@ static void pkt_submit_bio(struct bio *bio)
 	struct device *ddev = disk_to_dev(pd->disk);
 	struct bio *split;
 
-	bio = bio_split_to_limits(bio);
+	bio = bio_split_to_limits_and_submit(bio);
 	if (!bio)
 		return;
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5ab7574c0c76..2974e95f671a 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1939,6 +1939,7 @@ static blk_status_t __send_zone_reset_all(struct clone_info *ci)
 static void dm_split_and_process_bio(struct mapped_device *md,
 				     struct dm_table *map, struct bio *bio)
 {
+	struct bio *remainder = NULL;
 	struct clone_info ci;
 	struct dm_io *io;
 	blk_status_t error = BLK_STS_OK;
@@ -1961,7 +1962,8 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 		 * emulation to ensure that the BIO does not cross zone
 		 * boundaries.
 		 */
-		bio = bio_split_to_limits(bio);
+		remainder = bio;
+		bio = bio_split_to_limits(&remainder);
 		if (!bio)
 			return;
 	}
@@ -1971,7 +1973,7 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 	 * need zone append emulation (e.g. dm-crypt).
 	 */
 	if (static_branch_unlikely(&zoned_enabled) && dm_zone_plug_bio(md, bio))
-		return;
+		goto submit_remainder;
 
 	/* Only support nowait for normal IO */
 	if (unlikely(bio->bi_opf & REQ_NOWAIT) && !is_abnormal) {
@@ -1982,13 +1984,13 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 		 */
 		if (bio->bi_opf & REQ_PREFLUSH) {
 			bio_wouldblock_error(bio);
-			return;
+			goto submit_remainder;
 		}
 		io = alloc_io(md, bio, GFP_NOWAIT);
 		if (unlikely(!io)) {
 			/* Unable to do anything without dm_io. */
 			bio_wouldblock_error(bio);
-			return;
+			goto submit_remainder;
 		}
 	} else {
 		io = alloc_io(md, bio, GFP_NOIO);
@@ -2036,6 +2038,10 @@ static void dm_split_and_process_bio(struct mapped_device *md,
 		dm_io_dec_pending(io, error);
 	} else
 		dm_queue_poll_io(bio, io);
+
+submit_remainder:
+	if (remainder)
+		submit_bio_noacct(remainder);
 }
 
 static void dm_submit_bio(struct bio *bio)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 9daa78c5fe33..40c20398a9cc 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -418,7 +418,7 @@ static void md_submit_bio(struct bio *bio)
 		return;
 	}
 
-	bio = bio_split_to_limits(bio);
+	bio = bio_split_to_limits_and_submit(bio);
 	if (!bio)
 		return;
 
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 250f3da67cc9..d84cf3e3f979 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -457,7 +457,7 @@ static void nvme_ns_head_submit_bio(struct bio *bio)
 	 * different queue via blk_steal_bios(), so we need to use the bio_split
 	 * pool from the original queue to allocate the bvecs from.
 	 */
-	bio = bio_split_to_limits(bio);
+	bio = bio_split_to_limits_and_submit(bio);
 	if (!bio)
 		return;
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6ebd8e7f3ac9..1d8267f389bb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -934,7 +934,8 @@ void blk_request_module(dev_t devt);
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 void submit_bio_noacct(struct bio *bio);
-struct bio *bio_split_to_limits(struct bio *bio);
+struct bio *bio_split_to_limits(struct bio **bio);
+struct bio *bio_split_to_limits_and_submit(struct bio *bio);
 
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags);

^ permalink raw reply related	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2025-05-14  5:29 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-05-12 22:56 [PATCH] block: Split bios in LBA order Bart Van Assche
2025-05-13  6:44 ` Christoph Hellwig
2025-05-13  7:28   ` Yu Kuai
2025-05-13 20:36   ` Bart Van Assche
2025-05-14  5:29     ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox