linux-xfs.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Damien Le Moal <dlemoal@kernel.org>
To: Jens Axboe <axboe@kernel.dk>,
	linux-block@vger.kernel.org, linux-nvme@lists.infradead.org,
	Keith Busch <keith.busch@wdc.com>, Christoph Hellwig <hch@lst.de>,
	dm-devel@lists.linux.dev, Mike Snitzer <snitzer@kernel.org>,
	Mikulas Patocka <mpatocka@redhat.com>,
	"Martin K . Petersen" <martin.petersen@oracle.com>,
	linux-scsi@vger.kernel.org, linux-xfs@vger.kernel.org,
	Carlos Maiolino <cem@kernel.org>,
	linux-btrfs@vger.kernel.org, David Sterba <dsterba@suse.com>
Subject: [PATCH v2 06/15] block: use zone condition to determine conventional zones
Date: Mon,  3 Nov 2025 22:31:14 +0900	[thread overview]
Message-ID: <20251103133123.645038-7-dlemoal@kernel.org> (raw)
In-Reply-To: <20251103133123.645038-1-dlemoal@kernel.org>

The conv_zones_bitmap field of struct gendisk is used to define a bitmap
to identify the conventional zones of a zoned block device. The bit for
a zone is set in this bitmap if the zone is a conventional one, that is,
if the zone type is BLK_ZONE_TYPE_CONVENTIONAL. For such a zone, this
always corresponds to the zone condition BLK_ZONE_COND_NOT_WP.
In other words, conv_zones_bitmap tracks a single condition of the
zones of a zoned block device.

In preparation for tracking more zone conditions, change
conv_zones_bitmap into an array of zone conditions, using 1 byte per
zone. This increases the memory usage from 1 bit per zone to 1 byte per
zone, that is, from 16 KiB to about 100 KiB for a 30 TB SMR HDD with 256
MiB zones. This is a trade-off to allow fast cached report zones later
on top of this change.

Rename the conv_zones_bitmap field of struct gendisk to zones_cond. Add
a blk_revalidate_zone_cond() function to initialize the zones_cond array
of a disk during device scan and to update it on device revalidation.
Move the allocation of the zones_cond array to
disk_revalidate_zone_resources(), making sure that this array is always
allocated, even for devices that do not need zone write plugs (zone
resources), to ensure that bdev_zone_is_seq() can be re-implemented to
use the zone condition array in place of the conv zones bitmap.

Finally, the function bdev_zone_is_seq() is rewritten to use a test on
the condition of the target zone.

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 block/blk-zoned.c      | 153 +++++++++++++++++++++++++++++------------
 include/linux/blkdev.h |  37 +++-------
 2 files changed, 117 insertions(+), 73 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index d4fc87b0be6b..f62862274f9a 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -114,6 +114,33 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
 }
 EXPORT_SYMBOL_GPL(blk_zone_cond_str);
 
+/**
+ * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
+ * @bdev:       block device to check
+ * @sector:     sector number
+ *
+ * Check if @sector on @bdev is contained in a sequential write required zone.
+ */
+bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	unsigned int zno = disk_zone_no(disk, sector);
+	bool is_seq = false;
+	u8 *zones_cond;
+
+	if (!bdev_is_zoned(bdev))
+		return false;
+
+	rcu_read_lock();
+	zones_cond = rcu_dereference(disk->zones_cond);
+	if (zones_cond && zno < disk->nr_zones)
+		is_seq = zones_cond[zno] != BLK_ZONE_COND_NOT_WP;
+	rcu_read_unlock();
+
+	return is_seq;
+}
+EXPORT_SYMBOL_GPL(bdev_zone_is_seq);
+
 /*
  * Zone report arguments for block device drivers report_zones operation.
  * @cb: report_zones_cb callback for each reported zone.
@@ -1458,22 +1485,16 @@ static void disk_destroy_zone_wplugs_hash_table(struct gendisk *disk)
 	disk->zone_wplugs_hash_bits = 0;
 }
 
-static unsigned int disk_set_conv_zones_bitmap(struct gendisk *disk,
-					       unsigned long *bitmap)
+static void disk_set_zones_cond_array(struct gendisk *disk, u8 *zones_cond)
 {
-	unsigned int nr_conv_zones = 0;
 	unsigned long flags;
 
 	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
-	if (bitmap)
-		nr_conv_zones = bitmap_weight(bitmap, disk->nr_zones);
-	bitmap = rcu_replace_pointer(disk->conv_zones_bitmap, bitmap,
-				     lockdep_is_held(&disk->zone_wplugs_lock));
+	zones_cond = rcu_replace_pointer(disk->zones_cond, zones_cond,
+				lockdep_is_held(&disk->zone_wplugs_lock));
 	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
 
-	kfree_rcu_mightsleep(bitmap);
-
-	return nr_conv_zones;
+	kfree_rcu_mightsleep(zones_cond);
 }
 
 void disk_free_zone_resources(struct gendisk *disk)
@@ -1497,7 +1518,7 @@ void disk_free_zone_resources(struct gendisk *disk)
 	mempool_destroy(disk->zone_wplugs_pool);
 	disk->zone_wplugs_pool = NULL;
 
-	disk_set_conv_zones_bitmap(disk, NULL);
+	disk_set_zones_cond_array(disk, NULL);
 	disk->zone_capacity = 0;
 	disk->last_zone_capacity = 0;
 	disk->nr_zones = 0;
@@ -1516,12 +1537,31 @@ static inline bool disk_need_zone_resources(struct gendisk *disk)
 		queue_emulates_zone_append(disk->queue);
 }
 
+struct blk_revalidate_zone_args {
+	struct gendisk	*disk;
+	u8		*zones_cond;
+	unsigned int	nr_zones;
+	unsigned int	nr_conv_zones;
+	unsigned int	zone_capacity;
+	unsigned int	last_zone_capacity;
+	sector_t	sector;
+};
+
 static int disk_revalidate_zone_resources(struct gendisk *disk,
-					  unsigned int nr_zones)
+				struct blk_revalidate_zone_args *args)
 {
 	struct queue_limits *lim = &disk->queue->limits;
 	unsigned int pool_size;
 
+	args->disk = disk;
+	args->nr_zones =
+		DIV_ROUND_UP_ULL(get_capacity(disk), lim->chunk_sectors);
+
+	/* Cached zone conditions: 1 byte per zone */
+	args->zones_cond = kzalloc(args->nr_zones, GFP_NOIO);
+	if (!args->zones_cond)
+		return -ENOMEM;
+
 	if (!disk_need_zone_resources(disk))
 		return 0;
 
@@ -1531,7 +1571,8 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
 	 */
 	pool_size = max(lim->max_open_zones, lim->max_active_zones);
 	if (!pool_size)
-		pool_size = min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, nr_zones);
+		pool_size =
+			min(BLK_ZONE_WPLUG_DEFAULT_POOL_SIZE, args->nr_zones);
 
 	if (!disk->zone_wplugs_hash)
 		return disk_alloc_zone_resources(disk, pool_size);
@@ -1539,15 +1580,6 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
 	return 0;
 }
 
-struct blk_revalidate_zone_args {
-	struct gendisk	*disk;
-	unsigned long	*conv_zones_bitmap;
-	unsigned int	nr_zones;
-	unsigned int	zone_capacity;
-	unsigned int	last_zone_capacity;
-	sector_t	sector;
-};
-
 /*
  * Update the disk zone resources information and device queue limits.
  * The disk queue is frozen when this is executed.
@@ -1556,7 +1588,7 @@ static int disk_update_zone_resources(struct gendisk *disk,
 				      struct blk_revalidate_zone_args *args)
 {
 	struct request_queue *q = disk->queue;
-	unsigned int nr_seq_zones, nr_conv_zones;
+	unsigned int nr_seq_zones;
 	unsigned int pool_size, memflags;
 	struct queue_limits lim;
 	int ret = 0;
@@ -1566,24 +1598,24 @@ static int disk_update_zone_resources(struct gendisk *disk,
 	memflags = blk_mq_freeze_queue(q);
 
 	disk->nr_zones = args->nr_zones;
-	disk->zone_capacity = args->zone_capacity;
-	disk->last_zone_capacity = args->last_zone_capacity;
-	nr_conv_zones =
-		disk_set_conv_zones_bitmap(disk, args->conv_zones_bitmap);
-	if (nr_conv_zones >= disk->nr_zones) {
+	if (args->nr_conv_zones >= disk->nr_zones) {
 		pr_warn("%s: Invalid number of conventional zones %u / %u\n",
-			disk->disk_name, nr_conv_zones, disk->nr_zones);
+			disk->disk_name, args->nr_conv_zones, disk->nr_zones);
 		ret = -ENODEV;
 		goto unfreeze;
 	}
 
+	disk->zone_capacity = args->zone_capacity;
+	disk->last_zone_capacity = args->last_zone_capacity;
+	disk_set_zones_cond_array(disk, args->zones_cond);
+
 	/*
-	 * Some devices can advertize zone resource limits that are larger than
+	 * Some devices can advertise zone resource limits that are larger than
 	 * the number of sequential zones of the zoned block device, e.g. a
 	 * small ZNS namespace. For such case, assume that the zoned device has
 	 * no zone resource limits.
 	 */
-	nr_seq_zones = disk->nr_zones - nr_conv_zones;
+	nr_seq_zones = disk->nr_zones - args->nr_conv_zones;
 	if (lim.max_open_zones >= nr_seq_zones)
 		lim.max_open_zones = 0;
 	if (lim.max_active_zones >= nr_seq_zones)
@@ -1624,6 +1656,44 @@ static int disk_update_zone_resources(struct gendisk *disk,
 	return ret;
 }
 
+static int blk_revalidate_zone_cond(struct blk_zone *zone, unsigned int idx,
+				    struct blk_revalidate_zone_args *args)
+{
+	enum blk_zone_cond cond = zone->cond;
+
+	/* Check that the zone condition is consistent with the zone type. */
+	switch (cond) {
+	case BLK_ZONE_COND_NOT_WP:
+		if (zone->type != BLK_ZONE_TYPE_CONVENTIONAL)
+			goto invalid_condition;
+		break;
+	case BLK_ZONE_COND_IMP_OPEN:
+	case BLK_ZONE_COND_EXP_OPEN:
+	case BLK_ZONE_COND_CLOSED:
+	case BLK_ZONE_COND_EMPTY:
+	case BLK_ZONE_COND_FULL:
+	case BLK_ZONE_COND_OFFLINE:
+	case BLK_ZONE_COND_READONLY:
+		if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
+			goto invalid_condition;
+		break;
+	default:
+		pr_warn("%s: Invalid zone condition 0x%X\n",
+			args->disk->disk_name, cond);
+		return -ENODEV;
+	}
+
+	args->zones_cond[idx] = cond;
+
+	return 0;
+
+invalid_condition:
+	pr_warn("%s: Invalid zone condition 0x%x for type 0x%x\n",
+		args->disk->disk_name, cond, zone->type);
+
+	return -ENODEV;
+}
+
 static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
 				    struct blk_revalidate_zone_args *args)
 {
@@ -1638,17 +1708,7 @@ static int blk_revalidate_conv_zone(struct blk_zone *zone, unsigned int idx,
 	if (disk_zone_is_last(disk, zone))
 		args->last_zone_capacity = zone->capacity;
 
-	if (!disk_need_zone_resources(disk))
-		return 0;
-
-	if (!args->conv_zones_bitmap) {
-		args->conv_zones_bitmap =
-			bitmap_zalloc(args->nr_zones, GFP_NOIO);
-		if (!args->conv_zones_bitmap)
-			return -ENOMEM;
-	}
-
-	set_bit(idx, args->conv_zones_bitmap);
+	args->nr_conv_zones++;
 
 	return 0;
 }
@@ -1746,6 +1806,11 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
 		return -ENODEV;
 	}
 
+	/* Check zone condition */
+	ret = blk_revalidate_zone_cond(zone, idx, args);
+	if (ret)
+		return ret;
+
 	/* Check zone type */
 	switch (zone->type) {
 	case BLK_ZONE_TYPE_CONVENTIONAL:
@@ -1813,10 +1878,8 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
 	 * Ensure that all memory allocations in this context are done as if
 	 * GFP_NOIO was specified.
 	 */
-	args.disk = disk;
-	args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
 	noio_flag = memalloc_noio_save();
-	ret = disk_revalidate_zone_resources(disk, args.nr_zones);
+	ret = disk_revalidate_zone_resources(disk, &args);
 	if (ret) {
 		memalloc_noio_restore(noio_flag);
 		return ret;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2f75fb15f55f..53bcfbc2f68f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -196,7 +196,7 @@ struct gendisk {
 	unsigned int		nr_zones;
 	unsigned int		zone_capacity;
 	unsigned int		last_zone_capacity;
-	unsigned long __rcu	*conv_zones_bitmap;
+	u8 __rcu		*zones_cond;
 	unsigned int		zone_wplugs_hash_bits;
 	atomic_t		nr_zone_wplugs;
 	spinlock_t		zone_wplugs_lock;
@@ -925,12 +925,20 @@ static inline unsigned int bdev_zone_capacity(struct block_device *bdev,
 {
 	return disk_zone_capacity(bdev->bd_disk, pos);
 }
+
+bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector);
+
 #else /* CONFIG_BLK_DEV_ZONED */
 static inline unsigned int disk_nr_zones(struct gendisk *disk)
 {
 	return 0;
 }
 
+static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
+{
+	return false;
+}
+
 static inline bool bio_needs_zone_write_plugging(struct bio *bio)
 {
 	return false;
@@ -1533,33 +1541,6 @@ static inline bool bdev_is_zone_aligned(struct block_device *bdev,
 	return bdev_is_zone_start(bdev, sector);
 }
 
-/**
- * bdev_zone_is_seq - check if a sector belongs to a sequential write zone
- * @bdev:	block device to check
- * @sector:	sector number
- *
- * Check if @sector on @bdev is contained in a sequential write required zone.
- */
-static inline bool bdev_zone_is_seq(struct block_device *bdev, sector_t sector)
-{
-	bool is_seq = false;
-
-#if IS_ENABLED(CONFIG_BLK_DEV_ZONED)
-	if (bdev_is_zoned(bdev)) {
-		struct gendisk *disk = bdev->bd_disk;
-		unsigned long *bitmap;
-
-		rcu_read_lock();
-		bitmap = rcu_dereference(disk->conv_zones_bitmap);
-		is_seq = !bitmap ||
-			!test_bit(disk_zone_no(disk, sector), bitmap);
-		rcu_read_unlock();
-	}
-#endif
-
-	return is_seq;
-}
-
 int blk_zone_issue_zeroout(struct block_device *bdev, sector_t sector,
 			   sector_t nr_sects, gfp_t gfp_mask);
 
-- 
2.51.0


  parent reply	other threads:[~2025-11-03 13:35 UTC|newest]

Thread overview: 41+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2025-11-03 13:31 [PATCH v2 00/15] Introduce cached report zones Damien Le Moal
2025-11-03 13:31 ` [PATCH v2 01/15] block: handle zone management operations completions Damien Le Moal
2025-11-03 13:53   ` Johannes Thumshirn
2025-11-03 13:31 ` [PATCH v2 02/15] block: freeze queue when updating zone resources Damien Le Moal
2025-11-03 13:46   ` Christoph Hellwig
2025-11-03 13:55   ` Johannes Thumshirn
2025-11-03 13:31 ` [PATCH v2 03/15] block: cleanup blkdev_report_zones() Damien Le Moal
2025-11-03 13:56   ` Johannes Thumshirn
2025-11-03 13:31 ` [PATCH v2 04/15] block: introduce disk_report_zone() Damien Le Moal
2025-11-03 14:01   ` Johannes Thumshirn
2025-11-04  0:36   ` kernel test robot
2025-11-03 13:31 ` [PATCH v2 05/15] block: reorganize struct blk_zone_wplug Damien Le Moal
2025-11-03 14:01   ` Johannes Thumshirn
2025-11-03 13:31 ` Damien Le Moal [this message]
2025-11-03 14:52   ` [PATCH v2 06/15] block: use zone condition to determine conventional zones Johannes Thumshirn
2025-11-03 13:31 ` [PATCH v2 07/15] block: track zone conditions Damien Le Moal
2025-11-03 15:00   ` Johannes Thumshirn
2025-11-03 13:31 ` [PATCH v2 08/15] block: refactor blkdev_report_zones() code Damien Le Moal
2025-11-03 13:47   ` Christoph Hellwig
2025-11-03 15:01   ` Johannes Thumshirn
2025-11-03 13:31 ` [PATCH v2 09/15] block: introduce blkdev_get_zone_info() Damien Le Moal
2025-11-03 15:12   ` Johannes Thumshirn
2025-11-03 13:31 ` [PATCH v2 10/15] block: introduce blkdev_report_zones_cached() Damien Le Moal
2025-11-03 13:31 ` [PATCH v2 11/15] block: introduce BLKREPORTZONESV2 ioctl Damien Le Moal
2025-11-03 15:17   ` Johannes Thumshirn
2025-11-03 22:12     ` Bart Van Assche
2025-11-03 23:01       ` Damien Le Moal
2025-11-04  0:15     ` Damien Le Moal
2025-11-04  1:01       ` Bart Van Assche
2025-11-04  1:20         ` Damien Le Moal
2025-11-04  7:23       ` Johannes Thumshirn
2025-11-04  7:38         ` Damien Le Moal
2025-11-03 13:31 ` [PATCH v2 12/15] block: improve zone_wplugs debugfs attribute output Damien Le Moal
2025-11-03 13:47   ` Christoph Hellwig
2025-11-03 15:18   ` Johannes Thumshirn
2025-11-03 13:31 ` [PATCH v2 13/15] block: add zone write plug condition to debugfs zone_wplugs Damien Le Moal
2025-11-03 15:23   ` Johannes Thumshirn
2025-11-03 13:31 ` [PATCH v2 14/15] btrfs: use blkdev_report_zones_cached() Damien Le Moal
2025-11-03 15:26   ` Johannes Thumshirn
2025-11-03 13:31 ` [PATCH v2 15/15] xfs: " Damien Le Moal
2025-11-03 15:27   ` Johannes Thumshirn

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20251103133123.645038-7-dlemoal@kernel.org \
    --to=dlemoal@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=cem@kernel.org \
    --cc=dm-devel@lists.linux.dev \
    --cc=dsterba@suse.com \
    --cc=hch@lst.de \
    --cc=keith.busch@wdc.com \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-btrfs@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=linux-xfs@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=mpatocka@redhat.com \
    --cc=snitzer@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).