Flexible I/O Tester development
 help / color / mirror / Atom feed
* [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES
@ 2020-04-30 12:40 Alexey Dobriyan
  2020-04-30 12:40 ` [PATCH 2/7] zbd: don't lock zones outside working area Alexey Dobriyan
                   ` (6 more replies)
  0 siblings, 7 replies; 22+ messages in thread
From: Alexey Dobriyan @ 2020-04-30 12:40 UTC (permalink / raw)
  To: axboe; +Cc: adobriyan, damien.lemoal, fio

128 open zones is not enough for everyone!

Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
---
 zbd_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/zbd_types.h b/zbd_types.h
index 2f2f1324..d63c0d0a 100644
--- a/zbd_types.h
+++ b/zbd_types.h
@@ -8,7 +8,7 @@
 
 #include <inttypes.h>
 
-#define ZBD_MAX_OPEN_ZONES	128
+#define ZBD_MAX_OPEN_ZONES	4096
 
 /*
  * Zoned block device models.
-- 
2.26.2



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 2/7] zbd: don't lock zones outside working area
  2020-04-30 12:40 [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Alexey Dobriyan
@ 2020-04-30 12:40 ` Alexey Dobriyan
  2020-05-01  1:27   ` Damien Le Moal
  2020-04-30 12:40 ` [PATCH 3/7] zbd: introduce per-device "max_open_zones" limit Alexey Dobriyan
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 22+ messages in thread
From: Alexey Dobriyan @ 2020-04-30 12:40 UTC (permalink / raw)
  To: axboe; +Cc: adobriyan, damien.lemoal, fio

Currently threads tap into each other's zones even if their working areas
as defined by [f->file_offset, f->file_offset + f->io_size) don't intersect.

This should remove unnecessary quiescing.

Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
---
 file.h |  3 +++
 zbd.c  | 35 +++++++++++++++++++++++++----------
 2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/file.h b/file.h
index ae0e6fc8..375bbfd3 100644
--- a/file.h
+++ b/file.h
@@ -104,6 +104,9 @@ struct fio_file {
 	 * Zoned block device information. See also zonemode=zbd.
 	 */
 	struct zoned_block_device_info *zbd_info;
+	/* zonemode=zbd working area */
+	uint32_t min_zone;	/* inclusive */
+	uint32_t max_zone;	/* exclusive */
 
 	/*
 	 * Track last end and last start of IO for a given data direction
diff --git a/zbd.c b/zbd.c
index 9f5a89ed..e8f0a4d8 100644
--- a/zbd.c
+++ b/zbd.c
@@ -156,8 +156,14 @@ static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
 		z->wp + required > z->start + f->zbd_info->zone_size;
 }
 
-static void zone_lock(struct thread_data *td, struct fio_zone_info *z)
+static void zone_lock(struct thread_data *td, struct fio_file *f, struct fio_zone_info *z)
 {
+	struct zoned_block_device_info *zbd = f->zbd_info;
+	uint32_t nz = z - zbd->zone_info;
+
+	/* Thread should never lock zones outside its working area. */
+	assert(f->min_zone <= nz && nz < f->max_zone);
+
 	/*
 	 * Lock the io_u target zone. The zone will be unlocked if io_u offset
 	 * is changed or when io_u completes and zbd_put_io() executed.
@@ -289,6 +295,9 @@ static bool zbd_verify_sizes(void)
 					 (unsigned long long) new_end - f->file_offset);
 				f->io_size = new_end - f->file_offset;
 			}
+
+			f->min_zone = zbd_zone_idx(f, f->file_offset);
+			f->max_zone = zbd_zone_idx(f, f->file_offset + f->io_size);
 		}
 	}
 
@@ -735,7 +744,7 @@ static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
 
 		if (!zbd_zone_swr(z))
 			continue;
-		zone_lock(td, z);
+		zone_lock(td, f, z);
 		if (all_zones) {
 			unsigned int i;
 
@@ -958,7 +967,7 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
 						      struct io_u *io_u)
 {
 	const uint32_t min_bs = td->o.min_bs[io_u->ddir];
-	const struct fio_file *f = io_u->file;
+	struct fio_file *f = io_u->file;
 	struct fio_zone_info *z;
 	unsigned int open_zone_idx = -1;
 	uint32_t zone_idx, new_zone_idx;
@@ -975,6 +984,11 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
 	} else {
 		zone_idx = zbd_zone_idx(f, io_u->offset);
 	}
+	if (zone_idx < f->min_zone) {
+		zone_idx = f->min_zone;
+	} else if (zone_idx >= f->max_zone) {
+		zone_idx = f->max_zone - 1;
+	}
 	dprint(FD_ZBD, "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
 	       __func__, f->file_name, zone_idx, io_u->offset, io_u->buflen);
 
@@ -989,7 +1003,7 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
 
 		z = &f->zbd_info->zone_info[zone_idx];
 
-		zone_lock(td, z);
+		zone_lock(td, f, z);
 		pthread_mutex_lock(&f->zbd_info->mutex);
 		if (td->o.max_open_zones == 0)
 			goto examine_zone;
@@ -1015,8 +1029,7 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
 			if (tmp_idx >= f->zbd_info->num_open_zones)
 				tmp_idx = 0;
 			tmpz = f->zbd_info->open_zones[tmp_idx];
-
-			if (is_valid_offset(f, f->zbd_info->zone_info[tmpz].start)) {
+			if (f->min_zone <= tmpz && tmpz < f->max_zone) {
 				open_zone_idx = tmp_idx;
 				goto found_candidate_zone;
 			}
@@ -1061,11 +1074,11 @@ examine_zone:
 		z++;
 		if (!is_valid_offset(f, z->start)) {
 			/* Wrap-around. */
-			zone_idx = zbd_zone_idx(f, f->file_offset);
+			zone_idx = f->min_zone;
 			z = &f->zbd_info->zone_info[zone_idx];
 		}
 		assert(is_valid_offset(f, z->start));
-		zone_lock(td, z);
+		zone_lock(td, f, z);
 		if (z->open)
 			continue;
 		if (zbd_open_zone(td, io_u, zone_idx))
@@ -1078,12 +1091,14 @@ examine_zone:
 	pthread_mutex_lock(&f->zbd_info->mutex);
 	for (i = 0; i < f->zbd_info->num_open_zones; i++) {
 		zone_idx = f->zbd_info->open_zones[i];
+		if (zone_idx < f->min_zone || zone_idx >= f->max_zone)
+			continue;
 		pthread_mutex_unlock(&f->zbd_info->mutex);
 		pthread_mutex_unlock(&z->mutex);
 
 		z = &f->zbd_info->zone_info[zone_idx];
 
-		zone_lock(td, z);
+		zone_lock(td, f, z);
 		if (z->wp + min_bs <= (z+1)->start)
 			goto out;
 		pthread_mutex_lock(&f->zbd_info->mutex);
@@ -1409,7 +1424,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
 
 	zbd_check_swd(f);
 
-	zone_lock(td, zb);
+	zone_lock(td, f, zb);
 
 	switch (io_u->ddir) {
 	case DDIR_READ:
-- 
2.26.2



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 3/7] zbd: introduce per-device "max_open_zones" limit
  2020-04-30 12:40 [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Alexey Dobriyan
  2020-04-30 12:40 ` [PATCH 2/7] zbd: don't lock zones outside working area Alexey Dobriyan
@ 2020-04-30 12:40 ` Alexey Dobriyan
  2020-05-01  1:34   ` Damien Le Moal
  2020-04-30 12:40 ` [PATCH 4/7] zbd: make zbd_info->mutex non-recursive Alexey Dobriyan
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 22+ messages in thread
From: Alexey Dobriyan @ 2020-04-30 12:40 UTC (permalink / raw)
  To: axboe; +Cc: adobriyan, damien.lemoal, fio

It is not possible to maintain equal per-thread iodepth. The way code
is written, "max_open_zones" acts as a global limit, and one thread
opens all "max_open_zones" for itself and others starve for available
zones and _exit_ prematurely.

This config is guaranteed to make equal number of zone resets/IO now:
each thread generates identical pattern and doesn't intersect with other
threads:

	zonemode=zbd
	zonesize=...
	rw=write

	numjobs=N
	offset_increment=M*zonesize

	[j]
	size=M*zonesize

Patch introduces "global_max_open_zones" which is per-device config
option. "max_open_zones" becomes per-thread limit. Both limits are
checked for each open zone so one thread can't starve others.

Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
---
 fio.1            |  6 +++++-
 fio.h            |  1 +
 options.c        | 14 ++++++++++++--
 thread_options.h |  1 +
 zbd.c            | 37 +++++++++++++++++++++++++++++++++----
 zbd.h            |  3 +++
 6 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/fio.1 b/fio.1
index a2379f98..1c04a3af 100644
--- a/fio.1
+++ b/fio.1
@@ -804,7 +804,11 @@ so. Default: false.
 When running a random write test across an entire drive many more zones will be
 open than in a typical application workload. Hence this command line option
 that allows to limit the number of open zones. The number of open zones is
-defined as the number of zones to which write commands are issued.
+defined as the number of zones to which write commands are issued by one
+thread/process.
+.TP
+.BI global_max_open_zones \fR=\fPint
+Global limit on the number of simultaneously opened zones per block device.
 .TP
 .BI zone_reset_threshold \fR=\fPfloat
 A number between zero and one that indicates the ratio of logical blocks with
diff --git a/fio.h b/fio.h
index bbf057c1..20ca80e2 100644
--- a/fio.h
+++ b/fio.h
@@ -260,6 +260,7 @@ struct thread_data {
 	struct frand_state prio_state;
 
 	struct zone_split_index **zone_state_index;
+	unsigned int num_open_zones;
 
 	unsigned int verify_batch;
 	unsigned int trim_batch;
diff --git a/options.c b/options.c
index 2372c042..306874ea 100644
--- a/options.c
+++ b/options.c
@@ -3364,8 +3364,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.type	= FIO_OPT_INT,
 		.off1	= offsetof(struct thread_options, max_open_zones),
 		.maxval	= ZBD_MAX_OPEN_ZONES,
-		.help	= "Limit random writes to SMR drives to the specified"
-			  " number of sequential zones",
+		.help	= "Thread/process limit on the number of simultaneously opened sequential write zones in SMR/ZNS drives",
+		.def	= "0",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
+		.name	= "global_max_open_zones",
+		.lname	= "Global maximum number of open zones",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct thread_options, global_max_open_zones),
+		.maxval	= ZBD_MAX_OPEN_ZONES,
+		.help	= "Block device limit on the number of simultaneously opened sequential write zones in SMR/ZNS drives",
 		.def	= "0",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_INVALID,
diff --git a/thread_options.h b/thread_options.h
index c78ed43d..4078d46f 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -342,6 +342,7 @@ struct thread_options {
 	/* Parameters that affect zonemode=zbd */
 	unsigned int read_beyond_wp;
 	int max_open_zones;
+	unsigned int global_max_open_zones;
 	fio_fp64_t zrt;
 	fio_fp64_t zrf;
 };
diff --git a/zbd.c b/zbd.c
index e8f0a4d8..a517349a 100644
--- a/zbd.c
+++ b/zbd.c
@@ -546,8 +546,10 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
 		return -EINVAL;
 	}
 
-	if (ret == 0)
+	if (ret == 0) {
 		f->zbd_info->model = zbd_model;
+		f->zbd_info->max_open_zones = td->o.global_max_open_zones;
+	}
 	return ret;
 }
 
@@ -622,6 +624,27 @@ int zbd_init(struct thread_data *td)
 	if (!zbd_verify_bs())
 		return 1;
 
+	for_each_file(td, f, i) {
+		struct zoned_block_device_info *zbd = f->zbd_info;
+
+		if (!zbd)
+			continue;
+
+		if (zbd->max_open_zones == 0) {
+			zbd->max_open_zones = ZBD_MAX_OPEN_ZONES;
+		}
+
+		if (td->o.global_max_open_zones &&
+		    zbd->max_open_zones != td->o.global_max_open_zones) {
+			log_err("Different 'global_max_open_zones' values\n");
+			return 1;
+		}
+		if (zbd->max_open_zones > ZBD_MAX_OPEN_ZONES) {
+			log_err("'global_max_open_zones' value is limited by %u\n", ZBD_MAX_OPEN_ZONES);
+			return 1;
+		}
+	}
+
 	return 0;
 }
 
@@ -714,6 +737,7 @@ static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
 		(ZBD_MAX_OPEN_ZONES - (open_zone_idx + 1)) *
 		sizeof(f->zbd_info->open_zones[0]));
 	f->zbd_info->num_open_zones--;
+	td->num_open_zones--;
 	f->zbd_info->zone_info[zone_idx].open = 0;
 }
 
@@ -895,8 +919,10 @@ static bool is_zone_open(const struct thread_data *td, const struct fio_file *f,
 	struct zoned_block_device_info *zbdi = f->zbd_info;
 	int i;
 
-	assert(td->o.max_open_zones <= ARRAY_SIZE(zbdi->open_zones));
-	assert(zbdi->num_open_zones <= td->o.max_open_zones);
+	assert(td->o.max_open_zones == 0 || td->num_open_zones <= td->o.max_open_zones);
+
+	assert(td->o.max_open_zones <= zbdi->max_open_zones);
+	assert(zbdi->num_open_zones <= zbdi->max_open_zones);
 
 	for (i = 0; i < zbdi->num_open_zones; i++)
 		if (zbdi->open_zones[i] == zone_idx)
@@ -937,10 +963,13 @@ static bool zbd_open_zone(struct thread_data *td, const struct io_u *io_u,
 	if (is_zone_open(td, f, zone_idx))
 		goto out;
 	res = false;
-	if (f->zbd_info->num_open_zones >= td->o.max_open_zones)
+	if (td->num_open_zones >= td->o.max_open_zones)
+		goto out;
+	if (f->zbd_info->num_open_zones >= f->zbd_info->max_open_zones)
 		goto out;
 	dprint(FD_ZBD, "%s: opening zone %d\n", f->file_name, zone_idx);
 	f->zbd_info->open_zones[f->zbd_info->num_open_zones++] = zone_idx;
+	td->num_open_zones++;
 	z->open = 1;
 	res = true;
 
diff --git a/zbd.h b/zbd.h
index 5a660399..fb39fb82 100644
--- a/zbd.h
+++ b/zbd.h
@@ -45,6 +45,8 @@ struct fio_zone_info {
 /**
  * zoned_block_device_info - zoned block device characteristics
  * @model: Device model.
+ * @max_open_zones: global limit on the number of simultaneously opened
+ *	sequential write zones.
  * @mutex: Protects the modifiable members in this structure (refcount and
  *		num_open_zones).
  * @zone_size: size of a single zone in units of 512 bytes
@@ -65,6 +67,7 @@ struct fio_zone_info {
  */
 struct zoned_block_device_info {
 	enum zbd_zoned_model	model;
+	uint32_t		max_open_zones;
 	pthread_mutex_t		mutex;
 	uint64_t		zone_size;
 	uint64_t		sectors_with_data;
-- 
2.26.2



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 4/7] zbd: make zbd_info->mutex non-recursive
  2020-04-30 12:40 [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Alexey Dobriyan
  2020-04-30 12:40 ` [PATCH 2/7] zbd: don't lock zones outside working area Alexey Dobriyan
  2020-04-30 12:40 ` [PATCH 3/7] zbd: introduce per-device "max_open_zones" limit Alexey Dobriyan
@ 2020-04-30 12:40 ` Alexey Dobriyan
  2020-05-01  1:36   ` Damien Le Moal
  2020-04-30 12:40 ` [PATCH 5/7] zbd: consolidate zone mutex initialisation Alexey Dobriyan
                   ` (3 subsequent siblings)
  6 siblings, 1 reply; 22+ messages in thread
From: Alexey Dobriyan @ 2020-04-30 12:40 UTC (permalink / raw)
  To: axboe; +Cc: adobriyan, damien.lemoal, fio

There is no reason for it to be recursive. Recursiveness leaked in from
zi->mutex.

Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
---
 zbd.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/zbd.c b/zbd.c
index a517349a..18cedfce 100644
--- a/zbd.c
+++ b/zbd.c
@@ -373,9 +373,9 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
 		return -ENOMEM;
 
 	pthread_mutexattr_init(&attr);
-	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
 	pthread_mutexattr_setpshared(&attr, true);
 	pthread_mutex_init(&zbd_info->mutex, &attr);
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
 	zbd_info->refcount = 1;
 	p = &zbd_info->zone_info[0];
 	for (i = 0; i < nr_zones; i++, p++) {
@@ -417,7 +417,6 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 	int i, j, ret = 0;
 
 	pthread_mutexattr_init(&attr);
-	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
 	pthread_mutexattr_setpshared(&attr, true);
 
 	zones = calloc(ZBD_REPORT_MAX_ZONES, sizeof(struct zbd_zone));
@@ -454,6 +453,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 	if (!zbd_info)
 		goto out;
 	pthread_mutex_init(&zbd_info->mutex, &attr);
+	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
 	zbd_info->refcount = 1;
 	p = &zbd_info->zone_info[0];
 	for (offset = 0, j = 0; j < nr_zones;) {
@@ -801,14 +801,20 @@ static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
  * Reset zbd_info.write_cnt, the counter that counts down towards the next
  * zone reset.
  */
-static void zbd_reset_write_cnt(const struct thread_data *td,
-				const struct fio_file *f)
+static void _zbd_reset_write_cnt(const struct thread_data *td,
+				 const struct fio_file *f)
 {
 	assert(0 <= td->o.zrf.u.f && td->o.zrf.u.f <= 1);
 
-	pthread_mutex_lock(&f->zbd_info->mutex);
 	f->zbd_info->write_cnt = td->o.zrf.u.f ?
 		min(1.0 / td->o.zrf.u.f, 0.0 + UINT_MAX) : UINT_MAX;
+}
+
+static void zbd_reset_write_cnt(const struct thread_data *td,
+				const struct fio_file *f)
+{
+	pthread_mutex_lock(&f->zbd_info->mutex);
+	_zbd_reset_write_cnt(td, f);
 	pthread_mutex_unlock(&f->zbd_info->mutex);
 }
 
@@ -822,7 +828,7 @@ static bool zbd_dec_and_reset_write_cnt(const struct thread_data *td,
 	if (f->zbd_info->write_cnt)
 		write_cnt = --f->zbd_info->write_cnt;
 	if (write_cnt == 0)
-		zbd_reset_write_cnt(td, f);
+		_zbd_reset_write_cnt(td, f);
 	pthread_mutex_unlock(&f->zbd_info->mutex);
 
 	return write_cnt == 0;
-- 
2.26.2



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 5/7] zbd: consolidate zone mutex initialisation
  2020-04-30 12:40 [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Alexey Dobriyan
                   ` (2 preceding siblings ...)
  2020-04-30 12:40 ` [PATCH 4/7] zbd: make zbd_info->mutex non-recursive Alexey Dobriyan
@ 2020-04-30 12:40 ` Alexey Dobriyan
  2020-05-01  1:44   ` Damien Le Moal
  2020-04-30 12:40 ` [PATCH 6/7] fio: parse "io_size=1%" Alexey Dobriyan
                   ` (2 subsequent siblings)
  6 siblings, 1 reply; 22+ messages in thread
From: Alexey Dobriyan @ 2020-04-30 12:40 UTC (permalink / raw)
  To: axboe; +Cc: adobriyan, damien.lemoal, fio

Another mutex is coming!

Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
---
 zbd.c | 40 +++++++++++++++++++++-------------------
 1 file changed, 21 insertions(+), 19 deletions(-)

diff --git a/zbd.c b/zbd.c
index 18cedfce..2febc8c3 100644
--- a/zbd.c
+++ b/zbd.c
@@ -351,7 +351,6 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
 	struct fio_zone_info *p;
 	uint64_t zone_size = td->o.zone_size;
 	struct zoned_block_device_info *zbd_info = NULL;
-	pthread_mutexattr_t attr;
 	int i;
 
 	if (zone_size == 0) {
@@ -372,14 +371,9 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
 	if (!zbd_info)
 		return -ENOMEM;
 
-	pthread_mutexattr_init(&attr);
-	pthread_mutexattr_setpshared(&attr, true);
-	pthread_mutex_init(&zbd_info->mutex, &attr);
-	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
 	zbd_info->refcount = 1;
 	p = &zbd_info->zone_info[0];
 	for (i = 0; i < nr_zones; i++, p++) {
-		pthread_mutex_init(&p->mutex, &attr);
 		p->start = i * zone_size;
 		p->wp = p->start + zone_size;
 		p->type = ZBD_ZONE_TYPE_SWR;
@@ -393,7 +387,6 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
 	f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ?
 		ilog2(zone_size) : 0;
 	f->zbd_info->nr_zones = nr_zones;
-	pthread_mutexattr_destroy(&attr);
 	return 0;
 }
 
@@ -413,12 +406,8 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 	struct fio_zone_info *p;
 	uint64_t zone_size, offset;
 	struct zoned_block_device_info *zbd_info = NULL;
-	pthread_mutexattr_t attr;
 	int i, j, ret = 0;
 
-	pthread_mutexattr_init(&attr);
-	pthread_mutexattr_setpshared(&attr, true);
-
 	zones = calloc(ZBD_REPORT_MAX_ZONES, sizeof(struct zbd_zone));
 	if (!zones)
 		goto out;
@@ -452,14 +441,11 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 	ret = -ENOMEM;
 	if (!zbd_info)
 		goto out;
-	pthread_mutex_init(&zbd_info->mutex, &attr);
-	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
 	zbd_info->refcount = 1;
 	p = &zbd_info->zone_info[0];
 	for (offset = 0, j = 0; j < nr_zones;) {
 		z = &zones[0];
 		for (i = 0; i < nrz; i++, j++, z++, p++) {
-			pthread_mutex_init(&p->mutex, &attr);
 			p->start = z->start;
 			switch (z->cond) {
 			case ZBD_ZONE_COND_NOT_WP:
@@ -510,7 +496,6 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
 out:
 	sfree(zbd_info);
 	free(zones);
-	pthread_mutexattr_destroy(&attr);
 	return ret;
 }
 
@@ -521,6 +506,7 @@ out:
  */
 static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
 {
+	struct zoned_block_device_info *zbd;
 	enum zbd_zoned_model zbd_model;
 	int ret;
 
@@ -546,11 +532,27 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
 		return -EINVAL;
 	}
 
-	if (ret == 0) {
-		f->zbd_info->model = zbd_model;
-		f->zbd_info->max_open_zones = td->o.global_max_open_zones;
+	if (ret)
+		return ret;
+
+	zbd = f->zbd_info;
+	zbd->model = zbd_model;
+	zbd->max_open_zones = td->o.global_max_open_zones;
+	{
+		pthread_mutexattr_t attr;
+
+		pthread_mutexattr_init(&attr);
+		pthread_mutexattr_setpshared(&attr, true);
+		pthread_mutex_init(&zbd->mutex, &attr);
+		pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
+		for (uint32_t z = 0; z < zbd->nr_zones; z++) {
+			struct fio_zone_info *zi = &zbd->zone_info[z];
+
+			pthread_mutex_init(&zi->mutex, &attr);
+		}
+		pthread_mutexattr_destroy(&attr);
 	}
-	return ret;
+	return 0;
 }
 
 void zbd_free_zone_info(struct fio_file *f)
-- 
2.26.2



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 6/7] fio: parse "io_size=1%"
  2020-04-30 12:40 [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Alexey Dobriyan
                   ` (3 preceding siblings ...)
  2020-04-30 12:40 ` [PATCH 5/7] zbd: consolidate zone mutex initialisation Alexey Dobriyan
@ 2020-04-30 12:40 ` Alexey Dobriyan
  2020-05-01  1:51   ` Damien Le Moal
  2020-04-30 12:40 ` [PATCH 7/7] verify: decouple seed generation from buffer fill Alexey Dobriyan
  2020-05-01  1:19 ` [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Damien Le Moal
  6 siblings, 1 reply; 22+ messages in thread
From: Alexey Dobriyan @ 2020-04-30 12:40 UTC (permalink / raw)
  To: axboe; +Cc: adobriyan, damien.lemoal, fio

The following config:

	zonemode=zbd
	size=1%
	io_size=1%

will generate an essentially infinite loop, because io_size is (2^64-1)-1.

Parse io_size as percentage to avoid it.

Note: there is a different bug if size% != io_size%, where io_size is
ignored, but that will be addressed in a separate patch.

Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
---
 cconv.c          |  1 +
 options.c        | 17 +++++++++++++++++
 thread_options.h |  4 ++++
 3 files changed, 22 insertions(+)

diff --git a/cconv.c b/cconv.c
index 48218dc4..d547ae11 100644
--- a/cconv.c
+++ b/cconv.c
@@ -102,6 +102,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
 	o->size = le64_to_cpu(top->size);
 	o->io_size = le64_to_cpu(top->io_size);
 	o->size_percent = le32_to_cpu(top->size_percent);
+	o->io_size_percent = le32_to_cpu(top->io_size_percent);
 	o->fill_device = le32_to_cpu(top->fill_device);
 	o->file_append = le32_to_cpu(top->file_append);
 	o->file_size_low = le64_to_cpu(top->file_size_low);
diff --git a/options.c b/options.c
index 306874ea..c5d33d1a 100644
--- a/options.c
+++ b/options.c
@@ -1475,6 +1475,22 @@ static int str_size_cb(void *data, unsigned long long *__val)
 	return 0;
 }
 
+static int str_io_size_cb(void *data, unsigned long long *__val)
+{
+	struct thread_data *td = cb_data_to_td(data);
+	unsigned long long v = *__val;
+
+	if (parse_is_percent(v)) {
+		td->o.io_size = 0;
+		td->o.io_size_percent = -1ULL - v;
+		dprint(FD_PARSE, "SET io_size_percent %d\n",
+					td->o.io_size_percent);
+	} else
+		td->o.io_size = v;
+
+	return 0;
+}
+
 static int str_write_bw_log_cb(void *data, const char *str)
 {
 	struct thread_data *td = cb_data_to_td(data);
@@ -2042,6 +2058,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
 		.alias	= "io_limit",
 		.lname	= "IO Size",
 		.type	= FIO_OPT_STR_VAL,
+		.cb	= str_io_size_cb,
 		.off1	= offsetof(struct thread_options, io_size),
 		.help	= "Total size of I/O to be performed",
 		.interval = 1024 * 1024,
diff --git a/thread_options.h b/thread_options.h
index 4078d46f..197d2ec2 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -83,6 +83,7 @@ struct thread_options {
 	unsigned long long size;
 	unsigned long long io_size;
 	unsigned int size_percent;
+	unsigned int io_size_percent;
 	unsigned int fill_device;
 	unsigned int file_append;
 	unsigned long long file_size_low;
@@ -377,6 +378,7 @@ struct thread_options_pack {
 	uint64_t size;
 	uint64_t io_size;
 	uint32_t size_percent;
+	uint32_t io_size_percent;
 	uint32_t fill_device;
 	uint32_t file_append;
 	uint32_t unique_filename;
@@ -456,6 +458,8 @@ struct thread_options_pack {
 	struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX];
 	uint32_t zone_split_nr[DDIR_RWDIR_CNT];
 
+	uint8_t pad1[4];
+
 	fio_fp64_t zipf_theta;
 	fio_fp64_t pareto_h;
 	fio_fp64_t gauss_dev;
-- 
2.26.2



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* [PATCH 7/7] verify: decouple seed generation from buffer fill
  2020-04-30 12:40 [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Alexey Dobriyan
                   ` (4 preceding siblings ...)
  2020-04-30 12:40 ` [PATCH 6/7] fio: parse "io_size=1%" Alexey Dobriyan
@ 2020-04-30 12:40 ` Alexey Dobriyan
  2020-05-01  1:59   ` Damien Le Moal
  2020-05-01  1:19 ` [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Damien Le Moal
  6 siblings, 1 reply; 22+ messages in thread
From: Alexey Dobriyan @ 2020-04-30 12:40 UTC (permalink / raw)
  To: axboe; +Cc: adobriyan, damien.lemoal, fio

Nicer this way and fewer LOC.

Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
---
 backend.c |  6 ------
 verify.c  | 20 +++++++-------------
 2 files changed, 7 insertions(+), 19 deletions(-)

diff --git a/backend.c b/backend.c
index feb34e51..452975cf 100644
--- a/backend.c
+++ b/backend.c
@@ -1006,12 +1006,6 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done)
 		if (td->o.verify != VERIFY_NONE && io_u->ddir == DDIR_READ &&
 		    ((io_u->flags & IO_U_F_VER_LIST) || !td_rw(td))) {
 
-			if (!td->o.verify_pattern_bytes) {
-				io_u->rand_seed = __rand(&td->verify_state);
-				if (sizeof(int) != sizeof(long *))
-					io_u->rand_seed *= __rand(&td->verify_state);
-			}
-
 			if (verify_state_should_stop(td, io_u)) {
 				put_io_u(td, io_u);
 				break;
diff --git a/verify.c b/verify.c
index cf299ebf..b7fa6693 100644
--- a/verify.c
+++ b/verify.c
@@ -46,15 +46,6 @@ static void __fill_buffer(struct thread_options *o, uint64_t seed, void *p,
 	__fill_random_buf_percentage(seed, p, o->compress_percentage, len, len, o->buffer_pattern, o->buffer_pattern_bytes);
 }
 
-static uint64_t fill_buffer(struct thread_data *td, void *p,
-			    unsigned int len)
-{
-	struct frand_state *fs = &td->verify_state;
-	struct thread_options *o = &td->o;
-
-	return fill_random_buf_percentage(fs, p, o->compress_percentage, len, len, o->buffer_pattern, o->buffer_pattern_bytes);
-}
-
 void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len,
 			 struct io_u *io_u, uint64_t seed, int use_seed)
 {
@@ -63,10 +54,13 @@ void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len,
 	if (!o->verify_pattern_bytes) {
 		dprint(FD_VERIFY, "fill random bytes len=%u\n", len);
 
-		if (use_seed)
-			__fill_buffer(o, seed, p, len);
-		else
-			io_u->rand_seed = fill_buffer(td, p, len);
+		if (!use_seed) {
+			seed = __rand(&td->verify_state);
+			if (sizeof(int) != sizeof(long *))
+				seed *= (unsigned long)__rand(&td->verify_state);
+		}
+		io_u->rand_seed = seed;
+		__fill_buffer(o, seed, p, len);
 		return;
 	}
 
-- 
2.26.2



^ permalink raw reply related	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES
  2020-04-30 12:40 [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Alexey Dobriyan
                   ` (5 preceding siblings ...)
  2020-04-30 12:40 ` [PATCH 7/7] verify: decouple seed generation from buffer fill Alexey Dobriyan
@ 2020-05-01  1:19 ` Damien Le Moal
  2020-05-01 14:47   ` Alexey Dobriyan
  6 siblings, 1 reply; 22+ messages in thread
From: Damien Le Moal @ 2020-05-01  1:19 UTC (permalink / raw)
  To: Alexey Dobriyan, axboe@kernel.dk; +Cc: fio@vger.kernel.org

On 2020/04/30 21:41, Alexey Dobriyan wrote:
> 128 opened zones is not enough for everyone!
> 
> Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
> ---
>  zbd_types.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/zbd_types.h b/zbd_types.h
> index 2f2f1324..d63c0d0a 100644
> --- a/zbd_types.h
> +++ b/zbd_types.h
> @@ -8,7 +8,7 @@
>  
>  #include <inttypes.h>
>  
> -#define ZBD_MAX_OPEN_ZONES	128
> +#define ZBD_MAX_OPEN_ZONES	4096
>  
>  /*
>   * Zoned block device models.
> 

Makes sense. ZNS will need this.

Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>

Note: You did not send a cover letter for the series. Please add it for the next
round.

-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 2/7] zbd: don't lock zones outside working area
  2020-04-30 12:40 ` [PATCH 2/7] zbd: don't lock zones outside working area Alexey Dobriyan
@ 2020-05-01  1:27   ` Damien Le Moal
  0 siblings, 0 replies; 22+ messages in thread
From: Damien Le Moal @ 2020-05-01  1:27 UTC (permalink / raw)
  To: Alexey Dobriyan, axboe@kernel.dk; +Cc: fio@vger.kernel.org

On 2020/04/30 21:41, Alexey Dobriyan wrote:
> Currently threads tap into each other zones even if their working areas
> as defined by [f->file_offset, f->file_offset + f->io_size) don't intersect.

You are only stating the current problem. Please add a short description of what
this patch does to resolve it.

> 
> This should remove unnecessary quiescing.
> 
> Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
> ---
>  file.h |  3 +++
>  zbd.c  | 35 +++++++++++++++++++++++++----------
>  2 files changed, 28 insertions(+), 10 deletions(-)
> 
> diff --git a/file.h b/file.h
> index ae0e6fc8..375bbfd3 100644
> --- a/file.h
> +++ b/file.h
> @@ -104,6 +104,9 @@ struct fio_file {
>  	 * Zoned block device information. See also zonemode=zbd.
>  	 */
>  	struct zoned_block_device_info *zbd_info;
> +	/* zonemode=zbd working area */
> +	uint32_t min_zone;	/* inclusive */
> +	uint32_t max_zone;	/* exclusive */
>  
>  	/*
>  	 * Track last end and last start of IO for a given data direction
> diff --git a/zbd.c b/zbd.c
> index 9f5a89ed..e8f0a4d8 100644
> --- a/zbd.c
> +++ b/zbd.c
> @@ -156,8 +156,14 @@ static bool zbd_zone_full(const struct fio_file *f, struct fio_zone_info *z,
>  		z->wp + required > z->start + f->zbd_info->zone_size;
>  }
>  
> -static void zone_lock(struct thread_data *td, struct fio_zone_info *z)
> +static void zone_lock(struct thread_data *td, struct fio_file *f, struct fio_zone_info *z)
>  {
> +	struct zoned_block_device_info *zbd = f->zbd_info;
> +	uint32_t nz = z - zbd->zone_info;
> +
> +	/* Thread should never lock zones outside its working area. */

	/* A thread should never lock zones outside its working area. */

> +	assert(f->min_zone <= nz && nz < f->max_zone);
> +
>  	/*
>  	 * Lock the io_u target zone. The zone will be unlocked if io_u offset
>  	 * is changed or when io_u completes and zbd_put_io() executed.
> @@ -289,6 +295,9 @@ static bool zbd_verify_sizes(void)
>  					 (unsigned long long) new_end - f->file_offset);
>  				f->io_size = new_end - f->file_offset;
>  			}
> +
> +			f->min_zone = zbd_zone_idx(f, f->file_offset);
> +			f->max_zone = zbd_zone_idx(f, f->file_offset + f->io_size);
>  		}
>  	}
>  
> @@ -735,7 +744,7 @@ static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
>  
>  		if (!zbd_zone_swr(z))
>  			continue;
> -		zone_lock(td, z);
> +		zone_lock(td, f, z);
>  		if (all_zones) {
>  			unsigned int i;
>  
> @@ -958,7 +967,7 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
>  						      struct io_u *io_u)
>  {
>  	const uint32_t min_bs = td->o.min_bs[io_u->ddir];
> -	const struct fio_file *f = io_u->file;
> +	struct fio_file *f = io_u->file;
>  	struct fio_zone_info *z;
>  	unsigned int open_zone_idx = -1;
>  	uint32_t zone_idx, new_zone_idx;
> @@ -975,6 +984,11 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
>  	} else {
>  		zone_idx = zbd_zone_idx(f, io_u->offset);
>  	}
> +	if (zone_idx < f->min_zone) {
> +		zone_idx = f->min_zone;
> +	} else if (zone_idx >= f->max_zone) {
> +		zone_idx = f->max_zone - 1;
> +	}

No need for the {} brackets.

>  	dprint(FD_ZBD, "%s(%s): starting from zone %d (offset %lld, buflen %lld)\n",
>  	       __func__, f->file_name, zone_idx, io_u->offset, io_u->buflen);
>  
> @@ -989,7 +1003,7 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
>  
>  		z = &f->zbd_info->zone_info[zone_idx];
>  
> -		zone_lock(td, z);
> +		zone_lock(td, f, z);
>  		pthread_mutex_lock(&f->zbd_info->mutex);
>  		if (td->o.max_open_zones == 0)
>  			goto examine_zone;
> @@ -1015,8 +1029,7 @@ static struct fio_zone_info *zbd_convert_to_open_zone(struct thread_data *td,
>  			if (tmp_idx >= f->zbd_info->num_open_zones)
>  				tmp_idx = 0;
>  			tmpz = f->zbd_info->open_zones[tmp_idx];
> -
> -			if (is_valid_offset(f, f->zbd_info->zone_info[tmpz].start)) {
> +			if (f->min_zone <= tmpz && tmpz < f->max_zone) {
>  				open_zone_idx = tmp_idx;
>  				goto found_candidate_zone;
>  			}
> @@ -1061,11 +1074,11 @@ examine_zone:
>  		z++;
>  		if (!is_valid_offset(f, z->start)) {
>  			/* Wrap-around. */
> -			zone_idx = zbd_zone_idx(f, f->file_offset);
> +			zone_idx = f->min_zone;
>  			z = &f->zbd_info->zone_info[zone_idx];
>  		}
>  		assert(is_valid_offset(f, z->start));
> -		zone_lock(td, z);
> +		zone_lock(td, f, z);
>  		if (z->open)
>  			continue;
>  		if (zbd_open_zone(td, io_u, zone_idx))
> @@ -1078,12 +1091,14 @@ examine_zone:
>  	pthread_mutex_lock(&f->zbd_info->mutex);
>  	for (i = 0; i < f->zbd_info->num_open_zones; i++) {
>  		zone_idx = f->zbd_info->open_zones[i];
> +		if (zone_idx < f->min_zone || zone_idx >= f->max_zone)
> +			continue;
>  		pthread_mutex_unlock(&f->zbd_info->mutex);
>  		pthread_mutex_unlock(&z->mutex);
>  
>  		z = &f->zbd_info->zone_info[zone_idx];
>  
> -		zone_lock(td, z);
> +		zone_lock(td, f, z);
>  		if (z->wp + min_bs <= (z+1)->start)
>  			goto out;
>  		pthread_mutex_lock(&f->zbd_info->mutex);
> @@ -1409,7 +1424,7 @@ enum io_u_action zbd_adjust_block(struct thread_data *td, struct io_u *io_u)
>  
>  	zbd_check_swd(f);
>  
> -	zone_lock(td, zb);
> +	zone_lock(td, f, zb);
>  
>  	switch (io_u->ddir) {
>  	case DDIR_READ:
> 

With the above nits fixed, feel free to add:

Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>

-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/7] zbd: introduce per-device "max_open_zones" limit
  2020-04-30 12:40 ` [PATCH 3/7] zbd: introduce per-device "max_open_zones" limit Alexey Dobriyan
@ 2020-05-01  1:34   ` Damien Le Moal
  2020-05-01 18:52     ` Alexey Dobriyan
  0 siblings, 1 reply; 22+ messages in thread
From: Damien Le Moal @ 2020-05-01  1:34 UTC (permalink / raw)
  To: Alexey Dobriyan, axboe@kernel.dk; +Cc: fio@vger.kernel.org

On 2020/04/30 21:41, Alexey Dobriyan wrote:
> It is not possible to maintain equal per-thread iodepth. The way code
> is written, "max_open_zones" acts as a global limit, and one thread
> opens all "max_open_zones" for itself and others starve for available
> zones and _exit_ prematurely.
> 
> This config is guaranteed to make equal number of zone resets/IO now:
> each thread generates identical pattern and doesn't intersect with other
> threads:
> 
> 	zonemode=zbd
> 	zonesize=...
> 	rw=write
> 
> 	numjobs=N
> 	offset_increment=M*zonesize
> 
> 	[j]
> 	size=M*zonesize
> 
> Patch introduces "global_max_open_zones" which is per-device config
> option. "max_open_zones" becomes per-thread limit. Both limits are
> checked for each open zone so one thread can't starve others.

It makes sense. Nice one.

But the change as is will break existing test scripts (e.g. lots of SMR drives
are being tested with this). I think we can avoid this breakage simply: leave
max_open_zones option definition as is and add "job_max_open_zones" or
"thread_max_open_zones" option (no strong feelings about the name here, as long
as it is explicit) to define the per thread maximum number of open zones. This
new option could actually default to max_open_zones / numjobs if that is not 0.

> 
> Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
> ---
>  fio.1            |  6 +++++-
>  fio.h            |  1 +
>  options.c        | 14 ++++++++++++--
>  thread_options.h |  1 +
>  zbd.c            | 37 +++++++++++++++++++++++++++++++++----
>  zbd.h            |  3 +++
>  6 files changed, 55 insertions(+), 7 deletions(-)
> 
> diff --git a/fio.1 b/fio.1
> index a2379f98..1c04a3af 100644
> --- a/fio.1
> +++ b/fio.1
> @@ -804,7 +804,11 @@ so. Default: false.
>  When running a random write test across an entire drive many more zones will be
>  open than in a typical application workload. Hence this command line option
>  that allows to limit the number of open zones. The number of open zones is
> -defined as the number of zones to which write commands are issued.
> +defined as the number of zones to which write commands are issued by one
> +thread/process.
> +.TP
> +.BI global_max_open_zones \fR=\fPint
> +Global limit on the number of simultaneously opened zones per block device.
>  .TP
>  .BI zone_reset_threshold \fR=\fPfloat
>  A number between zero and one that indicates the ratio of logical blocks with
> diff --git a/fio.h b/fio.h
> index bbf057c1..20ca80e2 100644
> --- a/fio.h
> +++ b/fio.h
> @@ -260,6 +260,7 @@ struct thread_data {
>  	struct frand_state prio_state;
>  
>  	struct zone_split_index **zone_state_index;
> +	unsigned int num_open_zones;
>  
>  	unsigned int verify_batch;
>  	unsigned int trim_batch;
> diff --git a/options.c b/options.c
> index 2372c042..306874ea 100644
> --- a/options.c
> +++ b/options.c
> @@ -3364,8 +3364,18 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
>  		.type	= FIO_OPT_INT,
>  		.off1	= offsetof(struct thread_options, max_open_zones),
>  		.maxval	= ZBD_MAX_OPEN_ZONES,
> -		.help	= "Limit random writes to SMR drives to the specified"
> -			  " number of sequential zones",
> +		.help	= "Thread/process limit on the number of simultaneously opened sequential write zones in SMR/ZNS drives",
> +		.def	= "0",
> +		.category = FIO_OPT_C_IO,
> +		.group	= FIO_OPT_G_INVALID,
> +	},
> +	{
> +		.name	= "global_max_open_zones",
> +		.lname	= "Global maximum number of open zones",
> +		.type	= FIO_OPT_INT,
> +		.off1	= offsetof(struct thread_options, global_max_open_zones),
> +		.maxval	= ZBD_MAX_OPEN_ZONES,
> +		.help	= "Block device limit on the number of simultaneously opened sequential write zones in SMR/ZNS drives",
>  		.def	= "0",
>  		.category = FIO_OPT_C_IO,
>  		.group	= FIO_OPT_G_INVALID,
> diff --git a/thread_options.h b/thread_options.h
> index c78ed43d..4078d46f 100644
> --- a/thread_options.h
> +++ b/thread_options.h
> @@ -342,6 +342,7 @@ struct thread_options {
>  	/* Parameters that affect zonemode=zbd */
>  	unsigned int read_beyond_wp;
>  	int max_open_zones;
> +	unsigned int global_max_open_zones;
>  	fio_fp64_t zrt;
>  	fio_fp64_t zrf;
>  };
> diff --git a/zbd.c b/zbd.c
> index e8f0a4d8..a517349a 100644
> --- a/zbd.c
> +++ b/zbd.c
> @@ -546,8 +546,10 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
>  		return -EINVAL;
>  	}
>  
> -	if (ret == 0)
> +	if (ret == 0) {
>  		f->zbd_info->model = zbd_model;
> +		f->zbd_info->max_open_zones = td->o.global_max_open_zones;
> +	}
>  	return ret;
>  }
>  
> @@ -622,6 +624,27 @@ int zbd_init(struct thread_data *td)
>  	if (!zbd_verify_bs())
>  		return 1;
>  
> +	for_each_file(td, f, i) {
> +		struct zoned_block_device_info *zbd = f->zbd_info;
> +
> +		if (!zbd)
> +			continue;
> +
> +		if (zbd->max_open_zones == 0) {
> +			zbd->max_open_zones = ZBD_MAX_OPEN_ZONES;
> +		}
> +
> +		if (td->o.global_max_open_zones &&
> +		    zbd->max_open_zones != td->o.global_max_open_zones) {
> +			log_err("Different 'global_max_open_zones' values\n");
> +			return 1;
> +		}
> +		if (zbd->max_open_zones > ZBD_MAX_OPEN_ZONES) {
> +			log_err("'global_max_open_zones' value is limited by %u\n", ZBD_MAX_OPEN_ZONES);
> +			return 1;
> +		}
> +	}
> +
>  	return 0;
>  }
>  
> @@ -714,6 +737,7 @@ static void zbd_close_zone(struct thread_data *td, const struct fio_file *f,
>  		(ZBD_MAX_OPEN_ZONES - (open_zone_idx + 1)) *
>  		sizeof(f->zbd_info->open_zones[0]));
>  	f->zbd_info->num_open_zones--;
> +	td->num_open_zones--;
>  	f->zbd_info->zone_info[zone_idx].open = 0;
>  }
>  
> @@ -895,8 +919,10 @@ static bool is_zone_open(const struct thread_data *td, const struct fio_file *f,
>  	struct zoned_block_device_info *zbdi = f->zbd_info;
>  	int i;
>  
> -	assert(td->o.max_open_zones <= ARRAY_SIZE(zbdi->open_zones));
> -	assert(zbdi->num_open_zones <= td->o.max_open_zones);
> +	assert(td->o.max_open_zones == 0 || td->num_open_zones <= td->o.max_open_zones);
> +
> +	assert(td->o.max_open_zones <= zbdi->max_open_zones);
> +	assert(zbdi->num_open_zones <= zbdi->max_open_zones);
>  
>  	for (i = 0; i < zbdi->num_open_zones; i++)
>  		if (zbdi->open_zones[i] == zone_idx)
> @@ -937,10 +963,13 @@ static bool zbd_open_zone(struct thread_data *td, const struct io_u *io_u,
>  	if (is_zone_open(td, f, zone_idx))
>  		goto out;
>  	res = false;
> -	if (f->zbd_info->num_open_zones >= td->o.max_open_zones)
> +	if (td->num_open_zones >= td->o.max_open_zones)
> +		goto out;
> +	if (f->zbd_info->num_open_zones >= f->zbd_info->max_open_zones)
>  		goto out;
>  	dprint(FD_ZBD, "%s: opening zone %d\n", f->file_name, zone_idx);
>  	f->zbd_info->open_zones[f->zbd_info->num_open_zones++] = zone_idx;
> +	td->num_open_zones++;
>  	z->open = 1;
>  	res = true;
>  
> diff --git a/zbd.h b/zbd.h
> index 5a660399..fb39fb82 100644
> --- a/zbd.h
> +++ b/zbd.h
> @@ -45,6 +45,8 @@ struct fio_zone_info {
>  /**
>   * zoned_block_device_info - zoned block device characteristics
>   * @model: Device model.
> + * @max_open_zones: global limit on the number of simultaneously opened
> + *	sequential write zones.
>   * @mutex: Protects the modifiable members in this structure (refcount and
>   *		num_open_zones).
>   * @zone_size: size of a single zone in units of 512 bytes
> @@ -65,6 +67,7 @@ struct fio_zone_info {
>   */
>  struct zoned_block_device_info {
>  	enum zbd_zoned_model	model;
> +	uint32_t		max_open_zones;
>  	pthread_mutex_t		mutex;
>  	uint64_t		zone_size;
>  	uint64_t		sectors_with_data;
> 


-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 4/7] zbd: make zbd_info->mutex non-recursive
  2020-04-30 12:40 ` [PATCH 4/7] zbd: make zbd_info->mutex non-recursive Alexey Dobriyan
@ 2020-05-01  1:36   ` Damien Le Moal
  0 siblings, 0 replies; 22+ messages in thread
From: Damien Le Moal @ 2020-05-01  1:36 UTC (permalink / raw)
  To: Alexey Dobriyan, axboe@kernel.dk; +Cc: fio@vger.kernel.org

On 2020/04/30 21:41, Alexey Dobriyan wrote:
> There is no reason for it to be recursive. Recursiveness leaked from
> zi->mutex.
> 
> Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
> ---
>  zbd.c | 18 ++++++++++++------
>  1 file changed, 12 insertions(+), 6 deletions(-)
> 
> diff --git a/zbd.c b/zbd.c
> index a517349a..18cedfce 100644
> --- a/zbd.c
> +++ b/zbd.c
> @@ -373,9 +373,9 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
>  		return -ENOMEM;
>  
>  	pthread_mutexattr_init(&attr);
> -	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
>  	pthread_mutexattr_setpshared(&attr, true);
>  	pthread_mutex_init(&zbd_info->mutex, &attr);
> +	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
>  	zbd_info->refcount = 1;
>  	p = &zbd_info->zone_info[0];
>  	for (i = 0; i < nr_zones; i++, p++) {
> @@ -417,7 +417,6 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
>  	int i, j, ret = 0;
>  
>  	pthread_mutexattr_init(&attr);
> -	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
>  	pthread_mutexattr_setpshared(&attr, true);
>  
>  	zones = calloc(ZBD_REPORT_MAX_ZONES, sizeof(struct zbd_zone));
> @@ -454,6 +453,7 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
>  	if (!zbd_info)
>  		goto out;
>  	pthread_mutex_init(&zbd_info->mutex, &attr);
> +	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
>  	zbd_info->refcount = 1;
>  	p = &zbd_info->zone_info[0];
>  	for (offset = 0, j = 0; j < nr_zones;) {
> @@ -801,14 +801,20 @@ static int zbd_reset_zones(struct thread_data *td, struct fio_file *f,
>   * Reset zbd_info.write_cnt, the counter that counts down towards the next
>   * zone reset.
>   */
> -static void zbd_reset_write_cnt(const struct thread_data *td,
> -				const struct fio_file *f)
> +static void _zbd_reset_write_cnt(const struct thread_data *td,
> +				 const struct fio_file *f)
>  {
>  	assert(0 <= td->o.zrf.u.f && td->o.zrf.u.f <= 1);
>  
> -	pthread_mutex_lock(&f->zbd_info->mutex);
>  	f->zbd_info->write_cnt = td->o.zrf.u.f ?
>  		min(1.0 / td->o.zrf.u.f, 0.0 + UINT_MAX) : UINT_MAX;
> +}
> +
> +static void zbd_reset_write_cnt(const struct thread_data *td,
> +				const struct fio_file *f)
> +{
> +	pthread_mutex_lock(&f->zbd_info->mutex);
> +	_zbd_reset_write_cnt(td, f);
>  	pthread_mutex_unlock(&f->zbd_info->mutex);
>  }
>  
> @@ -822,7 +828,7 @@ static bool zbd_dec_and_reset_write_cnt(const struct thread_data *td,
>  	if (f->zbd_info->write_cnt)
>  		write_cnt = --f->zbd_info->write_cnt;
>  	if (write_cnt == 0)
> -		zbd_reset_write_cnt(td, f);
> +		_zbd_reset_write_cnt(td, f);
>  	pthread_mutex_unlock(&f->zbd_info->mutex);
>  
>  	return write_cnt == 0;
> 

Looks good.

Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>


-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 5/7] zbd: consolidate zone mutex initialisation
  2020-04-30 12:40 ` [PATCH 5/7] zbd: consolidate zone mutex initialisation Alexey Dobriyan
@ 2020-05-01  1:44   ` Damien Le Moal
  2020-05-01 18:37     ` Alexey Dobriyan
  0 siblings, 1 reply; 22+ messages in thread
From: Damien Le Moal @ 2020-05-01  1:44 UTC (permalink / raw)
  To: Alexey Dobriyan, axboe@kernel.dk; +Cc: fio@vger.kernel.org

On 2020/04/30 21:41, Alexey Dobriyan wrote:
> Another mutex is coming!

That is not very descriptive at all.
What the patch does is to make the per zone mutex initialization code common
between init_zone_info() and parse_zone_info(), moving it to create_zone_info().

> 
> Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
> ---
>  zbd.c | 40 +++++++++++++++++++++-------------------
>  1 file changed, 21 insertions(+), 19 deletions(-)
> 
> diff --git a/zbd.c b/zbd.c
> index 18cedfce..2febc8c3 100644
> --- a/zbd.c
> +++ b/zbd.c
> @@ -351,7 +351,6 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
>  	struct fio_zone_info *p;
>  	uint64_t zone_size = td->o.zone_size;
>  	struct zoned_block_device_info *zbd_info = NULL;
> -	pthread_mutexattr_t attr;
>  	int i;
>  
>  	if (zone_size == 0) {
> @@ -372,14 +371,9 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
>  	if (!zbd_info)
>  		return -ENOMEM;
>  
> -	pthread_mutexattr_init(&attr);
> -	pthread_mutexattr_setpshared(&attr, true);
> -	pthread_mutex_init(&zbd_info->mutex, &attr);
> -	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
>  	zbd_info->refcount = 1;
>  	p = &zbd_info->zone_info[0];
>  	for (i = 0; i < nr_zones; i++, p++) {
> -		pthread_mutex_init(&p->mutex, &attr);
>  		p->start = i * zone_size;
>  		p->wp = p->start + zone_size;
>  		p->type = ZBD_ZONE_TYPE_SWR;
> @@ -393,7 +387,6 @@ static int init_zone_info(struct thread_data *td, struct fio_file *f)
>  	f->zbd_info->zone_size_log2 = is_power_of_2(zone_size) ?
>  		ilog2(zone_size) : 0;
>  	f->zbd_info->nr_zones = nr_zones;
> -	pthread_mutexattr_destroy(&attr);
>  	return 0;
>  }
>  
> @@ -413,12 +406,8 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
>  	struct fio_zone_info *p;
>  	uint64_t zone_size, offset;
>  	struct zoned_block_device_info *zbd_info = NULL;
> -	pthread_mutexattr_t attr;
>  	int i, j, ret = 0;
>  
> -	pthread_mutexattr_init(&attr);
> -	pthread_mutexattr_setpshared(&attr, true);
> -
>  	zones = calloc(ZBD_REPORT_MAX_ZONES, sizeof(struct zbd_zone));
>  	if (!zones)
>  		goto out;
> @@ -452,14 +441,11 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
>  	ret = -ENOMEM;
>  	if (!zbd_info)
>  		goto out;
> -	pthread_mutex_init(&zbd_info->mutex, &attr);
> -	pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
>  	zbd_info->refcount = 1;
>  	p = &zbd_info->zone_info[0];
>  	for (offset = 0, j = 0; j < nr_zones;) {
>  		z = &zones[0];
>  		for (i = 0; i < nrz; i++, j++, z++, p++) {
> -			pthread_mutex_init(&p->mutex, &attr);
>  			p->start = z->start;
>  			switch (z->cond) {
>  			case ZBD_ZONE_COND_NOT_WP:
> @@ -510,7 +496,6 @@ static int parse_zone_info(struct thread_data *td, struct fio_file *f)
>  out:
>  	sfree(zbd_info);
>  	free(zones);
> -	pthread_mutexattr_destroy(&attr);
>  	return ret;
>  }
>  
> @@ -521,6 +506,7 @@ out:
>   */
>  static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
>  {
> +	struct zoned_block_device_info *zbd;
>  	enum zbd_zoned_model zbd_model;
>  	int ret;
>  
> @@ -546,11 +532,27 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
>  		return -EINVAL;
>  	}
>  
> -	if (ret == 0) {
> -		f->zbd_info->model = zbd_model;
> -		f->zbd_info->max_open_zones = td->o.global_max_open_zones;
> +	if (ret)
> +		return ret;
> +
> +	zbd = f->zbd_info;
> +	zbd->model = zbd_model;
> +	zbd->max_open_zones = td->o.global_max_open_zones;
> +	{
> +		pthread_mutexattr_t attr;

If you move this declaration to the beginning of zbd_create_zone_info(), you can
avoid this block code, which does not look pretty.

> +
> +		pthread_mutexattr_init(&attr);
> +		pthread_mutexattr_setpshared(&attr, true);
> +		pthread_mutex_init(&zbd->mutex, &attr);
> +		pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
> +		for (uint32_t z = 0; z < zbd->nr_zones; z++) {
> +			struct fio_zone_info *zi = &zbd->zone_info[z];
> +
> +			pthread_mutex_init(&zi->mutex, &attr);
> +		}
> +		pthread_mutexattr_destroy(&attr);
>  	}
> -	return ret;
> +	return 0;
>  }
>  
>  void zbd_free_zone_info(struct fio_file *f)
> 

With the nits above fixed, Looks good to me.

Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>

-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 6/7] fio: parse "io_size=1%"
  2020-04-30 12:40 ` [PATCH 6/7] fio: parse "io_size=1%" Alexey Dobriyan
@ 2020-05-01  1:51   ` Damien Le Moal
  2020-05-01  6:00     ` Sitsofe Wheeler
  0 siblings, 1 reply; 22+ messages in thread
From: Damien Le Moal @ 2020-05-01  1:51 UTC (permalink / raw)
  To: Alexey Dobriyan, axboe@kernel.dk; +Cc: fio@vger.kernel.org

On 2020/04/30 21:41, Alexey Dobriyan wrote:
> The following config:
> 
> 	zonemode=zbd

I do not think this problem is restricted to zonemode=zbd, no?
If so, maybe remove this from the example config.

> 	size=1%
> 	io_size=1%
> 
> will generate essentially infinite loop, because io_size is (2^64-1)-1.
> 
> Parse io_size as percentage to avoid it.
> 
> Note: there is different bug if size% != io_size% and io_size is ignored
> but it will be separate patch.
> 
> Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
> ---
>  cconv.c          |  1 +
>  options.c        | 17 +++++++++++++++++
>  thread_options.h |  4 ++++
>  3 files changed, 22 insertions(+)
> 
> diff --git a/cconv.c b/cconv.c
> index 48218dc4..d547ae11 100644
> --- a/cconv.c
> +++ b/cconv.c
> @@ -102,6 +102,7 @@ void convert_thread_options_to_cpu(struct thread_options *o,
>  	o->size = le64_to_cpu(top->size);
>  	o->io_size = le64_to_cpu(top->io_size);
>  	o->size_percent = le32_to_cpu(top->size_percent);
> +	o->io_size_percent = le32_to_cpu(top->io_size_percent);
>  	o->fill_device = le32_to_cpu(top->fill_device);
>  	o->file_append = le32_to_cpu(top->file_append);
>  	o->file_size_low = le64_to_cpu(top->file_size_low);
> diff --git a/options.c b/options.c
> index 306874ea..c5d33d1a 100644
> --- a/options.c
> +++ b/options.c
> @@ -1475,6 +1475,22 @@ static int str_size_cb(void *data, unsigned long long *__val)
>  	return 0;
>  }
>  
> +static int str_io_size_cb(void *data, unsigned long long *__val)
> +{
> +	struct thread_data *td = cb_data_to_td(data);
> +	unsigned long long v = *__val;
> +
> +	if (parse_is_percent(v)) {
> +		td->o.io_size = 0;
> +		td->o.io_size_percent = -1ULL - v;
> +		dprint(FD_PARSE, "SET io_size_percent %d\n",
> +					td->o.io_size_percent);
> +	} else
> +		td->o.io_size = v;
> +
> +	return 0;
> +}
> +
>  static int str_write_bw_log_cb(void *data, const char *str)
>  {
>  	struct thread_data *td = cb_data_to_td(data);
> @@ -2042,6 +2058,7 @@ struct fio_option fio_options[FIO_MAX_OPTS] = {
>  		.alias	= "io_limit",
>  		.lname	= "IO Size",
>  		.type	= FIO_OPT_STR_VAL,
> +		.cb	= str_io_size_cb,
>  		.off1	= offsetof(struct thread_options, io_size),
>  		.help	= "Total size of I/O to be performed",
>  		.interval = 1024 * 1024,
> diff --git a/thread_options.h b/thread_options.h
> index 4078d46f..197d2ec2 100644
> --- a/thread_options.h
> +++ b/thread_options.h
> @@ -83,6 +83,7 @@ struct thread_options {
>  	unsigned long long size;
>  	unsigned long long io_size;
>  	unsigned int size_percent;
> +	unsigned int io_size_percent;
>  	unsigned int fill_device;
>  	unsigned int file_append;
>  	unsigned long long file_size_low;
> @@ -377,6 +378,7 @@ struct thread_options_pack {
>  	uint64_t size;
>  	uint64_t io_size;
>  	uint32_t size_percent;
> +	uint32_t io_size_percent;
>  	uint32_t fill_device;
>  	uint32_t file_append;
>  	uint32_t unique_filename;
> @@ -456,6 +458,8 @@ struct thread_options_pack {
>  	struct zone_split zone_split[DDIR_RWDIR_CNT][ZONESPLIT_MAX];
>  	uint32_t zone_split_nr[DDIR_RWDIR_CNT];
>  
> +	uint8_t pad1[4];
> +
>  	fio_fp64_t zipf_theta;
>  	fio_fp64_t pareto_h;
>  	fio_fp64_t gauss_dev;
> 

I think you need to add the option field to struct thread_options_pack too,
otherwise the client/server case will break/not handle the option as expected.
There are also some changes in cconv.c needed. "git grep size_percent" and you
will get where that is needed.


-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 7/7] verify: decouple seed generation from buffer fill
  2020-04-30 12:40 ` [PATCH 7/7] verify: decouple seed generation from buffer fill Alexey Dobriyan
@ 2020-05-01  1:59   ` Damien Le Moal
  0 siblings, 0 replies; 22+ messages in thread
From: Damien Le Moal @ 2020-05-01  1:59 UTC (permalink / raw)
  To: Alexey Dobriyan, axboe@kernel.dk; +Cc: fio@vger.kernel.org

On 2020/04/30 21:41, Alexey Dobriyan wrote:
> Nicer this way and less LOC.

A better description of this cleanup would be nice.

> 
> Signed-off-by: Alexey Dobriyan (SK hynix) <adobriyan@gmail.com>
> ---
>  backend.c |  6 ------
>  verify.c  | 20 +++++++-------------
>  2 files changed, 7 insertions(+), 19 deletions(-)
> 
> diff --git a/backend.c b/backend.c
> index feb34e51..452975cf 100644
> --- a/backend.c
> +++ b/backend.c
> @@ -1006,12 +1006,6 @@ static void do_io(struct thread_data *td, uint64_t *bytes_done)
>  		if (td->o.verify != VERIFY_NONE && io_u->ddir == DDIR_READ &&
>  		    ((io_u->flags & IO_U_F_VER_LIST) || !td_rw(td))) {
>  
> -			if (!td->o.verify_pattern_bytes) {
> -				io_u->rand_seed = __rand(&td->verify_state);
> -				if (sizeof(int) != sizeof(long *))
> -					io_u->rand_seed *= __rand(&td->verify_state);
> -			}
> -
>  			if (verify_state_should_stop(td, io_u)) {
>  				put_io_u(td, io_u);
>  				break;
> diff --git a/verify.c b/verify.c
> index cf299ebf..b7fa6693 100644
> --- a/verify.c
> +++ b/verify.c
> @@ -46,15 +46,6 @@ static void __fill_buffer(struct thread_options *o, uint64_t seed, void *p,
>  	__fill_random_buf_percentage(seed, p, o->compress_percentage, len, len, o->buffer_pattern, o->buffer_pattern_bytes);
>  }
>  
> -static uint64_t fill_buffer(struct thread_data *td, void *p,
> -			    unsigned int len)
> -{
> -	struct frand_state *fs = &td->verify_state;
> -	struct thread_options *o = &td->o;
> -
> -	return fill_random_buf_percentage(fs, p, o->compress_percentage, len, len, o->buffer_pattern, o->buffer_pattern_bytes);
> -}

Since you remove this, you could probably rename __fill_buffer() to
fill_buffer() ? Or do you want to keep the relation with the call to the "__"
version of fill_random_buf_percentage() ?

> -
>  void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len,
>  			 struct io_u *io_u, uint64_t seed, int use_seed)
>  {
> @@ -63,10 +54,13 @@ void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len,
>  	if (!o->verify_pattern_bytes) {
>  		dprint(FD_VERIFY, "fill random bytes len=%u\n", len);
>  
> -		if (use_seed)
> -			__fill_buffer(o, seed, p, len);
> -		else
> -			io_u->rand_seed = fill_buffer(td, p, len);
> +		if (!use_seed) {
> +			seed = __rand(&td->verify_state);
> +			if (sizeof(int) != sizeof(long *))
> +				seed *= (unsigned long)__rand(&td->verify_state);
> +		}
> +		io_u->rand_seed = seed;
> +		__fill_buffer(o, seed, p, len);
>  		return;
>  	}
>  
> 

Otherwise, this looks good to me.

Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>


-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 6/7] fio: parse "io_size=1%"
  2020-05-01  1:51   ` Damien Le Moal
@ 2020-05-01  6:00     ` Sitsofe Wheeler
  0 siblings, 0 replies; 22+ messages in thread
From: Sitsofe Wheeler @ 2020-05-01  6:00 UTC (permalink / raw)
  To: Damien Le Moal; +Cc: Alexey Dobriyan, axboe@kernel.dk, fio@vger.kernel.org

On Fri, 1 May 2020 at 02:52, Damien Le Moal <Damien.LeMoal@wdc.com> wrote:
>
> On 2020/04/30 21:41, Alexey Dobriyan wrote:
> > The following config:
> >
> >       zonemode=zbd
>
> I think you need to add the option field to struct thread_options_pack too,
> otherwise the client/server case will break/not handle the option as expected.

This may also necessitate an increase to FIO_SERVER_VER in server.h...

-- 
Sitsofe | http://sucs.org/~sits/


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES
  2020-05-01  1:19 ` [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Damien Le Moal
@ 2020-05-01 14:47   ` Alexey Dobriyan
  2020-05-02  4:37     ` Damien Le Moal
  0 siblings, 1 reply; 22+ messages in thread
From: Alexey Dobriyan @ 2020-05-01 14:47 UTC (permalink / raw)
  To: Damien Le Moal; +Cc: axboe@kernel.dk, fio@vger.kernel.org

On Fri, May 01, 2020 at 01:19:10AM +0000, Damien Le Moal wrote:
> On 2020/04/30 21:41, Alexey Dobriyan wrote:

> > -#define ZBD_MAX_OPEN_ZONES	128
> > +#define ZBD_MAX_OPEN_ZONES	4096
> >  
> >  /*
> >   * Zoned block device models.
> > 
> 
> Makes sense. ZNS will need this.

Oh yeah :^) 128 opened zones doesn't cut it.

> Note: You did not send a cover letter for the series. Please add it for the next
> round.

There is no cover letter, fixes are really independent.


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 5/7] zbd: consolidate zone mutex initialisation
  2020-05-01  1:44   ` Damien Le Moal
@ 2020-05-01 18:37     ` Alexey Dobriyan
  2020-05-02  4:39       ` Damien Le Moal
  0 siblings, 1 reply; 22+ messages in thread
From: Alexey Dobriyan @ 2020-05-01 18:37 UTC (permalink / raw)
  To: Damien Le Moal; +Cc: axboe@kernel.dk, fio@vger.kernel.org

On Fri, May 01, 2020 at 01:44:26AM +0000, Damien Le Moal wrote:
> On 2020/04/30 21:41, Alexey Dobriyan wrote:
> > Another mutex is coming!
> 
> That is not very descriptive at all.
> What the patch does is to make the per zone mutex initialization code common
> between init_zone_info() and parse_zone_info(), moving it to create_zone_info().

> > @@ -546,11 +532,27 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
> >  		return -EINVAL;
> >  	}
> >  
> > -	if (ret == 0) {
> > -		f->zbd_info->model = zbd_model;
> > -		f->zbd_info->max_open_zones = td->o.global_max_open_zones;
> > +	if (ret)
> > +		return ret;
> > +
> > +	zbd = f->zbd_info;
> > +	zbd->model = zbd_model;
> > +	zbd->max_open_zones = td->o.global_max_open_zones;
> > +	{
> > +		pthread_mutexattr_t attr;
> 
> If you move this declaration to the beginning of zbd_create_zone_info(), you can
> avoid this block code, which does not look pretty.

It is done on purpose, to move declaration closer to usage.
fio is built with -std=gnu99 _and_ -Wdeclaration-after-statement which
prevents

	a = b;
	int c;
	c = d;

modern style of declaration.

> > +		pthread_mutexattr_init(&attr);
> > +		pthread_mutexattr_setpshared(&attr, true);
> > +		pthread_mutex_init(&zbd->mutex, &attr);
> > +		pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
> > +		for (uint32_t z = 0; z < zbd->nr_zones; z++) {
> > +			struct fio_zone_info *zi = &zbd->zone_info[z];
> > +
> > +			pthread_mutex_init(&zi->mutex, &attr);
> > +		}
> > +		pthread_mutexattr_destroy(&attr);
> >  	}
> > -	return ret;
> > +	return 0;
> >  }
> >  
> >  void zbd_free_zone_info(struct fio_file *f)
> > 
> 
> With the nits above fixed, Looks good to me.
> 
> Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/7] zbd: introduce per-device "max_open_zones" limit
  2020-05-01  1:34   ` Damien Le Moal
@ 2020-05-01 18:52     ` Alexey Dobriyan
  2020-05-04  1:41       ` Damien Le Moal
  0 siblings, 1 reply; 22+ messages in thread
From: Alexey Dobriyan @ 2020-05-01 18:52 UTC (permalink / raw)
  To: Damien Le Moal; +Cc: axboe@kernel.dk, fio@vger.kernel.org

On Fri, May 01, 2020 at 01:34:32AM +0000, Damien Le Moal wrote:
> On 2020/04/30 21:41, Alexey Dobriyan wrote:
> > It is not possible to maintain equal per-thread iodepth. The way code
> > is written, "max_open_zones" acts as a global limit, and one thread
> > opens all "max_open_zones" for itself and others starve for available
> > zones and _exit_ prematurely.
> > 
> > This config is guaranteed to make equal number of zone resets/IO now:
> > each thread generates identical pattern and doesn't intersect with other
> > threads:
> > 
> > 	zonemode=zbd
> > 	zonesize=...
> > 	rw=write
> > 
> > 	numjobs=N
> > 	offset_increment=M*zonesize
> > 
> > 	[j]
> > 	size=M*zonesize
> > 
> > Patch introduces "global_max_open_zones" which is per-device config
> > option. "max_open_zones" becomes per-thread limit. Both limits are
> > checked for each open zone so one thread can't starve others.
> 
> It makes sense. Nice one.
> 
> But the change as is will break existing test scripts (e.g. lots of SMR drives
> are being tested with this).

It won't break single-threaded ones, that's for sure.

> I think we can avoid this breakage simply: leave
> max_open_zones option definition as is and add "job_max_open_zones" or
> "thread_max_open_zones" option (no strong feelings about the name here, as long
> as it is explicit) to define the per thread maximum number of open zones. This
> new option could actually default to max_open_zones / numjobs if that is not 0.

I'd argue that such scripts are broken.

If sustained numjobs*max_open_zones QD is desired then it is not
guaranteed, as threads will simply exit at indeterminate times,
which breaks LBA space coverage as well.

Right now, numjobs= + max_open_zones= means "max open zones by at most
"numjobs" threads.


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES
  2020-05-01 14:47   ` Alexey Dobriyan
@ 2020-05-02  4:37     ` Damien Le Moal
  0 siblings, 0 replies; 22+ messages in thread
From: Damien Le Moal @ 2020-05-02  4:37 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: axboe@kernel.dk, fio@vger.kernel.org

On 2020/05/01 23:47, Alexey Dobriyan wrote:
> On Fri, May 01, 2020 at 01:19:10AM +0000, Damien Le Moal wrote:
>> On 2020/04/30 21:41, Alexey Dobriyan wrote:
> 
>>> -#define ZBD_MAX_OPEN_ZONES	128
>>> +#define ZBD_MAX_OPEN_ZONES	4096
>>>  
>>>  /*
>>>   * Zoned block device models.
>>>
>>
>> Makes sense. ZNS will need this.
> 
> Oh yeah :^) 128 opened zones doesn't cut it.
> 
>> Note: You did not send a cover letter for the series. Please add it for the next
>> round.
> 
> There is no cover letter, fixes are really independent.

Even in such case, a cover letter is still useful for Jens to ack all patches in
one go instead of having to answer each patch email.


-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 5/7] zbd: consolidate zone mutex initialisation
  2020-05-01 18:37     ` Alexey Dobriyan
@ 2020-05-02  4:39       ` Damien Le Moal
  0 siblings, 0 replies; 22+ messages in thread
From: Damien Le Moal @ 2020-05-02  4:39 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: axboe@kernel.dk, fio@vger.kernel.org

On 2020/05/02 3:37, Alexey Dobriyan wrote:
> On Fri, May 01, 2020 at 01:44:26AM +0000, Damien Le Moal wrote:
>> On 2020/04/30 21:41, Alexey Dobriyan wrote:
>>> Another mutex is coming!
>>
>> That is not very descriptive at all.
>> What the patch does is to make the per zone mutex initialization code common
>> between init_zone_info() and parse_zone_info(), moving it to create_zone_info().
> 
>>> @@ -546,11 +532,27 @@ static int zbd_create_zone_info(struct thread_data *td, struct fio_file *f)
>>>  		return -EINVAL;
>>>  	}
>>>  
>>> -	if (ret == 0) {
>>> -		f->zbd_info->model = zbd_model;
>>> -		f->zbd_info->max_open_zones = td->o.global_max_open_zones;
>>> +	if (ret)
>>> +		return ret;
>>> +
>>> +	zbd = f->zbd_info;
>>> +	zbd->model = zbd_model;
>>> +	zbd->max_open_zones = td->o.global_max_open_zones;
>>> +	{
>>> +		pthread_mutexattr_t attr;
>>
>> If you move this declaration to the beginning of zbd_create_zone_info(), you can
>> avoid this block code, which does not look pretty.
> 
> It is done on purpose, to move declaration closer to usage.

I understood that. But it is visually not pleasing in my opinion and does not
match the general coding style used in fio.
Just make that mutex initialization code block an inline function. That will
cleanup things nicely.

> fio is built with -std=gnu99 _and_ -Wdeclaration-after-statement which
> prevents
> 
> 	a = b;
> 	int c;
> 	c = d;
> 
> modern style of declaration.
> 
>>> +		pthread_mutexattr_init(&attr);
>>> +		pthread_mutexattr_setpshared(&attr, true);
>>> +		pthread_mutex_init(&zbd->mutex, &attr);
>>> +		pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
>>> +		for (uint32_t z = 0; z < zbd->nr_zones; z++) {
>>> +			struct fio_zone_info *zi = &zbd->zone_info[z];
>>> +
>>> +			pthread_mutex_init(&zi->mutex, &attr);
>>> +		}
>>> +		pthread_mutexattr_destroy(&attr);
>>>  	}
>>> -	return ret;
>>> +	return 0;
>>>  }
>>>  
>>>  void zbd_free_zone_info(struct fio_file *f)
>>>
>>
>> With the nits above fixed, Looks good to me.
>>
>> Reviewed-by: Damien Le Moal <damien.lemoal@wdc.com>
> 


-- 
Damien Le Moal
Western Digital Research


^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/7] zbd: introduce per-device "max_open_zones" limit
  2020-05-01 18:52     ` Alexey Dobriyan
@ 2020-05-04  1:41       ` Damien Le Moal
  2020-05-04 12:15         ` Alexey Dobriyan
  0 siblings, 1 reply; 22+ messages in thread
From: Damien Le Moal @ 2020-05-04  1:41 UTC (permalink / raw)
  To: Alexey Dobriyan; +Cc: axboe@kernel.dk, fio@vger.kernel.org

Alexey,

On 2020/05/02 3:52, Alexey Dobriyan wrote:
> On Fri, May 01, 2020 at 01:34:32AM +0000, Damien Le Moal wrote:
>> On 2020/04/30 21:41, Alexey Dobriyan wrote:
>>> It is not possible to maintain equal per-thread iodepth. The way code
>>> is written, "max_open_zones" acts as a global limit, and one thread
>>> opens all "max_open_zones" for itself and others starve for available
>>> zones and _exit_ prematurely.
>>>
>>> This config is guaranteed to make equal number of zone resets/IO now:
>>> each thread generates identical pattern and doesn't intersect with other
>>> threads:
>>>
>>> 	zonemode=zbd
>>> 	zonesize=...
>>> 	rw=write
>>>
>>> 	numjobs=N
>>> 	offset_increment=M*zonesize
>>>
>>> 	[j]
>>> 	size=M*zonesize
>>>
>>> Patch introduces "global_max_open_zones" which is per-device config
>>> option. "max_open_zones" becomes per-thread limit. Both limits are
>>> checked for each open zone so one thread can't starve others.
>>
>> It makes sense. Nice one.
>>
>> But the change as is will break existing test scripts (e.g. lots of SMR drives
>> are being tested with this).
> 
> It won't break single-threaded ones, that's for sure.

Yes, but things like:

fio --ioengine=psync --rw=randwr --max_open_zones=128 --numjobs=32

will change behavior. With your change, instead of 32 threads writing randomly
to a total of 128 zones, you will get 32 threads each writing randomly to 128
zones, with a total of 32*128=4096 zones.

SMR drives and zonemode=zbd have now been around for a while and there are a lot
of fio scripts deployed in production for system validation/tests, as well as in
drive development for testing. If we can avoid breaking that, we absolutely must.

My proposal to keep max_open_zones as the per device maximum and introducing a
thread_max_open_zones limit keeps backward compatibility with existing scripts
while still allowing your change.

> 
>> I think we can avoid this breakage simply: leave
>> max_open_zones option definition as is and add "job_max_open_zones" or
>> "thread_max_open_zones" option (no strong feelings about the name here, as long
>> as it is explicit) to define the per thread maximum number of open zones. This
>> new option could actually default to max_open_zones / numjobs if that is not 0.
> 
> I'd argue that such scripts are broken.

See the above example. It is a perfectly valid script, not broken at all.
Varying the number of max_open_zones allows measuring the performance variation
of a drive with the number of implicitly open zones. It is a common one that I
have seen a lot in drive development and production. There are likely other
valid ones too. Assuming that all current uses of max_open_zones with multi-jobs
workloads are broken would be a mistake.

> 
> If sustained numjobs*max_open_zones QD is desired, then it is not
> guaranteed, as threads will simply exit at indeterminate times,
> which breaks LBA space coverage as well.
> 
> Right now, numjobs= + max_open_zones= means "max open zones by at most
> "numjobs" threads.

I understand that. And we should keep it that way for the reasons mentioned
above. Modifying your change with the option thread_max_open_zones will nicely
enhance it. E.g.

fio --ioengine=libaio --iodepth=8 --rw=randwr --thread_max_open_zones=1 --numjobs=8

Will result in 8 threads writing a single randomly chosen zone at QD=8. And that
is the same as your proposed:

fio --ioengine=libaio --iodepth=8 --rw=randwr --max_open_zones=1 --numjobs=8

but without breaking the existing meaning of max_open_zones as a per drive/file
limit.

I totally agree with your change. It is a nice one. But let's preserve
max_open_zones meaning as the per device limit. No need to change it.

Best regards.

-- 
Damien Le Moal
Western Digital Research




^ permalink raw reply	[flat|nested] 22+ messages in thread

* Re: [PATCH 3/7] zbd: introduce per-device "max_open_zones" limit
  2020-05-04  1:41       ` Damien Le Moal
@ 2020-05-04 12:15         ` Alexey Dobriyan
  0 siblings, 0 replies; 22+ messages in thread
From: Alexey Dobriyan @ 2020-05-04 12:15 UTC (permalink / raw)
  To: Damien Le Moal; +Cc: axboe@kernel.dk, fio@vger.kernel.org

On Mon, May 04, 2020 at 01:41:14AM +0000, Damien Le Moal wrote:
> Alexey,
> 
> On 2020/05/02 3:52, Alexey Dobriyan wrote:
> > On Fri, May 01, 2020 at 01:34:32AM +0000, Damien Le Moal wrote:
> >> On 2020/04/30 21:41, Alexey Dobriyan wrote:
> >>> It is not possible to maintain equal per-thread iodepth. The way code
> >>> is written, "max_open_zones" acts as a global limit, and one thread
> >>> opens all "max_open_zones" for itself and others starve for available
> >>> zones and _exit_ prematurely.
> >>>
> >>> This config is guaranteed to make equal number of zone resets/IO now:
> >>> each thread generates identical pattern and doesn't intersect with other
> >>> threads:
> >>>
> >>> 	zonemode=zbd
> >>> 	zonesize=...
> >>> 	rw=write
> >>>
> >>> 	numjobs=N
> >>> 	offset_increment=M*zonesize
> >>>
> >>> 	[j]
> >>> 	size=M*zonesize
> >>>
> >>> Patch introduces "global_max_open_zones" which is per-device config
> >>> option. "max_open_zones" becomes per-thread limit. Both limits are
> >>> checked for each open zone so one thread can't starve others.
> >>
> >> It makes sense. Nice one.
> >>
> >> But the change as is will break existing test scripts (e.g. lots of SMR drives
> >> are being tested with this).
> > 
> > It won't break single-threaded ones, that's for sure.
> 
> Yes, but things like:
> 
> fio --ioengine=psync --rw=randwr --max_open_zones=128 --numjobs=32
> 
> will change behavior. With your change, instead of 32 threads writing randomly
> to a total of 128 zones, you will get 32 threads each writing randomly to 128
> zones, with a total of 32*128=4096 zones.
> 
> SMR drives and zonemode=zbd have now been around for a while and there are a lot
> of fio scripts deployed in production for system validation/tests, as well as in
> drive development for testing. If we can avoid breaking that, we absolutely must.
> 
> My proposal to keep max_open_zones as the per device maximum and introducing a
> thread_max_open_zones limit keeps backward compatibility with existing scripts
> while still allowing your change.
> 
> > 
> >> I think we can avoid this breakage simply: leave
> >> max_open_zones option definition as is and add "job_max_open_zones" or
> >> "thread_max_open_zones" option (no strong feelings about the name here, as long
> >> as it is explicit) to define the per thread maximum number of open zones. This
> >> new option could actually default to max_open_zones / numjobs if that is not 0.
> > 
> > I'd argue that such scripts are broken.
> 
> See the above example. It is a perfectly valid script, not broken at all.

It is broken in the sense that the script doesn't test what its author thinks it tests.
max_open_zones= + numjobs= can only be used as random stress smoke test, nothing
more. Patch actually increases stress level :-)

I assume that if the open zone command fails due to hardware limitations, the thread
can and will exit just as easily.

> Varying the number of max_open_zones allows measuring the performance variation
> of a drive with the number of implicitly open zones. It is a common one that I
> have seen a lot in drive development and production. There are likely other
> valid ones too. Assuming that all current uses of max_open_zones with multi-jobs
> workloads are broken would be a mistake.
> 
> > 
> > If sustained numjobs*max_open_zones QD is desired, then it is not
> > guaranteed, as threads will simply exit at indeterminate times,
> > which breaks LBA space coverage as well.
> > 
> > Right now, numjobs= + max_open_zones= means "max open zones by at most
> > "numjobs" threads.
> 
> I understand that. And we should keep it that way for the reasons mentioned
> above. Modifying your change with the option thread_max_open_zones will nicely
> enhance it. E.g.
> 
> fio --ioengine=libaio --iodepth=8 --rw=randwr --thread_max_open_zones=1 --numjobs=8
> 
> Will result in 8 threads writing a single randomly chosen zone at QD=8. And that
> is the same as your proposed:
> 
> fio --ioengine=libaio --iodepth=8 --rw=randwr --max_open_zones=1 --numjobs=8
> 
> but without breaking the existing meaning of max_open_zones as a per drive/file
> limit.
> 
> I totally agree with your change. It is a nice one. But let's preserve
> max_open_zones meaning as the per device limit. No need to change it.

OK I'll resend but I'll call it "job_max_open_zones".
It doesn't help that fio doesn't have a notion of a per-file/device option.


^ permalink raw reply	[flat|nested] 22+ messages in thread

end of thread, other threads:[~2020-05-04 12:15 UTC | newest]

Thread overview: 22+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2020-04-30 12:40 [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Alexey Dobriyan
2020-04-30 12:40 ` [PATCH 2/7] zbd: don't lock zones outside working area Alexey Dobriyan
2020-05-01  1:27   ` Damien Le Moal
2020-04-30 12:40 ` [PATCH 3/7] zbd: introduce per-device "max_open_zones" limit Alexey Dobriyan
2020-05-01  1:34   ` Damien Le Moal
2020-05-01 18:52     ` Alexey Dobriyan
2020-05-04  1:41       ` Damien Le Moal
2020-05-04 12:15         ` Alexey Dobriyan
2020-04-30 12:40 ` [PATCH 4/7] zbd: make zbd_info->mutex non-recursive Alexey Dobriyan
2020-05-01  1:36   ` Damien Le Moal
2020-04-30 12:40 ` [PATCH 5/7] zbd: consolidate zone mutex initialisation Alexey Dobriyan
2020-05-01  1:44   ` Damien Le Moal
2020-05-01 18:37     ` Alexey Dobriyan
2020-05-02  4:39       ` Damien Le Moal
2020-04-30 12:40 ` [PATCH 6/7] fio: parse "io_size=1%" Alexey Dobriyan
2020-05-01  1:51   ` Damien Le Moal
2020-05-01  6:00     ` Sitsofe Wheeler
2020-04-30 12:40 ` [PATCH 7/7] verify: decouple seed generation from buffer fill Alexey Dobriyan
2020-05-01  1:59   ` Damien Le Moal
2020-05-01  1:19 ` [PATCH 1/7] zbd: bump ZBD_MAX_OPEN_ZONES Damien Le Moal
2020-05-01 14:47   ` Alexey Dobriyan
2020-05-02  4:37     ` Damien Le Moal

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox