* [PATCH 0/2] null_blk: zone support
@ 2018-07-06 17:38 Matias Bjørling
2018-07-06 17:38 ` [PATCH 1/2] null_blk: move shared definitions to header file Matias Bjørling
` (3 more replies)
0 siblings, 4 replies; 16+ messages in thread
From: Matias Bjørling @ 2018-07-06 17:38 UTC (permalink / raw)
To: axboe
Cc: linux-block, linux-kernel, bart.vanassche, damien.lemoal,
Matias Bjørling
This series adds support for exposing a zone block device using the
null_blk device driver.
The first patch moves core null_blk data structures to a shared header
file. The second implements the actual zone support. The patchset adds
two new options. One to enable the zone interface, and another to
define the size of the zones to expose.
Thanks,
Matias
Matias Bjørling (2):
null_blk: move shared definitions to header file
null_blk: add zone support
Documentation/block/null_blk.txt | 7 ++
drivers/block/Makefile | 5 +-
drivers/block/null_blk.c | 124 ++++++++++++--------------------
drivers/block/null_blk.h | 108 ++++++++++++++++++++++++++++
drivers/block/null_blk_zoned.c | 149 +++++++++++++++++++++++++++++++++++++++
5 files changed, 315 insertions(+), 78 deletions(-)
create mode 100644 drivers/block/null_blk.h
create mode 100644 drivers/block/null_blk_zoned.c
--
2.11.0
^ permalink raw reply [flat|nested] 16+ messages in thread* [PATCH 1/2] null_blk: move shared definitions to header file 2018-07-06 17:38 [PATCH 0/2] null_blk: zone support Matias Bjørling @ 2018-07-06 17:38 ` Matias Bjørling 2018-07-06 17:38 ` [PATCH 2/2] null_blk: add zone support Matias Bjørling ` (2 subsequent siblings) 3 siblings, 0 replies; 16+ messages in thread From: Matias Bjørling @ 2018-07-06 17:38 UTC (permalink / raw) To: axboe Cc: linux-block, linux-kernel, bart.vanassche, damien.lemoal, Matias Bjørling From: Matias Bjørling <matias.bjorling@wdc.com> Split the null_blk device driver, such that it can prepare for zoned block interface support. Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com> --- drivers/block/null_blk.c | 76 +-------------------------------------------- drivers/block/null_blk.h | 80 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+), 75 deletions(-) create mode 100644 drivers/block/null_blk.h diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 042c778e5a4e..cd4b0849d3b4 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -7,14 +7,8 @@ #include <linux/moduleparam.h> #include <linux/sched.h> #include <linux/fs.h> -#include <linux/blkdev.h> #include <linux/init.h> -#include <linux/slab.h> -#include <linux/blk-mq.h> -#include <linux/hrtimer.h> -#include <linux/configfs.h> -#include <linux/badblocks.h> -#include <linux/fault-inject.h> +#include "null_blk.h" #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) @@ -35,28 +29,6 @@ static inline u64 mb_per_tick(int mbps) return (1 << 20) / TICKS_PER_SEC * ((u64) mbps); } -struct nullb_cmd { - struct list_head list; - struct llist_node ll_list; - struct __call_single_data csd; - struct request *rq; - struct bio *bio; - unsigned int tag; - blk_status_t error; - struct nullb_queue *nq; - struct hrtimer timer; -}; - -struct nullb_queue { - unsigned long *tag_map; - wait_queue_head_t wait; - unsigned int queue_depth; - struct nullb_device *dev; - unsigned int requeue_selection; - - struct nullb_cmd *cmds; -}; - /* * Status flags for nullb_device. * @@ -92,52 +64,6 @@ struct nullb_page { #define NULLB_PAGE_LOCK (MAP_SZ - 1) #define NULLB_PAGE_FREE (MAP_SZ - 2) -struct nullb_device { - struct nullb *nullb; - struct config_item item; - struct radix_tree_root data; /* data stored in the disk */ - struct radix_tree_root cache; /* disk cache data */ - unsigned long flags; /* device flags */ - unsigned int curr_cache; - struct badblocks badblocks; - - unsigned long size; /* device size in MB */ - unsigned long completion_nsec; /* time in ns to complete a request */ - unsigned long cache_size; /* disk cache size in MB */ - unsigned int submit_queues; /* number of submission queues */ - unsigned int home_node; /* home node for the device */ - unsigned int queue_mode; /* block interface */ - unsigned int blocksize; /* block size */ - unsigned int irqmode; /* IRQ completion handler */ - unsigned int hw_queue_depth; /* queue depth */ - unsigned int index; /* index of the disk, only valid with a disk */ - unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ - bool blocking; /* blocking blk-mq device */ - bool use_per_node_hctx; /* use per-node allocation for hardware context */ - bool power; /* power on/off the device */ - bool memory_backed; /* if data is stored in memory */ - bool discard; /* if support discard */ -}; - -struct nullb { - struct nullb_device *dev; - struct list_head list; - unsigned int index; - struct request_queue *q; - struct gendisk *disk; - struct blk_mq_tag_set *tag_set; - struct blk_mq_tag_set __tag_set; - unsigned int queue_depth; - atomic_long_t cur_bytes; - struct hrtimer bw_timer; - unsigned long cache_flush_pos; - spinlock_t lock; - - struct nullb_queue *queues; - unsigned int nr_queues; - char disk_name[DISK_NAME_LEN]; -}; - static LIST_HEAD(nullb_list); static struct mutex lock; static int null_major; diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h new file mode 100644 index 000000000000..d82c5501806d --- /dev/null +++ b/drivers/block/null_blk.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __BLK_NULL_BLK_H +#define __BLK_NULL_BLK_H + +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/blk-mq.h> +#include <linux/hrtimer.h> +#include <linux/configfs.h> +#include <linux/badblocks.h> +#include <linux/fault-inject.h> + +struct nullb_cmd { + struct list_head list; + struct llist_node ll_list; + struct __call_single_data csd; + struct request *rq; + struct bio *bio; + unsigned int tag; + blk_status_t error; + struct nullb_queue *nq; + struct hrtimer timer; +}; + +struct nullb_queue { + unsigned long *tag_map; + wait_queue_head_t wait; + unsigned int queue_depth; + struct nullb_device *dev; + unsigned int requeue_selection; + + struct nullb_cmd *cmds; +}; + +struct nullb_device { + struct nullb *nullb; + struct config_item item; + struct radix_tree_root data; /* data stored in the disk */ + struct radix_tree_root cache; /* disk cache data */ + unsigned long flags; /* device flags */ + unsigned int curr_cache; + struct badblocks badblocks; + + unsigned long size; /* device size in MB */ + unsigned long completion_nsec; /* time in ns to complete a request */ + unsigned long cache_size; /* disk cache size in MB */ + unsigned int submit_queues; /* number of submission queues */ + unsigned int home_node; /* home node for the device */ + unsigned int queue_mode; /* block interface */ + unsigned int blocksize; /* block size */ + unsigned int irqmode; /* IRQ completion handler */ + unsigned int hw_queue_depth; /* queue depth */ + unsigned int index; /* index of the disk, only valid with a disk */ + unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */ + bool blocking; /* blocking blk-mq device */ + bool use_per_node_hctx; /* use per-node allocation for hardware context */ + bool power; /* power on/off the device */ + bool memory_backed; /* if data is stored in memory */ + bool discard; /* if support discard */ +}; + +struct nullb { + struct nullb_device *dev; + struct list_head list; + unsigned int index; + struct request_queue *q; + struct gendisk *disk; + struct blk_mq_tag_set *tag_set; + struct blk_mq_tag_set __tag_set; + unsigned int queue_depth; + atomic_long_t cur_bytes; + struct hrtimer bw_timer; + unsigned long cache_flush_pos; + spinlock_t lock; + + struct nullb_queue *queues; + unsigned int nr_queues; + char disk_name[DISK_NAME_LEN]; +}; +#endif /* __NULL_BLK_H */ -- 2.11.0 ^ permalink raw reply related [flat|nested] 16+ messages in thread
* [PATCH 2/2] null_blk: add zone support 2018-07-06 17:38 [PATCH 0/2] null_blk: zone support Matias Bjørling 2018-07-06 17:38 ` [PATCH 1/2] null_blk: move shared definitions to header file Matias Bjørling @ 2018-07-06 17:38 ` Matias Bjørling 2018-07-06 17:45 ` [PATCH 0/2] null_blk: " Laurence Oberman 2018-07-07 2:54 ` [PATCH 0/2] null_blk: zone support Jens Axboe 3 siblings, 0 replies; 16+ messages in thread From: Matias Bjørling @ 2018-07-06 17:38 UTC (permalink / raw) To: axboe Cc: linux-block, linux-kernel, bart.vanassche, damien.lemoal, Matias Bjørling From: Matias Bjørling <matias.bjorling@wdc.com> Adds support for exposing a null_blk device through the zone device interface. The interface is managed with the parameters zoned and zone_size. If zoned is set, the null_blk instance registers as a zoned block device. The zone_size parameter defines how big each zone will be. Signed-off-by: Matias Bjørling <matias.bjorling@wdc.com> Signed-off-by: Bart Van Assche <bart.vanassche@wdc.com> Signed-off-by: Damien Le Moal <damien.lemoal@wdc.com> --- Documentation/block/null_blk.txt | 7 ++ drivers/block/Makefile | 5 +- drivers/block/null_blk.c | 48 ++++++++++++- drivers/block/null_blk.h | 28 ++++++++ drivers/block/null_blk_zoned.c | 149 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 234 insertions(+), 3 deletions(-) create mode 100644 drivers/block/null_blk_zoned.c diff --git a/Documentation/block/null_blk.txt b/Documentation/block/null_blk.txt index 07f147381f32..ea2dafe49ae8 100644 --- a/Documentation/block/null_blk.txt +++ b/Documentation/block/null_blk.txt @@ -85,3 +85,10 @@ shared_tags=[0/1]: Default: 0 0: Tag set is not shared. 1: Tag set shared between devices for blk-mq. Only makes sense with nr_devices > 1, otherwise there's no tag set to share. + +zoned=[0/1]: Default: 0 + 0: Block device is exposed as a random-access block device. + 1: Block device is exposed as a host-managed zoned block device. + +zone_size=[MB]: Default: 256 + Per zone size when exposed as a zoned block device. Must be a power of two. diff --git a/drivers/block/Makefile b/drivers/block/Makefile index dc061158b403..a0d88aa0c05d 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -36,8 +36,11 @@ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx/ obj-$(CONFIG_BLK_DEV_RSXX) += rsxx/ -obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk.o obj-$(CONFIG_ZRAM) += zram/ +obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk_mod.o +null_blk_mod-objs := null_blk.o +null_blk_mod-$(CONFIG_BLK_DEV_ZONED) += null_blk_zoned.o + skd-y := skd_main.o swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index cd4b0849d3b4..99b6bfe7abd1 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -180,6 +180,14 @@ static bool g_use_per_node_hctx; module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444); MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false"); +static bool g_zoned; +module_param_named(zoned, g_zoned, bool, S_IRUGO); +MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false"); + +static unsigned long g_zone_size = 256; +module_param_named(zone_size, g_zone_size, ulong, S_IRUGO); +MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256"); + static struct nullb_device *null_alloc_dev(void); static void null_free_dev(struct nullb_device *dev); static void null_del_dev(struct nullb *nullb); @@ -283,6 +291,8 @@ NULLB_DEVICE_ATTR(memory_backed, bool); NULLB_DEVICE_ATTR(discard, bool); NULLB_DEVICE_ATTR(mbps, uint); NULLB_DEVICE_ATTR(cache_size, ulong); +NULLB_DEVICE_ATTR(zoned, bool); +NULLB_DEVICE_ATTR(zone_size, ulong); static ssize_t nullb_device_power_show(struct config_item *item, char *page) { @@ -394,6 +404,8 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_mbps, &nullb_device_attr_cache_size, &nullb_device_attr_badblocks, + &nullb_device_attr_zoned, + &nullb_device_attr_zone_size, NULL, }; @@ -446,7 +458,7 @@ nullb_group_drop_item(struct config_group *group, struct config_item *item) static ssize_t memb_group_features_show(struct config_item *item, char *page) { - return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks\n"); + return snprintf(page, PAGE_SIZE, "memory_backed,discard,bandwidth,cache,badblocks,zoned,zone_size\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -505,6 +517,8 @@ static struct nullb_device *null_alloc_dev(void) dev->hw_queue_depth = g_hw_queue_depth; dev->blocking = g_blocking; dev->use_per_node_hctx = g_use_per_node_hctx; + dev->zoned = g_zoned; + dev->zone_size = g_zone_size; return dev; } @@ -513,6 +527,7 @@ static void null_free_dev(struct nullb_device *dev) if (!dev) return; + null_zone_exit(dev); badblocks_exit(&dev->badblocks); kfree(dev); } @@ -1145,6 +1160,11 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) struct nullb *nullb = dev->nullb; int err = 0; + if (req_op(cmd->rq) == REQ_OP_ZONE_REPORT) { + cmd->error = null_zone_report(nullb, cmd); + goto out; + } + if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) { struct request *rq = cmd->rq; @@ -1209,6 +1229,13 @@ static blk_status_t null_handle_cmd(struct nullb_cmd *cmd) } } cmd->error = errno_to_blk_status(err); + + if (!cmd->error && dev->zoned) { + if (req_op(cmd->rq) == REQ_OP_WRITE) + null_zone_write(cmd); + else if (req_op(cmd->rq) == REQ_OP_ZONE_RESET) + null_zone_reset(cmd); + } out: /* Complete IO by inline, softirq or timer */ switch (dev->irqmode) { @@ -1736,6 +1763,15 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_flush_queueable(nullb->q, true); } + if (dev->zoned) { + rv = null_zone_init(dev); + if (rv) + goto out_cleanup_blk_queue; + + blk_queue_chunk_sectors(nullb->q, dev->zone_size_sects); + nullb->q->limits.zoned = BLK_ZONED_HM; + } + nullb->q->queuedata = nullb; blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); @@ -1754,13 +1790,16 @@ static int null_add_dev(struct nullb_device *dev) rv = null_gendisk_register(nullb); if (rv) - goto out_cleanup_blk_queue; + goto out_cleanup_zone; mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); mutex_unlock(&lock); return 0; +out_cleanup_zone: + if (dev->zoned) + null_zone_exit(dev); out_cleanup_blk_queue: blk_cleanup_queue(nullb->q); out_cleanup_tags: @@ -1787,6 +1826,11 @@ static int __init null_init(void) g_bs = PAGE_SIZE; } + if (!is_power_of_2(g_zone_size)) { + pr_err("null_blk: zone_size must be power-of-two\n"); + return -EINVAL; + } + if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { pr_warn("null_blk: submit_queues param is set to %u.\n", diff --git a/drivers/block/null_blk.h b/drivers/block/null_blk.h index d82c5501806d..d81781f22dba 100644 --- a/drivers/block/null_blk.h +++ b/drivers/block/null_blk.h @@ -41,9 +41,14 @@ struct nullb_device { unsigned int curr_cache; struct badblocks badblocks; + unsigned int nr_zones; + struct blk_zone *zones; + sector_t zone_size_sects; + unsigned long size; /* device size in MB */ unsigned long completion_nsec; /* time in ns to complete a request */ unsigned long cache_size; /* disk cache size in MB */ + unsigned long zone_size; /* zone size in MB if device is zoned */ unsigned int submit_queues; /* number of submission queues */ unsigned int home_node; /* home node for the device */ unsigned int queue_mode; /* block interface */ @@ -57,6 +62,7 @@ struct nullb_device { bool power; /* power on/off the device */ bool memory_backed; /* if data is stored in memory */ bool discard; /* if support discard */ + bool zoned; /* if device is zoned */ }; struct nullb { @@ -77,4 +83,26 @@ struct nullb { unsigned int nr_queues; char disk_name[DISK_NAME_LEN]; }; + +#ifdef CONFIG_BLK_DEV_ZONED +int null_zone_init(struct nullb_device *dev); +void null_zone_exit(struct nullb_device *dev); +blk_status_t null_zone_report(struct nullb *nullb, + struct nullb_cmd *cmd); +void null_zone_write(struct nullb_cmd *cmd); +void null_zone_reset(struct nullb_cmd *cmd); +#else +static inline int null_zone_init(struct nullb_device *dev) +{ + return -EINVAL; +} +static inline void null_zone_exit(struct nullb_device *dev) {} +static inline blk_status_t null_zone_report(struct nullb *nullb, + struct nullb_cmd *cmd) +{ + return BLK_STS_NOTSUPP; +} +static inline void null_zone_write(struct nullb_cmd *cmd) {} +static inline void null_zone_reset(struct nullb_cmd *cmd) {} +#endif /* CONFIG_BLK_DEV_ZONED */ #endif /* __NULL_BLK_H */ diff --git a/drivers/block/null_blk_zoned.c b/drivers/block/null_blk_zoned.c new file mode 100644 index 000000000000..a979ca00d7be --- /dev/null +++ b/drivers/block/null_blk_zoned.c @@ -0,0 +1,149 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <linux/vmalloc.h> +#include "null_blk.h" + +/* zone_size in MBs to sectors. */ +#define ZONE_SIZE_SHIFT 11 + +static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect) +{ + return sect >> ilog2(dev->zone_size_sects); +} + +int null_zone_init(struct nullb_device *dev) +{ + sector_t dev_size = (sector_t)dev->size * 1024 * 1024; + sector_t sector = 0; + unsigned int i; + + if (!is_power_of_2(dev->zone_size)) { + pr_err("null_blk: zone_size must be power-of-two\n"); + return -EINVAL; + } + + dev->zone_size_sects = dev->zone_size << ZONE_SIZE_SHIFT; + dev->nr_zones = dev_size >> + (SECTOR_SHIFT + ilog2(dev->zone_size_sects)); + dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct blk_zone), + GFP_KERNEL | __GFP_ZERO); + if (!dev->zones) + return -ENOMEM; + + for (i = 0; i < dev->nr_zones; i++) { + struct blk_zone *zone = &dev->zones[i]; + + zone->start = zone->wp = sector; + zone->len = dev->zone_size_sects; + zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ; + zone->cond = BLK_ZONE_COND_EMPTY; + + sector += dev->zone_size_sects; + } + + return 0; +} + +void null_zone_exit(struct nullb_device *dev) +{ + kvfree(dev->zones); +} + +static void null_zone_fill_rq(struct nullb_device *dev, struct request *rq, + unsigned int zno, unsigned int nr_zones) +{ + struct blk_zone_report_hdr *hdr = NULL; + struct bio_vec bvec; + struct bvec_iter iter; + void *addr; + unsigned int zones_to_cpy; + + bio_for_each_segment(bvec, rq->bio, iter) { + addr = kmap_atomic(bvec.bv_page); + + zones_to_cpy = bvec.bv_len / sizeof(struct blk_zone); + + if (!hdr) { + hdr = (struct blk_zone_report_hdr *)addr; + hdr->nr_zones = nr_zones; + zones_to_cpy--; + addr += sizeof(struct blk_zone_report_hdr); + } + + zones_to_cpy = min_t(unsigned int, zones_to_cpy, nr_zones); + + memcpy(addr, &dev->zones[zno], + zones_to_cpy * sizeof(struct blk_zone)); + + kunmap_atomic(addr); + + nr_zones -= zones_to_cpy; + zno += zones_to_cpy; + + if (!nr_zones) + break; + } +} + +blk_status_t null_zone_report(struct nullb *nullb, + struct nullb_cmd *cmd) +{ + struct nullb_device *dev = nullb->dev; + struct request *rq = cmd->rq; + unsigned int zno = null_zone_no(dev, blk_rq_pos(rq)); + unsigned int nr_zones = dev->nr_zones - zno; + unsigned int max_zones = (blk_rq_bytes(rq) / + sizeof(struct blk_zone)) - 1; + + nr_zones = min_t(unsigned int, nr_zones, max_zones); + + null_zone_fill_rq(nullb->dev, rq, zno, nr_zones); + + return BLK_STS_OK; +} + +void null_zone_write(struct nullb_cmd *cmd) +{ + struct nullb_device *dev = cmd->nq->dev; + struct request *rq = cmd->rq; + sector_t sector = blk_rq_pos(rq); + unsigned int rq_sectors = blk_rq_sectors(rq); + unsigned int zno = null_zone_no(dev, sector); + struct blk_zone *zone = &dev->zones[zno]; + + switch (zone->cond) { + case BLK_ZONE_COND_FULL: + /* Cannot write to a full zone */ + cmd->error = BLK_STS_IOERR; + break; + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_IMP_OPEN: + /* Writes must be at the write pointer position */ + if (blk_rq_pos(rq) != zone->wp) { + cmd->error = BLK_STS_IOERR; + break; + } + + if (zone->cond == BLK_ZONE_COND_EMPTY) + zone->cond = BLK_ZONE_COND_IMP_OPEN; + + zone->wp += rq_sectors; + if (zone->wp == zone->start + zone->len) + zone->cond = BLK_ZONE_COND_FULL; + break; + default: + /* Invalid zone condition */ + cmd->error = BLK_STS_IOERR; + break; + } +} + +void null_zone_reset(struct nullb_cmd *cmd) +{ + struct nullb_device *dev = cmd->nq->dev; + struct request *rq = cmd->rq; + unsigned int zno = null_zone_no(dev, blk_rq_pos(rq)); + struct blk_zone *zone = &dev->zones[zno]; + + zone->cond = BLK_ZONE_COND_EMPTY; + zone->wp = zone->start; +} -- 2.11.0 ^ permalink raw reply related [flat|nested] 16+ messages in thread
* Re: [PATCH 0/2] null_blk: zone support 2018-07-06 17:38 [PATCH 0/2] null_blk: zone support Matias Bjørling 2018-07-06 17:38 ` [PATCH 1/2] null_blk: move shared definitions to header file Matias Bjørling 2018-07-06 17:38 ` [PATCH 2/2] null_blk: add zone support Matias Bjørling @ 2018-07-06 17:45 ` Laurence Oberman 2018-07-09 7:54 ` Matias Bjørling 2018-07-07 2:54 ` [PATCH 0/2] null_blk: zone support Jens Axboe 3 siblings, 1 reply; 16+ messages in thread From: Laurence Oberman @ 2018-07-06 17:45 UTC (permalink / raw) To: Matias Bjørling, axboe Cc: linux-block, linux-kernel, bart.vanassche, damien.lemoal On Fri, 2018-07-06 at 19:38 +0200, Matias Bjørling wrote: > This series adds support for exposing a zone block device using the > null_blk device driver. > > The first patch moves core null_blk data structures to a shared > header > file. The second implements the actual zone support. The patchset > adds > two new options. One to enable the zone interface, and another to > define the size of the zones to expose. > > Thanks, > Matias > > Matias Bjørling (2): > null_blk: move shared definitions to header file > null_blk: add zone support > > Documentation/block/null_blk.txt | 7 ++ > drivers/block/Makefile | 5 +- > drivers/block/null_blk.c | 124 ++++++++++++----------------- > --- > drivers/block/null_blk.h | 108 ++++++++++++++++++++++++++++ > drivers/block/null_blk_zoned.c | 149 > +++++++++++++++++++++++++++++++++++++++ > 5 files changed, 315 insertions(+), 78 deletions(-) > create mode 100644 drivers/block/null_blk.h > create mode 100644 drivers/block/null_blk_zoned.c > Thank you for this will be very useful. I will test it to add value. I dont know the code well enough to review it Regards Laurence ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/2] null_blk: zone support 2018-07-06 17:45 ` [PATCH 0/2] null_blk: " Laurence Oberman @ 2018-07-09 7:54 ` Matias Bjørling 2018-07-09 16:34 ` Jens Axboe 0 siblings, 1 reply; 16+ messages in thread From: Matias Bjørling @ 2018-07-09 7:54 UTC (permalink / raw) To: loberman, axboe; +Cc: linux-block, linux-kernel, Bart.VanAssche, Damien.LeMoal On 07/06/2018 07:45 PM, Laurence Oberman wrote: > On Fri, 2018-07-06 at 19:38 +0200, Matias Bjørling wrote: >> This series adds support for exposing a zone block device using the >> null_blk device driver. >> >> The first patch moves core null_blk data structures to a shared >> header >> file. The second implements the actual zone support. The patchset >> adds >> two new options. One to enable the zone interface, and another to >> define the size of the zones to expose. >> >> Thanks, >> Matias >> >> Matias Bjørling (2): >> null_blk: move shared definitions to header file >> null_blk: add zone support >> >> Documentation/block/null_blk.txt | 7 ++ >> drivers/block/Makefile | 5 +- >> drivers/block/null_blk.c | 124 ++++++++++++----------------- >> --- >> drivers/block/null_blk.h | 108 ++++++++++++++++++++++++++++ >> drivers/block/null_blk_zoned.c | 149 >> +++++++++++++++++++++++++++++++++++++++ >> 5 files changed, 315 insertions(+), 78 deletions(-) >> create mode 100644 drivers/block/null_blk.h >> create mode 100644 drivers/block/null_blk_zoned.c >> > > Thank you for this will be very useful. > I will test it to add value. > I dont know the code well enough to review it > Regards > Laurence > Great. For fio, you can use the zone support here: https://github.com/bvanassche/fio It is in the process of being upstreamed. Also, you'll need the latest libzbc/util-linux (blkzone) if you want to report zones. A bug was fixed around detection of the drive. https://github.com/hgst/libzbc https://github.com/karelzak/util-linux ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/2] null_blk: zone support 2018-07-09 7:54 ` Matias Bjørling @ 2018-07-09 16:34 ` Jens Axboe 2018-07-10 0:05 ` Bart Van Assche 0 siblings, 1 reply; 16+ messages in thread From: Jens Axboe @ 2018-07-09 16:34 UTC (permalink / raw) To: Matias Bjørling, loberman Cc: linux-block, linux-kernel, Bart.VanAssche, Damien.LeMoal On 7/9/18 1:54 AM, Matias Bjørling wrote: > For fio, you can use the zone support here: > > https://github.com/bvanassche/fio > > It is in the process of being upstreamed. In the spirit of making some progress on this, I just don't like how it's done. For example, it should not be necessary to adjust what comes out of the block generator, instead the block generator should be told to do what we need on zbc. This is a key concept. The workload should be defined as such that it works for zoned devices. The trim as zone resets seems odd. What happens if you end up with a zoned flash device in the future? The support code looks fine. -- Jens Axboe ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/2] null_blk: zone support 2018-07-09 16:34 ` Jens Axboe @ 2018-07-10 0:05 ` Bart Van Assche 2018-07-10 14:46 ` Jens Axboe 0 siblings, 1 reply; 16+ messages in thread From: Bart Van Assche @ 2018-07-10 0:05 UTC (permalink / raw) To: mb@lightnvm.io, axboe@kernel.dk, loberman@redhat.com Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, Damien Le Moal T24gTW9uLCAyMDE4LTA3LTA5IGF0IDEwOjM0IC0wNjAwLCBKZW5zIEF4Ym9lIHdyb3RlOg0KPiBP biA3LzkvMTggMTo1NCBBTSwgTWF0aWFzIEJqw7hybGluZyB3cm90ZToNCj4gPiBGb3IgZmlvLCB5 b3UgY2FuIHVzZSB0aGUgem9uZSBzdXBwb3J0IGhlcmU6DQo+ID4gDQo+ID4gICAgaHR0cHM6Ly9n aXRodWIuY29tL2J2YW5hc3NjaGUvZmlvDQo+ID4gDQo+ID4gSXQgaXMgaW4gdGhlIHByb2Nlc3Mg b2YgYmVpbmcgdXBzdHJlYW1lZC4NCj4gDQo+IEluIHRoZSBzcGlyaXQgb2YgbWFraW5nIHNvbWUg cHJvZ3Jlc3Mgb24gdGhpcywgSSBqdXN0IGRvbid0IGxpa2UgaG93DQo+IGl0J3MgZG9uZS4gRm9y IGV4YW1wbGUsIGl0IHNob3VsZCBub3QgYmUgbmVjZXNzYXJ5IHRvIGFkanVzdCB3aGF0DQo+IGNv bWVzIG91dCBvZiB0aGUgYmxvY2sgZ2VuZXJhdG9yLCBpbnN0ZWFkIHRoZSBibG9jayBnZW5lcmF0 b3Igc2hvdWxkDQo+IGJlIHRvbGQgdG8gZG8gd2hhdCB3ZSBuZWVkIG9uIHpiYy4gVGhpcyBpcyBh IGtleSBjb25jZXB0LiBUaGUgd29ya2xvYWQNCj4gc2hvdWxkIGJlIGRlZmluZWQgYXMgc3VjaCB0 aGF0IGl0IHdvcmtzIGZvciB6b25lZCBkZXZpY2VzLg0KDQpIZWxsbyBKZW5zLA0KDQpIb3cgd291 bGQgeW91IGxpa2UgdG8gc2VlIGJsb2NrIGdlbmVyYXRpb24gd29yaz8gSSBkb24ndCBzZWUgYW4g YWx0ZXJuYXRpdmUNCmZvciByYW5kb20gSS9PIG90aGVyIHN0YXJ0aW5nIGZyb20gdGhlIG91dHB1 dCBvZiBhIHJhbmRvbSBnZW5lcmF0b3IgYW5kDQp0cmFuc2xhdGluZyB0aGF0IG91dHB1dCBpbnRv IHNvbWV0aGluZyB0aGF0IGlzIGFwcHJvcHJpYXRlIGZvciBhIHpvbmVkIGJsb2NrDQpkZXZpY2Uu IFJhbmRvbSByZWFkcyBtdXN0IGhhcHBlbiBiZWxvdyB0aGUgem9uZSBwb2ludGVyIGlmIGZpbyBp cyBjb25maWd1cmVkDQp0byByZWFkIGJlbG93IHRoZSB6b25lIHBvaW50ZXIuIFJhbmRvbSB3cml0 ZXMgbXVzdCBoYXBwZW4gYXQgdGhlIHdyaXRlDQpwb2ludGVyLiBUaGUgb25seSB3YXkgSSBzZWUg dG8gaW1wbGVtZW50IHN1Y2ggYW4gSS9PIHBhdHRlcm4gaXMgdG8gc3RhcnQNCmZyb20gdGhlIG91 dHB1dCBvZiBhIHJhbmRvbSBnZW5lcmF0b3IgYW5kIHRvIGFkanVzdCB0aGUgb3V0cHV0IG9mIHRo YXQNCnJhbmRvbSBnZW5lcmF0b3IuIEhvd2V2ZXIsIEkgZG9uJ3QgaGF2ZSBhIHN0cm9uZyBvcGlu aW9uIHdoZXRoZXIgYWRqdXN0aW5nDQp0aGUgb3V0cHV0IG9mIGEgcmFuZG9tIGdlbmVyYXRvciBz aG91bGQgaGFwcGVuIGJ5IHRoZSBjYWxsZXIgb2YNCmdldF9uZXh0X2J1ZmxlbigpIG9yIGluc2lk ZSBnZXRfbmV4dF9idWZsZW4oKS4gT3IgaXMgeW91ciBjb25jZXJuIHBlcmhhcHMNCnRoYXQgdGhl IGN1cnJlbnQgYXBwcm9hY2ggaW50ZXJmZXJlcyB3aXRoIGZpbyBqb2Igb3B0aW9ucyBsaWtlIGJz X3VuYWxpZ25lZD8NCg0KPiBUaGUgdHJpbSBhcyB6b25lIHJlc2V0cyBzZWVtcyBvZGQuIFdoYXQg aGFwcGVucyBpZiB5b3UgZW5kIHVwIHdpdGggYQ0KPiB6b25lZCBmbGFzaCBkZXZpY2UgaW4gdGhl IGZ1dHVyZT8NCg0KV2UgY2FuIGxlYXZlIG91dCB0aGUgY29kZSB0aGF0IHRyYW5zbGF0ZXMgdHJp bSBpbnRvIHpvbmUgcmVzZXRzIGZyb20gZmlvDQphbmQgZGlzY3VzcyBsYXRlciBob3cgdG8gaGFu ZGxlIHRoaXMuIE9uZSBwb3NzaWJpbGl0eSBpcyB0byBtb2RpZnkgdGhlIHNkDQpkcml2ZXIgc3Vj aCB0aGF0IGl0IHRyYW5zbGF0ZXMgUkVRX09QX0RJU0NBUkQgaW50byB6b25lIHJlc2V0cyB3aGVu DQphcHByb3ByaWF0ZS4gVGhlcmUgbWF5IGJlIGJldHRlciBhbHRlcm5hdGl2ZXMuDQoNCkJhcnQu DQoNCg0KDQoNCg== ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/2] null_blk: zone support 2018-07-10 0:05 ` Bart Van Assche @ 2018-07-10 14:46 ` Jens Axboe 2018-07-10 16:47 ` Bart Van Assche 0 siblings, 1 reply; 16+ messages in thread From: Jens Axboe @ 2018-07-10 14:46 UTC (permalink / raw) To: Bart Van Assche, mb@lightnvm.io, loberman@redhat.com Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, Damien Le Moal On 7/9/18 6:05 PM, Bart Van Assche wrote: > On Mon, 2018-07-09 at 10:34 -0600, Jens Axboe wrote: >> On 7/9/18 1:54 AM, Matias Bjørling wrote: >>> For fio, you can use the zone support here: >>> >>> https://github.com/bvanassche/fio >>> >>> It is in the process of being upstreamed. >> >> In the spirit of making some progress on this, I just don't like how >> it's done. For example, it should not be necessary to adjust what >> comes out of the block generator, instead the block generator should >> be told to do what we need on zbc. This is a key concept. The workload >> should be defined as such that it works for zoned devices. > > Hello Jens, > > How would you like to see block generation work? I don't see an > alternative for random I/O other starting from the output of a random > generator and translating that output into something that is > appropriate for a zoned block device. Random reads must happen below > the zone pointer if fio is configured to read below the zone pointer. > Random writes must happen at the write pointer. The only way I see to > implement such an I/O pattern is to start from the output of a random > generator and to adjust the output of that random generator. However, > I don't have a strong opinion whether adjusting the output of a random > generator should happen by the caller of get_next_buflen() or inside > get_next_buflen(). Or is your concern perhaps that the current > approach interferes with fio job options like bs_unaligned? The main issue I have with that approach is that the core of fio is generating the IO patterns, and then you are just changing them as you see fit. This means that the workload definition and the resulting IO operations are no longer matched up, since they now also depend on what you are running on. If I take one workload and run it on a zoned drive, and then run it on a non-zoned drive, I can't compare the results at all. This is a showstopper. There should be no adjusting of the output, rather it should be possible to write zoned friendly job definitions. It should be possible to run the same job on a non-zoned drive, and vice versa, and the resulting IO patterns must be the same. Fio already has some notion of zones. Maybe that could be extended to hard zones, and some control of open zones, and patterns within those zones? -- Jens Axboe ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/2] null_blk: zone support 2018-07-10 14:46 ` Jens Axboe @ 2018-07-10 16:47 ` Bart Van Assche 2018-07-10 18:45 ` Jens Axboe 0 siblings, 1 reply; 16+ messages in thread From: Bart Van Assche @ 2018-07-10 16:47 UTC (permalink / raw) To: mb@lightnvm.io, axboe@kernel.dk, loberman@redhat.com Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, Damien Le Moal T24gVHVlLCAyMDE4LTA3LTEwIGF0IDA4OjQ2IC0wNjAwLCBKZW5zIEF4Ym9lIHdyb3RlOg0KPiBP biA3LzkvMTggNjowNSBQTSwgQmFydCBWYW4gQXNzY2hlIHdyb3RlOg0KPiA+IE9uIE1vbiwgMjAx OC0wNy0wOSBhdCAxMDozNCAtMDYwMCwgSmVucyBBeGJvZSB3cm90ZToNCj4gPiA+IEluIHRoZSBz cGlyaXQgb2YgbWFraW5nIHNvbWUgcHJvZ3Jlc3Mgb24gdGhpcywgSSBqdXN0IGRvbid0IGxpa2Ug aG93DQo+ID4gPiBpdCdzIGRvbmUuIEZvciBleGFtcGxlLCBpdCBzaG91bGQgbm90IGJlIG5lY2Vz c2FyeSB0byBhZGp1c3Qgd2hhdA0KPiA+ID4gY29tZXMgb3V0IG9mIHRoZSBibG9jayBnZW5lcmF0 b3IsIGluc3RlYWQgdGhlIGJsb2NrIGdlbmVyYXRvciBzaG91bGQNCj4gPiA+IGJlIHRvbGQgdG8g ZG8gd2hhdCB3ZSBuZWVkIG9uIHpiYy4gVGhpcyBpcyBhIGtleSBjb25jZXB0LiBUaGUgd29ya2xv YWQNCj4gPiA+IHNob3VsZCBiZSBkZWZpbmVkIGFzIHN1Y2ggdGhhdCBpdCB3b3JrcyBmb3Igem9u ZWQgZGV2aWNlcy4NCj4gPiANCj4gPiBIb3cgd291bGQgeW91IGxpa2UgdG8gc2VlIGJsb2NrIGdl bmVyYXRpb24gd29yaz8gSSBkb24ndCBzZWUgYW4NCj4gPiBhbHRlcm5hdGl2ZSBmb3IgcmFuZG9t IEkvTyBvdGhlciBzdGFydGluZyBmcm9tIHRoZSBvdXRwdXQgb2YgYSByYW5kb20NCj4gPiBnZW5l cmF0b3IgYW5kIHRyYW5zbGF0aW5nIHRoYXQgb3V0cHV0IGludG8gc29tZXRoaW5nIHRoYXQgaXMN Cj4gPiBhcHByb3ByaWF0ZSBmb3IgYSB6b25lZCBibG9jayBkZXZpY2UuIFJhbmRvbSByZWFkcyBt dXN0IGhhcHBlbiBiZWxvdw0KPiA+IHRoZSB6b25lIHBvaW50ZXIgaWYgZmlvIGlzIGNvbmZpZ3Vy ZWQgdG8gcmVhZCBiZWxvdyB0aGUgem9uZSBwb2ludGVyLg0KPiA+IFJhbmRvbSB3cml0ZXMgbXVz dCBoYXBwZW4gYXQgdGhlIHdyaXRlIHBvaW50ZXIuIFRoZSBvbmx5IHdheSBJIHNlZSB0bw0KPiA+ IGltcGxlbWVudCBzdWNoIGFuIEkvTyBwYXR0ZXJuIGlzIHRvIHN0YXJ0IGZyb20gdGhlIG91dHB1 dCBvZiBhIHJhbmRvbQ0KPiA+IGdlbmVyYXRvciBhbmQgdG8gYWRqdXN0IHRoZSBvdXRwdXQgb2Yg dGhhdCByYW5kb20gZ2VuZXJhdG9yLiBIb3dldmVyLA0KPiA+IEkgZG9uJ3QgaGF2ZSBhIHN0cm9u ZyBvcGluaW9uIHdoZXRoZXIgYWRqdXN0aW5nIHRoZSBvdXRwdXQgb2YgYSByYW5kb20NCj4gPiBn ZW5lcmF0b3Igc2hvdWxkIGhhcHBlbiBieSB0aGUgY2FsbGVyIG9mIGdldF9uZXh0X2J1Zmxlbigp IG9yIGluc2lkZQ0KPiA+IGdldF9uZXh0X2J1ZmxlbigpLiBPciBpcyB5b3VyIGNvbmNlcm4gcGVy aGFwcyB0aGF0IHRoZSBjdXJyZW50DQo+ID4gYXBwcm9hY2ggaW50ZXJmZXJlcyB3aXRoIGZpbyBq b2Igb3B0aW9ucyBsaWtlIGJzX3VuYWxpZ25lZD8NCj4gDQo+IFRoZSBtYWluIGlzc3VlIEkgaGF2 ZSB3aXRoIHRoYXQgYXBwcm9hY2ggaXMgdGhhdCB0aGUgY29yZSBvZiBmaW8gaXMNCj4gZ2VuZXJh dGluZyB0aGUgSU8gcGF0dGVybnMsIGFuZCB0aGVuIHlvdSBhcmUganVzdCBjaGFuZ2luZyB0aGVt IGFzIHlvdQ0KPiBzZWUgZml0LiBUaGlzIG1lYW5zIHRoYXQgdGhlIHdvcmtsb2FkIGRlZmluaXRp b24gYW5kIHRoZSByZXN1bHRpbmcgSU8NCj4gb3BlcmF0aW9ucyBhcmUgbm8gbG9uZ2VyIG1hdGNo ZWQgdXAsIHNpbmNlIHRoZXkgbm93IGFsc28gZGVwZW5kIG9uIHdoYXQNCj4geW91IGFyZSBydW5u aW5nIG9uLiBJZiBJIHRha2Ugb25lIHdvcmtsb2FkIGFuZCBydW4gaXQgb24gYSB6b25lZCBkcml2 ZSwNCj4gYW5kIHRoZW4gcnVuIGl0IG9uIGEgbm9uLXpvbmVkIGRyaXZlLCBJIGNhbid0IGNvbXBh cmUgdGhlIHJlc3VsdHMgYXQNCj4gYWxsLiBUaGlzIGlzIGEgc2hvd3N0b3BwZXIuDQo+IA0KPiBU aGVyZSBzaG91bGQgYmUgbm8gYWRqdXN0aW5nIG9mIHRoZSBvdXRwdXQsIHJhdGhlciBpdCBzaG91 bGQgYmUgcG9zc2libGUNCj4gdG8gd3JpdGUgem9uZWQgZnJpZW5kbHkgam9iIGRlZmluaXRpb25z LiBJdCBzaG91bGQgYmUgcG9zc2libGUgdG8gcnVuDQo+IHRoZSBzYW1lIGpvYiBvbiBhIG5vbi16 b25lZCBkcml2ZSwgYW5kIHZpY2UgdmVyc2EsIGFuZCB0aGUgcmVzdWx0aW5nIElPDQo+IHBhdHRl cm5zIG11c3QgYmUgdGhlIHNhbWUuDQo+IA0KPiBGaW8gYWxyZWFkeSBoYXMgc29tZSBub3Rpb24g b2Ygem9uZXMuIE1heWJlIHRoYXQgY291bGQgYmUgZXh0ZW5kZWQgdG8NCj4gaGFyZCB6b25lcywg YW5kIHNvbWUgY29udHJvbCBvZiBvcGVuIHpvbmVzLCBhbmQgcGF0dGVybnMgd2l0aGluIHRob3Nl DQo+IHpvbmVzPw0KDQpIZWxsbyBKZW5zLA0KDQpIb3cgYWJvdXQgYWRkaW5nIGEgam9iIG9wdGlv biB0aGF0IG1ha2VzIGl0IHBvc3NpYmxlIHRvIHVzZSB0aGUgem9uZWQNCmJsb2NrIGRldmljZSAo WkJEKSBJL08gcGF0dGVybiBvbiBub24tWkJEIGRldmljZXMsIHJlcXVpcmluZyB0aGF0IHRoZQ0K em9uZSBzaXplIGlzIHNldCBleHBsaWNpdGx5IGZvciBub24tWkJEIGRldmljZXMgYW5kIG1haW50 YWluaW5nIGEgd3JpdGUNCnBvaW50ZXIgbm90IG9ubHkgd2hlbiBwZXJmb3JtaW5nIEkvTyB0byBh IFpCRCBkZXZpY2UgYnV0IGFsc28gaWYgYQ0KWkJELXN0eWxlIEkvTyBwYXR0ZXJuIGlzIGFwcGxp ZWQgdG8gYSBub24tWkJEIGRpc2s/IFRoaXMgc2hvdWxkIGFsbG93IHRvDQphcHBseSBleGFjdGx5 IHRoZSBzYW1lIHdvcmtsb2FkIHRvIGEgbm9uLVpCRCBkaXNrIGFzIHRvIGEgWkJEIGRpc2suDQoN CldoYXQgSSBkZXJpdmVkIGZyb20gdGhlIGZpbyBzb3VyY2UgY29kZSBpcyBhcyBmb2xsb3dzIChw bGVhc2UgY29ycmVjdCBtZQ0KaWYgSSBnb3QgYW55dGhpbmcgd3JvbmcpOg0KKiBUaGUgcHVycG9z ZSBvZiB0aGUgem9uZXNpemUsIHpvbmVyYW5nZSBhbmQgem9uZXNraXAgam9iIG9wdGlvbnMgaXMg dG8NCiAgbGltaXQgdGhlIEkvTyByYW5nZSB0byBhIHNpbmdsZSB6b25lIHdpdGggc2l6ZSAiem9u ZXNpemUiLiBUaGUgSS9PDQogIHBhdHRlcm4gZm9yIHpvbmVkIGJsb2NrIGRldmljZXMgaXMgZGlm ZmVyZW50OiBJL08gaGFwcGVucyBpbiBtdWx0aXBsZQ0KICB6b25lcyBzaW11bHRhbmVvdXNseS4g VGhlIG51bWJlciBvZiB6b25lcyB0byB3aGljaCBJL08gaGFwcGVucyBpcw0KICBjYWxsZWQgdGhl IG51bWJlciBvZiBvcGVuIHpvbmVzLg0KKiBUaGUgcHVycG9zZSBvZiB0aGUgcmFuZG9tX2Rpc3Ry aWJ1dGlvbj16b25lZHtfYWJzfSBqb2Igb3B0aW9uIGlzIHRvDQogIGFsbG93IHRoZSB1c2VyIHRv IHNrZXcgYSB1bmlmb3JtIHJhbmRvbSBkaXN0cmlidXRpb24uIFRoaXMgaXMgYW5vdGhlcg0KICB3 b3JrbG9hZCBwYXR0ZXJuIHRoYW4gdGhlIHR5cGljYWwgcGF0dGVybiBmb3IgWkJEIGRyaXZlcy4N Cg0KVGhhbmtzLA0KDQpCYXJ0Lg0KDQoNCg0KDQoNCg== ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/2] null_blk: zone support 2018-07-10 16:47 ` Bart Van Assche @ 2018-07-10 18:45 ` Jens Axboe 2018-07-10 18:49 ` Bart Van Assche 0 siblings, 1 reply; 16+ messages in thread From: Jens Axboe @ 2018-07-10 18:45 UTC (permalink / raw) To: Bart Van Assche, mb@lightnvm.io, loberman@redhat.com Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, Damien Le Moal On 7/10/18 10:47 AM, Bart Van Assche wrote: > On Tue, 2018-07-10 at 08:46 -0600, Jens Axboe wrote: >> On 7/9/18 6:05 PM, Bart Van Assche wrote: >>> On Mon, 2018-07-09 at 10:34 -0600, Jens Axboe wrote: >>>> In the spirit of making some progress on this, I just don't like how >>>> it's done. For example, it should not be necessary to adjust what >>>> comes out of the block generator, instead the block generator should >>>> be told to do what we need on zbc. This is a key concept. The workload >>>> should be defined as such that it works for zoned devices. >>> >>> How would you like to see block generation work? I don't see an >>> alternative for random I/O other starting from the output of a random >>> generator and translating that output into something that is >>> appropriate for a zoned block device. Random reads must happen below >>> the zone pointer if fio is configured to read below the zone pointer. >>> Random writes must happen at the write pointer. The only way I see to >>> implement such an I/O pattern is to start from the output of a random >>> generator and to adjust the output of that random generator. However, >>> I don't have a strong opinion whether adjusting the output of a random >>> generator should happen by the caller of get_next_buflen() or inside >>> get_next_buflen(). Or is your concern perhaps that the current >>> approach interferes with fio job options like bs_unaligned? >> >> The main issue I have with that approach is that the core of fio is >> generating the IO patterns, and then you are just changing them as you >> see fit. This means that the workload definition and the resulting IO >> operations are no longer matched up, since they now also depend on what >> you are running on. If I take one workload and run it on a zoned drive, >> and then run it on a non-zoned drive, I can't compare the results at >> all. This is a showstopper. >> >> There should be no adjusting of the output, rather it should be possible >> to write zoned friendly job definitions. It should be possible to run >> the same job on a non-zoned drive, and vice versa, and the resulting IO >> patterns must be the same. >> >> Fio already has some notion of zones. Maybe that could be extended to >> hard zones, and some control of open zones, and patterns within those >> zones? > > Hello Jens, > > How about adding a job option that makes it possible to use the zoned > block device (ZBD) I/O pattern on non-ZBD devices, requiring that the > zone size is set explicitly for non-ZBD devices and maintaining a write > pointer not only when performing I/O to a ZBD device but also if a > ZBD-style I/O pattern is applied to a non-ZBD disk? This should allow to > apply exactly the same workload to a non-ZBD disk as to a ZBD disk. It just doesn't make any sense to me. The source of truth is the generator of the IO, which does exactly what it is told by the job definition. You're proposing to mangle that somehow, to fit some restrictions that the underlying device has. That very concept is foreign, and adding an option to be able to do the same on some other device is misleading. The difference between the job file and the workload run can be huge. Consider something really basic: [randwrites] bs=4k rw=randwrite which would be 100% random 4k writes. If I run this on a zoned device, then that'd turn into 100% sequential writes. That makes no sense at all. And if I run it on a different devices, I'd get 100% random writes. Except if I set some magic option. Sorry, but that concept is just too ugly to live, it makes zero sense. Put down your zoned hat for a bit and think about it. > What I derived from the fio source code is as follows (please correct me > if I got anything wrong): > * The purpose of the zonesize, zonerange and zoneskip job options is to > limit the I/O range to a single zone with size "zonesize". The I/O > pattern for zoned block devices is different: I/O happens in multiple > zones simultaneously. The number of zones to which I/O happens is > called the number of open zones. The only difference is that fio currently only has one zone active. When it finishes one, it goes to the next. See my above suggestion on adding the notion of open zones, which would extend this to more than 1. > * The purpose of the random_distribution=zoned{_abs} job option is to > allow the user to skew a uniform random distribution. This is another > workload pattern than the typical pattern for ZBD drives. Fio's zones were never intended to be for zoned devices. Don't get hung up on current use cases, think about what kind of definitions would make sense for zoned devices. -- Jens Axboe ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/2] null_blk: zone support 2018-07-10 18:45 ` Jens Axboe @ 2018-07-10 18:49 ` Bart Van Assche 2018-07-10 18:51 ` Jens Axboe 0 siblings, 1 reply; 16+ messages in thread From: Bart Van Assche @ 2018-07-10 18:49 UTC (permalink / raw) To: mb@lightnvm.io, axboe@kernel.dk, loberman@redhat.com Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, Damien Le Moal T24gVHVlLCAyMDE4LTA3LTEwIGF0IDEyOjQ1IC0wNjAwLCBKZW5zIEF4Ym9lIHdyb3RlOg0KPiBU aGUgZGlmZmVyZW5jZSBiZXR3ZWVuIHRoZSBqb2IgZmlsZSBhbmQgdGhlDQo+IHdvcmtsb2FkIHJ1 biBjYW4gYmUgaHVnZS4gQ29uc2lkZXIgc29tZXRoaW5nIHJlYWxseSBiYXNpYzoNCj4gDQo+IFty YW5kd3JpdGVzXQ0KPiBicz00aw0KPiBydz1yYW5kd3JpdGUNCj4gDQo+IHdoaWNoIHdvdWxkIGJl IDEwMCUgcmFuZG9tIDRrIHdyaXRlcy4gSWYgSSBydW4gdGhpcyBvbiBhIHpvbmVkIGRldmljZSwN Cj4gdGhlbiB0aGF0J2QgdHVybiBpbnRvIDEwMCUgc2VxdWVudGlhbCB3cml0ZXMuDQoNClRoYXQn cyBub3QgY29ycmVjdC4gVGhlIFpCRCBjb2RlIGluIHRoZSBnaXRodWIgcHVsbCByZXF1ZXN0IHNl cmlhbGl6ZXMgd3JpdGVzDQpwZXIgem9uZSwgbm90IGdsb2JhbGx5Lg0KDQo+ID4gV2hhdCBJIGRl cml2ZWQgZnJvbSB0aGUgZmlvIHNvdXJjZSBjb2RlIGlzIGFzIGZvbGxvd3MgKHBsZWFzZSBjb3Jy ZWN0IG1lDQo+ID4gaWYgSSBnb3QgYW55dGhpbmcgd3JvbmcpOg0KPiA+ICogVGhlIHB1cnBvc2Ug b2YgdGhlIHpvbmVzaXplLCB6b25lcmFuZ2UgYW5kIHpvbmVza2lwIGpvYiBvcHRpb25zIGlzIHRv DQo+ID4gICBsaW1pdCB0aGUgSS9PIHJhbmdlIHRvIGEgc2luZ2xlIHpvbmUgd2l0aCBzaXplICJ6 b25lc2l6ZSIuIFRoZSBJL08NCj4gPiAgIHBhdHRlcm4gZm9yIHpvbmVkIGJsb2NrIGRldmljZXMg aXMgZGlmZmVyZW50OiBJL08gaGFwcGVucyBpbiBtdWx0aXBsZQ0KPiA+ICAgem9uZXMgc2ltdWx0 YW5lb3VzbHkuIFRoZSBudW1iZXIgb2Ygem9uZXMgdG8gd2hpY2ggSS9PIGhhcHBlbnMgaXMNCj4g PiAgIGNhbGxlZCB0aGUgbnVtYmVyIG9mIG9wZW4gem9uZXMuDQo+IA0KPiBUaGUgb25seSBkaWZm ZXJlbmNlIGlzIHRoYXQgZmlvIGN1cnJlbnRseSBvbmx5IGhhcyBvbmUgem9uZSBhY3RpdmUuIFdo ZW4NCj4gaXQgZmluaXNoZXMgb25lLCBpdCBnb2VzIHRvIHRoZSBuZXh0LiBTZWUgbXkgYWJvdmUg c3VnZ2VzdGlvbiBvbiBhZGRpbmcNCj4gdGhlIG5vdGlvbiBvZiBvcGVuIHpvbmVzLCB3aGljaCB3 b3VsZCBleHRlbmQgdGhpcyB0byBtb3JlIHRoYW4gMS4NCg0KVGhhbmtzLCBJIHdpbGwgbG9vayBp bnRvIHRoaXMuDQoNCkJhcnQuDQoNCg0KDQoNCg0K ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/2] null_blk: zone support 2018-07-10 18:49 ` Bart Van Assche @ 2018-07-10 18:51 ` Jens Axboe 2018-08-09 20:51 ` Zoned block device support for fio (was: [PATCH 0/2] null_blk: zone support) Bart Van Assche 0 siblings, 1 reply; 16+ messages in thread From: Jens Axboe @ 2018-07-10 18:51 UTC (permalink / raw) To: Bart Van Assche, mb@lightnvm.io, loberman@redhat.com Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, Damien Le Moal On 7/10/18 12:49 PM, Bart Van Assche wrote: > On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote: >> The difference between the job file and the >> workload run can be huge. Consider something really basic: >> >> [randwrites] >> bs=4k >> rw=randwrite >> >> which would be 100% random 4k writes. If I run this on a zoned device, >> then that'd turn into 100% sequential writes. > > That's not correct. The ZBD code in the github pull request serializes writes > per zone, not globally. That's a totally minor detail. If all my random writes fall within a single zone, then they'd be 100% sequential. For N open zones, you'd be 100% sequential within the zone. The point is that the workload as defined and the workload as run are two totally different things, and THAT is the problem. -- Jens Axboe ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: Zoned block device support for fio (was: [PATCH 0/2] null_blk: zone support) 2018-07-10 18:51 ` Jens Axboe @ 2018-08-09 20:51 ` Bart Van Assche 2018-08-09 21:03 ` Zoned block device support for fio Jens Axboe 0 siblings, 1 reply; 16+ messages in thread From: Bart Van Assche @ 2018-08-09 20:51 UTC (permalink / raw) To: mb@lightnvm.io, axboe@kernel.dk, loberman@redhat.com Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, Damien Le Moal On Tue, 2018-07-10 at 12:51 -0600, Jens Axboe wrote: > On 7/10/18 12:49 PM, Bart Van Assche wrote: > > On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote: > > > The difference between the job file and the > > > workload run can be huge. Consider something really basic= : > > >=20 > > > [randwrites] > > > bs=4k > > > rw=randwrite > > >=20 > > > which would be 100% random 4k writes. If I run this o= n a zoned device, > > > then that'd turn into 100% sequential writes. > >=20 > > That's not correct. The ZBD code in the github pull request ser= ializes writes > > per zone, not globally. >=20 > That's a totally minor detail. If all my random writes fall within a = single > zone, then they'd be 100% sequential. For N open zones, you'd be = 100% > sequential within the zone. The point is that the workload as defined= and > the workload as run are two totally different things, and THAT is the > problem. Hello Jens, What you proposed in a previous e-mail, namely to use the existing fio zone concept for zoned block devices sounds interesting to me. This is something= I will definitely look further into. This will help to make a given workload that is suited for zoned block devices to behave (almost) identical when ru= n against a regular block device. Since fio users expect that no zones are reset before e.g. a random write test is started, there will always be a sm= all difference in behavior between a workload run against a zoned block device = and a workload run against a regular device if some zones already contain data. It's not clear to me how close you want the behavior of fio to be for zoned and regular block devices. Do you e.g. want me to introduce a new I/O patte= rn (--rw=...) that causes fio to write sequentially inside a zone and for = which zones are selected randomly? I don't see any other approach that allows to make sure that the same workload definition behaves identically against zon= ed and regular block devices. Thanks, Bart. ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: Zoned block device support for fio 2018-08-09 20:51 ` Zoned block device support for fio (was: [PATCH 0/2] null_blk: zone support) Bart Van Assche @ 2018-08-09 21:03 ` Jens Axboe 2018-08-15 18:07 ` Bart Van Assche 0 siblings, 1 reply; 16+ messages in thread From: Jens Axboe @ 2018-08-09 21:03 UTC (permalink / raw) To: Bart Van Assche, mb@lightnvm.io, loberman@redhat.com Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, Damien Le Moal On 8/9/18 2:51 PM, Bart Van Assche wrote: > On Tue, 2018-07-10 at 12:51 -0600, Jens Axboe wrote: >> On 7/10/18 12:49 PM, Bart Van Assche wrote: >>> On Tue, 2018-07-10 at 12:45 -0600, Jens Axboe wrote: >>>> The difference between the job file and the >>>> workload run can be huge. Consider something really basic: >>>> >>>> [randwrites] >>>> bs=4k >>>> rw=randwrite >>>> >>>> which would be 100% random 4k writes. If I run this on a zoned device, >>>> then that'd turn into 100% sequential writes. >>> >>> That's not correct. The ZBD code in the github pull request serializes writes >>> per zone, not globally. >> >> That's a totally minor detail. If all my random writes fall within a single >> zone, then they'd be 100% sequential. For N open zones, you'd be 100% >> sequential within the zone. The point is that the workload as defined and >> the workload as run are two totally different things, and THAT is the >> problem. > > Hello Jens, > > What you proposed in a previous e-mail, namely to use the existing fio zone > concept for zoned block devices sounds interesting to me. This is something I > will definitely look further into. This will help to make a given workload > that is suited for zoned block devices to behave (almost) identical when run > against a regular block device. Since fio users expect that no zones are > reset before e.g. a random write test is started, there will always be a small > difference in behavior between a workload run against a zoned block device and > a workload run against a regular device if some zones already contain data. > > It's not clear to me how close you want the behavior of fio to be for zoned > and regular block devices. Do you e.g. want me to introduce a new I/O pattern > (--rw=...) that causes fio to write sequentially inside a zone and for which > zones are selected randomly? I don't see any other approach that allows to > make sure that the same workload definition behaves identically against zoned > and regular block devices. Yes exactly. Basically come up with something that just maps fio zones on top of SMR zones. Then come up with something that allows you to have sequential writes IO within a zone, and something that allows you to decide how many zones to use, when to skip between zones, etc. -- Jens Axboe ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: Zoned block device support for fio 2018-08-09 21:03 ` Zoned block device support for fio Jens Axboe @ 2018-08-15 18:07 ` Bart Van Assche 0 siblings, 0 replies; 16+ messages in thread From: Bart Van Assche @ 2018-08-15 18:07 UTC (permalink / raw) To: mb@lightnvm.io, axboe@kernel.dk, loberman@redhat.com Cc: linux-kernel@vger.kernel.org, linux-block@vger.kernel.org, Damien Le Moal On Thu, 2018-08-09 at 15:03 -0600, Jens Axboe wrote: > Yes exactly. Basically come up with something that just maps fio zone= s on > top of SMR zones. Then come up with something that allows you to have > sequential writes IO within a zone, and something that allows you to = decide > how many zones to use, when to skip between zones, etc. Hello Jens, SMR support for fio has been reworked. The reworked patches are available a= t https://github.com/axboe/fio/pull/585. The following two test scripts allow= to verify whether the fio behavior is identical for SMR and non-SMR storage me= dia: * t/zbd/run-tests-against-regular-nullb * t/zbd/run-tests-against-zoned-nullb Thanks, Bart. ^ permalink raw reply [flat|nested] 16+ messages in thread
* Re: [PATCH 0/2] null_blk: zone support 2018-07-06 17:38 [PATCH 0/2] null_blk: zone support Matias Bjørling ` (2 preceding siblings ...) 2018-07-06 17:45 ` [PATCH 0/2] null_blk: " Laurence Oberman @ 2018-07-07 2:54 ` Jens Axboe 3 siblings, 0 replies; 16+ messages in thread From: Jens Axboe @ 2018-07-07 2:54 UTC (permalink / raw) To: Matias Bjørling Cc: linux-block, linux-kernel, bart.vanassche, damien.lemoal On 7/6/18 11:38 AM, Matias Bjørling wrote: > This series adds support for exposing a zone block device using the > null_blk device driver. > > The first patch moves core null_blk data structures to a shared header > file. The second implements the actual zone support. The patchset adds > two new options. One to enable the zone interface, and another to > define the size of the zones to expose. Thanks, should be useful for verifying write pointer violations. Applied for 4.19. -- Jens Axboe ^ permalink raw reply [flat|nested] 16+ messages in thread
end of thread, other threads:[~2018-08-15 18:07 UTC | newest] Thread overview: 16+ messages (download: mbox.gz follow: Atom feed -- links below jump to the message on this page -- 2018-07-06 17:38 [PATCH 0/2] null_blk: zone support Matias Bjørling 2018-07-06 17:38 ` [PATCH 1/2] null_blk: move shared definitions to header file Matias Bjørling 2018-07-06 17:38 ` [PATCH 2/2] null_blk: add zone support Matias Bjørling 2018-07-06 17:45 ` [PATCH 0/2] null_blk: " Laurence Oberman 2018-07-09 7:54 ` Matias Bjørling 2018-07-09 16:34 ` Jens Axboe 2018-07-10 0:05 ` Bart Van Assche 2018-07-10 14:46 ` Jens Axboe 2018-07-10 16:47 ` Bart Van Assche 2018-07-10 18:45 ` Jens Axboe 2018-07-10 18:49 ` Bart Van Assche 2018-07-10 18:51 ` Jens Axboe 2018-08-09 20:51 ` Zoned block device support for fio (was: [PATCH 0/2] null_blk: zone support) Bart Van Assche 2018-08-09 21:03 ` Zoned block device support for fio Jens Axboe 2018-08-15 18:07 ` Bart Van Assche 2018-07-07 2:54 ` [PATCH 0/2] null_blk: zone support Jens Axboe
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox