* [PATCH] null_blk: allow byte aligned memory offsets
@ 2025-11-03 17:28 Keith Busch
2025-11-04 1:48 ` Damien Le Moal
` (2 more replies)
0 siblings, 3 replies; 6+ messages in thread
From: Keith Busch @ 2025-11-03 17:28 UTC (permalink / raw)
To: linux-block, hch, axboe, dlemoal, hans.holmberg; +Cc: Keith Busch
From: Keith Busch <kbusch@kernel.org>
Allowing byte aligned memory provides a nice testing ground for
direct-io. This has an added benefit of a single kmap/kunmap per bio
segment rather than multiple times for each multi-page segment.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
drivers/block/null_blk/main.c | 84 +++++++++++++++++-----------------
drivers/block/null_blk/zoned.c | 2 +-
2 files changed, 43 insertions(+), 43 deletions(-)
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 0ee55f889cfdd..2227f6db5d3d5 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1129,40 +1129,42 @@ static int null_make_cache_space(struct nullb *nullb, unsigned long n)
return 0;
}
-static int copy_to_nullb(struct nullb *nullb, struct page *source,
- unsigned int off, sector_t sector, size_t n, bool is_fua)
+static int copy_to_nullb(struct nullb *nullb, void *source, loff_t pos,
+ size_t n, bool is_fua)
{
size_t temp, count = 0;
unsigned int offset;
struct nullb_page *t_page;
+ sector_t sector;
while (count < n) {
+ sector = pos >> SECTOR_SHIFT;
temp = min_t(size_t, nullb->dev->blocksize, n - count);
if (null_cache_active(nullb) && !is_fua)
null_make_cache_space(nullb, PAGE_SIZE);
- offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+ offset = pos & (PAGE_SIZE - 1);
t_page = null_insert_page(nullb, sector,
!null_cache_active(nullb) || is_fua);
if (!t_page)
return -ENOSPC;
- memcpy_page(t_page->page, offset, source, off + count, temp);
+ memcpy_to_page(t_page->page, offset, source, temp);
__set_bit(sector & SECTOR_MASK, t_page->bitmap);
if (is_fua)
null_free_sector(nullb, sector, true);
+ source += temp;
count += temp;
- sector += temp >> SECTOR_SHIFT;
+ pos += temp;
}
return 0;
}
-static int copy_from_nullb(struct nullb *nullb, struct page *dest,
- unsigned int off, sector_t sector, size_t n)
+static int copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos, size_t n)
{
size_t temp, count = 0;
unsigned int offset;
@@ -1171,28 +1173,22 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest,
while (count < n) {
temp = min_t(size_t, nullb->dev->blocksize, n - count);
- offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
- t_page = null_lookup_page(nullb, sector, false,
+ offset = pos & (PAGE_SIZE - 1);
+ t_page = null_lookup_page(nullb, pos >> SECTOR_SHIFT, false,
!null_cache_active(nullb));
if (t_page)
- memcpy_page(dest, off + count, t_page->page, offset,
- temp);
+ memcpy_from_page(dest, t_page->page, offset, temp);
else
- memzero_page(dest, off + count, temp);
+ memset(dest, 0, temp);
+ dest += temp;
count += temp;
- sector += temp >> SECTOR_SHIFT;
+ pos += temp;
}
return 0;
}
-static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
- unsigned int len, unsigned int off)
-{
- memset_page(page, off, 0xff, len);
-}
-
blk_status_t null_handle_discard(struct nullb_device *dev,
sector_t sector, sector_t nr_sectors)
{
@@ -1234,8 +1230,8 @@ static blk_status_t null_handle_flush(struct nullb *nullb)
return errno_to_blk_status(err);
}
-static int null_transfer(struct nullb *nullb, struct page *page,
- unsigned int len, unsigned int off, bool is_write, sector_t sector,
+static int null_transfer(struct nullb *nullb, void *p,
+ unsigned int len, bool is_write, loff_t pos,
bool is_fua)
{
struct nullb_device *dev = nullb->dev;
@@ -1243,23 +1239,26 @@ static int null_transfer(struct nullb *nullb, struct page *page,
int err = 0;
if (!is_write) {
- if (dev->zoned)
+ if (dev->zoned) {
valid_len = null_zone_valid_read_len(nullb,
- sector, len);
+ pos >> SECTOR_SHIFT, len);
+
+ if (valid_len && valid_len != len)
+ valid_len -= (pos & (SECTOR_SIZE - 1));
+ }
if (valid_len) {
- err = copy_from_nullb(nullb, page, off,
- sector, valid_len);
- off += valid_len;
+ err = copy_from_nullb(nullb, p, pos, valid_len);
+ p += valid_len;
len -= valid_len;
}
if (len)
- nullb_fill_pattern(nullb, page, len, off);
- flush_dcache_page(page);
+ memset(p, 0xff, len);
+ flush_dcache_page(virt_to_page(p));
} else {
- flush_dcache_page(page);
- err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
+ flush_dcache_page(virt_to_page(p));
+ err = copy_to_nullb(nullb, p, pos, len, is_fua);
}
return err;
@@ -1276,25 +1275,26 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
struct nullb *nullb = cmd->nq->dev->nullb;
int err = 0;
unsigned int len;
- sector_t sector = blk_rq_pos(rq);
- unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
- unsigned int transferred_bytes = 0;
+ loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
+ unsigned int nr_bytes = nr_sectors << SECTOR_SHIFT;
struct req_iterator iter;
struct bio_vec bvec;
spin_lock_irq(&nullb->lock);
rq_for_each_segment(bvec, rq, iter) {
+ void *p = bvec_kmap_local(&bvec);;
+
len = bvec.bv_len;
- if (transferred_bytes + len > max_bytes)
- len = max_bytes - transferred_bytes;
- err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
- op_is_write(req_op(rq)), sector,
- rq->cmd_flags & REQ_FUA);
+ if (len > nr_bytes)
+ len = nr_bytes;
+ err = null_transfer(nullb, p, nr_bytes, op_is_write(req_op(rq)),
+ pos, rq->cmd_flags & REQ_FUA);
+ kunmap_local(p);
if (err)
break;
- sector += len >> SECTOR_SHIFT;
- transferred_bytes += len;
- if (transferred_bytes >= max_bytes)
+ pos += len;
+ nr_bytes -= len;
+ if (!nr_bytes)
break;
}
spin_unlock_irq(&nullb->lock);
@@ -1949,7 +1949,7 @@ static int null_add_dev(struct nullb_device *dev)
.logical_block_size = dev->blocksize,
.physical_block_size = dev->blocksize,
.max_hw_sectors = dev->max_sectors,
- .dma_alignment = dev->blocksize - 1,
+ .dma_alignment = 1,
};
struct nullb *nullb;
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 4e5728f459899..8e9648f87f7c8 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -242,7 +242,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
{
struct nullb_device *dev = nullb->dev;
struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
- unsigned int nr_sectors = len >> SECTOR_SHIFT;
+ unsigned int nr_sectors = DIV_ROUND_UP(len, SECTOR_SIZE);
/* Read must be below the write pointer position */
if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
--
2.47.3
^ permalink raw reply related [flat|nested] 6+ messages in thread
* Re: [PATCH] null_blk: allow byte aligned memory offsets
2025-11-03 17:28 [PATCH] null_blk: allow byte aligned memory offsets Keith Busch
@ 2025-11-04 1:48 ` Damien Le Moal
2025-11-05 18:57 ` Keith Busch
2025-11-04 9:15 ` Hans Holmberg
2025-11-04 11:24 ` Christoph Hellwig
2 siblings, 1 reply; 6+ messages in thread
From: Damien Le Moal @ 2025-11-04 1:48 UTC (permalink / raw)
To: Keith Busch, linux-block, hch, axboe, hans.holmberg; +Cc: Keith Busch
On 11/4/25 02:28, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Allowing byte aligned memory provides a nice testing ground for
> direct-io. This has an added benefit of a single kmap/kunmap per bio
> segment rather than multiple times for each multi-page segment.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
Overall looks good to me. A few nits below.
> +static int copy_to_nullb(struct nullb *nullb, void *source, loff_t pos,
> + size_t n, bool is_fua)
> {
> size_t temp, count = 0;
> unsigned int offset;
> struct nullb_page *t_page;
> + sector_t sector;
>
> while (count < n) {
> + sector = pos >> SECTOR_SHIFT;
> temp = min_t(size_t, nullb->dev->blocksize, n - count);
>
> if (null_cache_active(nullb) && !is_fua)
> null_make_cache_space(nullb, PAGE_SIZE);
>
> - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
> + offset = pos & (PAGE_SIZE - 1);
Offset is only used in the memcpy_to_page() call below, so maybe move this line
down, or just completely remove that local variable as it has little value ?
> t_page = null_insert_page(nullb, sector,
> !null_cache_active(nullb) || is_fua);
> if (!t_page)
> return -ENOSPC;
[...]
> -static int copy_from_nullb(struct nullb *nullb, struct page *dest,
> - unsigned int off, sector_t sector, size_t n)
> +static int copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos, size_t n)
> {
> size_t temp, count = 0;
> unsigned int offset;
> @@ -1171,28 +1173,22 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest,
> while (count < n) {
> temp = min_t(size_t, nullb->dev->blocksize, n - count);
>
> - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
> - t_page = null_lookup_page(nullb, sector, false,
> + offset = pos & (PAGE_SIZE - 1);
Same comment here.
> + t_page = null_lookup_page(nullb, pos >> SECTOR_SHIFT, false,
> !null_cache_active(nullb));
>
> if (t_page)
> - memcpy_page(dest, off + count, t_page->page, offset,
> - temp);
> + memcpy_from_page(dest, t_page->page, offset, temp);
> else
> - memzero_page(dest, off + count, temp);
> + memset(dest, 0, temp);
>
> + dest += temp;
> count += temp;
> - sector += temp >> SECTOR_SHIFT;
> + pos += temp;
> }
> return 0;
> }
[...]
> -static int null_transfer(struct nullb *nullb, struct page *page,
> - unsigned int len, unsigned int off, bool is_write, sector_t sector,
> +static int null_transfer(struct nullb *nullb, void *p,
> + unsigned int len, bool is_write, loff_t pos,
> bool is_fua)
> {
> struct nullb_device *dev = nullb->dev;
> @@ -1243,23 +1239,26 @@ static int null_transfer(struct nullb *nullb, struct page *page,
> int err = 0;
>
> if (!is_write) {
> - if (dev->zoned)
> + if (dev->zoned) {
> valid_len = null_zone_valid_read_len(nullb,
> - sector, len);
> + pos >> SECTOR_SHIFT, len);
> +
> + if (valid_len && valid_len != len)
> + valid_len -= (pos & (SECTOR_SIZE - 1));
I do not think you need the outer parentheses here.
> + }
>
> if (valid_len) {
> - err = copy_from_nullb(nullb, page, off,
> - sector, valid_len);
> - off += valid_len;
> + err = copy_from_nullb(nullb, p, pos, valid_len);
Not your fault, but if this fails, we will still do the nullb_fill_pattern()
below, which I do not think is correct. Maybe we should have:
if (err)
return err;
here? I am not sure whether we should still call flush_dcache_page() on error,
though.
> + p += valid_len;
> len -= valid_len;
> }
>
> if (len)
> - nullb_fill_pattern(nullb, page, len, off);
> - flush_dcache_page(page);
> + memset(p, 0xff, len);
> + flush_dcache_page(virt_to_page(p));
> } else {
> - flush_dcache_page(page);
> - err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
> + flush_dcache_page(virt_to_page(p));
> + err = copy_to_nullb(nullb, p, pos, len, is_fua);
Nit: this could be "return copy_to_nullb();"
> }
>
> return err;
> @@ -1276,25 +1275,26 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
> struct nullb *nullb = cmd->nq->dev->nullb;
> int err = 0;
> unsigned int len;
> - sector_t sector = blk_rq_pos(rq);
> - unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
> - unsigned int transferred_bytes = 0;
> + loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
> + unsigned int nr_bytes = nr_sectors << SECTOR_SHIFT;
Overflow potential here ?
> struct req_iterator iter;
> struct bio_vec bvec;
>
> spin_lock_irq(&nullb->lock);
> rq_for_each_segment(bvec, rq, iter) {
> + void *p = bvec_kmap_local(&bvec);;
> +
> len = bvec.bv_len;
> - if (transferred_bytes + len > max_bytes)
> - len = max_bytes - transferred_bytes;
> - err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
> - op_is_write(req_op(rq)), sector,
> - rq->cmd_flags & REQ_FUA);
> + if (len > nr_bytes)
> + len = nr_bytes;
> + err = null_transfer(nullb, p, nr_bytes, op_is_write(req_op(rq)),
> + pos, rq->cmd_flags & REQ_FUA);
> + kunmap_local(p);
> if (err)
> break;
> - sector += len >> SECTOR_SHIFT;
> - transferred_bytes += len;
> - if (transferred_bytes >= max_bytes)
> + pos += len;
> + nr_bytes -= len;
> + if (!nr_bytes)
> break;
> }
> spin_unlock_irq(&nullb->lock);
--
Damien Le Moal
Western Digital Research
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] null_blk: allow byte aligned memory offsets
2025-11-03 17:28 [PATCH] null_blk: allow byte aligned memory offsets Keith Busch
2025-11-04 1:48 ` Damien Le Moal
@ 2025-11-04 9:15 ` Hans Holmberg
2025-11-05 18:47 ` Keith Busch
2025-11-04 11:24 ` Christoph Hellwig
2 siblings, 1 reply; 6+ messages in thread
From: Hans Holmberg @ 2025-11-04 9:15 UTC (permalink / raw)
To: Keith Busch, linux-block@vger.kernel.org, hch, axboe@kernel.dk,
dlemoal@kernel.org
Cc: Keith Busch
On 03/11/2025 18:29, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
>
> Allowing byte aligned memory provides a nice testing ground for
> direct-io. This has an added benefit of a single kmap/kunmap per bio
> segment rather than multiple times for each multi-page segment.
>
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
> drivers/block/null_blk/main.c | 84 +++++++++++++++++-----------------
> drivers/block/null_blk/zoned.c | 2 +-
> 2 files changed, 43 insertions(+), 43 deletions(-)
>
> diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
> index 0ee55f889cfdd..2227f6db5d3d5 100644
> --- a/drivers/block/null_blk/main.c
> +++ b/drivers/block/null_blk/main.c
> @@ -1129,40 +1129,42 @@ static int null_make_cache_space(struct nullb *nullb, unsigned long n)
> return 0;
> }
>
> -static int copy_to_nullb(struct nullb *nullb, struct page *source,
> - unsigned int off, sector_t sector, size_t n, bool is_fua)
> +static int copy_to_nullb(struct nullb *nullb, void *source, loff_t pos,
> + size_t n, bool is_fua)
> {
> size_t temp, count = 0;
> unsigned int offset;
> struct nullb_page *t_page;
> + sector_t sector;
>
> while (count < n) {
> + sector = pos >> SECTOR_SHIFT;
> temp = min_t(size_t, nullb->dev->blocksize, n - count);
>
> if (null_cache_active(nullb) && !is_fua)
> null_make_cache_space(nullb, PAGE_SIZE);
>
> - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
> + offset = pos & (PAGE_SIZE - 1);
> t_page = null_insert_page(nullb, sector,
> !null_cache_active(nullb) || is_fua);
> if (!t_page)
> return -ENOSPC;
>
> - memcpy_page(t_page->page, offset, source, off + count, temp);
> + memcpy_to_page(t_page->page, offset, source, temp);
>
> __set_bit(sector & SECTOR_MASK, t_page->bitmap);
>
> if (is_fua)
> null_free_sector(nullb, sector, true);
>
> + source += temp;
> count += temp;
> - sector += temp >> SECTOR_SHIFT;
> + pos += temp;
> }
> return 0;
> }
>
> -static int copy_from_nullb(struct nullb *nullb, struct page *dest,
> - unsigned int off, sector_t sector, size_t n)
> +static int copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos, size_t n)
> {
> size_t temp, count = 0;
> unsigned int offset;
> @@ -1171,28 +1173,22 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest,
> while (count < n) {
> temp = min_t(size_t, nullb->dev->blocksize, n - count);
>
> - offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
> - t_page = null_lookup_page(nullb, sector, false,
> + offset = pos & (PAGE_SIZE - 1);
> + t_page = null_lookup_page(nullb, pos >> SECTOR_SHIFT, false,
> !null_cache_active(nullb));
>
> if (t_page)
> - memcpy_page(dest, off + count, t_page->page, offset,
> - temp);
> + memcpy_from_page(dest, t_page->page, offset, temp);
> else
> - memzero_page(dest, off + count, temp);
> + memset(dest, 0, temp);
>
> + dest += temp;
> count += temp;
> - sector += temp >> SECTOR_SHIFT;
> + pos += temp;
> }
> return 0;
> }
>
> -static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
> - unsigned int len, unsigned int off)
> -{
> - memset_page(page, off, 0xff, len);
> -}
> -
> blk_status_t null_handle_discard(struct nullb_device *dev,
> sector_t sector, sector_t nr_sectors)
> {
> @@ -1234,8 +1230,8 @@ static blk_status_t null_handle_flush(struct nullb *nullb)
> return errno_to_blk_status(err);
> }
>
> -static int null_transfer(struct nullb *nullb, struct page *page,
> - unsigned int len, unsigned int off, bool is_write, sector_t sector,
> +static int null_transfer(struct nullb *nullb, void *p,
> + unsigned int len, bool is_write, loff_t pos,
> bool is_fua)
> {
> struct nullb_device *dev = nullb->dev;
> @@ -1243,23 +1239,26 @@ static int null_transfer(struct nullb *nullb, struct page *page,
> int err = 0;
>
> if (!is_write) {
> - if (dev->zoned)
> + if (dev->zoned) {
> valid_len = null_zone_valid_read_len(nullb,
> - sector, len);
> + pos >> SECTOR_SHIFT, len);
> +
> + if (valid_len && valid_len != len)
> + valid_len -= (pos & (SECTOR_SIZE - 1));
> + }
>
> if (valid_len) {
> - err = copy_from_nullb(nullb, page, off,
> - sector, valid_len);
> - off += valid_len;
> + err = copy_from_nullb(nullb, p, pos, valid_len);
> + p += valid_len;
> len -= valid_len;
> }
>
> if (len)
> - nullb_fill_pattern(nullb, page, len, off);
> - flush_dcache_page(page);
> + memset(p, 0xff, len);
> + flush_dcache_page(virt_to_page(p));
> } else {
> - flush_dcache_page(page);
> - err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
> + flush_dcache_page(virt_to_page(p));
> + err = copy_to_nullb(nullb, p, pos, len, is_fua);
> }
>
> return err;
> @@ -1276,25 +1275,26 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
> struct nullb *nullb = cmd->nq->dev->nullb;
> int err = 0;
> unsigned int len;
> - sector_t sector = blk_rq_pos(rq);
> - unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
> - unsigned int transferred_bytes = 0;
> + loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
> + unsigned int nr_bytes = nr_sectors << SECTOR_SHIFT;
> struct req_iterator iter;
> struct bio_vec bvec;
>
> spin_lock_irq(&nullb->lock);
> rq_for_each_segment(bvec, rq, iter) {
> + void *p = bvec_kmap_local(&bvec);;
> +
> len = bvec.bv_len;
> - if (transferred_bytes + len > max_bytes)
> - len = max_bytes - transferred_bytes;
> - err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
> - op_is_write(req_op(rq)), sector,
> - rq->cmd_flags & REQ_FUA);
> + if (len > nr_bytes)
> + len = nr_bytes;
> + err = null_transfer(nullb, p, nr_bytes, op_is_write(req_op(rq)),
> + pos, rq->cmd_flags & REQ_FUA);
> + kunmap_local(p);
> if (err)
> break;
> - sector += len >> SECTOR_SHIFT;
> - transferred_bytes += len;
> - if (transferred_bytes >= max_bytes)
> + pos += len;
> + nr_bytes -= len;
> + if (!nr_bytes)
> break;
> }
> spin_unlock_irq(&nullb->lock);
> @@ -1949,7 +1949,7 @@ static int null_add_dev(struct nullb_device *dev)
> .logical_block_size = dev->blocksize,
> .physical_block_size = dev->blocksize,
> .max_hw_sectors = dev->max_sectors,
> - .dma_alignment = dev->blocksize - 1,
> + .dma_alignment = 1,
> };
>
> struct nullb *nullb;
> diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
> index 4e5728f459899..8e9648f87f7c8 100644
> --- a/drivers/block/null_blk/zoned.c
> +++ b/drivers/block/null_blk/zoned.c
> @@ -242,7 +242,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
> {
> struct nullb_device *dev = nullb->dev;
> struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
> - unsigned int nr_sectors = len >> SECTOR_SHIFT;
> + unsigned int nr_sectors = DIV_ROUND_UP(len, SECTOR_SIZE);
>
> /* Read must be below the write pointer position */
> if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
I applied this on top of 6.18-rc4 and hit the following when attempting to create a zoned
null_blk device (see the bash script for reproducing it below the crash):
[ 30.982725] BUG: unable to handle page fault for address: ffff88811f310000
[ 30.984349] #PF: supervisor write access in kernel mode
[ 30.985518] #PF: error_code(0x0003) - permissions violation
[ 30.987063] PGD 3c4e01067 P4D 3c4e01067 PUD 101022063 PMD 11f341063 PTE 800000011f310121
[ 30.989295] Oops: Oops: 0003 [#1] SMP KASAN NOPTI
[ 30.990646] CPU: 14 UID: 0 PID: 801 Comm: probe-bcache Not tainted 6.18.0-rc4_keith_nullblk #146 PREEMPT(voluntary)
[ 30.993519] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
[ 30.996599] RIP: 0010:memset_orig+0x33/0xb0
[ 30.997780] Code: 01 01 01 01 01 01 01 01 48 0f af c1 41 89 f9 41 83 e1 07 75 74 48 89 d1 48 c1 e9 06 74 39 66 0f 1f 84 00 00 00 00 00 48 ff c9 <48> 89 07 48 89 47 08 48 89 47 10 48 89 47 18 48 89 47 20 48 87
[ 31.002259] RSP: 0018:ffffc900014c7278 EFLAGS: 00010012
[ 31.003160] RAX: ffffffffffffffff RBX: 0000000000024000 RCX: 000000000000047f
[ 31.004309] RDX: 000000000001c000 RSI: 00000000000000ff RDI: ffff88811f310000
[ 31.005450] RBP: 000000000001c000 R08: 0000000000000001 R09: 0000000000000000
[ 31.006544] R10: ffff88811f306000 R11: ffff88810675e748 R12: ffff88818da63260
[ 31.007631] R13: ffff8881c8a41400 R14: ffff8881c8a41a00 R15: dffffc0000000000
[ 31.008778] FS: 00007f67fe22e780(0000) GS:ffff8892017f3000(0000) knlGS:0000000000000000
[ 31.010076] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 31.011006] CR2: ffff88811f310000 CR3: 0000000179af6000 CR4: 0000000000350ef0
[ 31.012110] Call Trace:
[ 31.012518] <TASK>
[ 31.012862] null_handle_data_transfer+0x6b1/0xb70
[ 31.013612] ? blk_mq_submit_bio+0x118a/0x1e30
[ 31.014322] null_process_cmd+0x1bc/0x260
[ 31.014964] ? __pfx_null_process_cmd+0x10/0x10
[ 31.015741] ? __pfx_mutex_lock+0x10/0x10
[ 31.016392] null_process_zoned_cmd+0x1a2/0x1070
[ 31.017144] ? blk_mq_start_request+0xaf/0x6b0
[ 31.017860] null_queue_rq+0x68d/0xc60
[ 31.018469] null_queue_rqs+0xd7/0x280
[ 31.019059] ? __pfx_null_queue_rqs+0x10/0x10
[ 31.019764] ? __pfx_submit_bio_noacct_nocheck+0x10/0x10
[ 31.020612] blk_mq_dispatch_queue_requests+0x147/0x440
[ 31.021443] blk_mq_flush_plug_list+0x184/0x670
[ 31.022186] ? mpage_readahead+0x282/0x3d0
[ 31.022847] ? __pfx_blk_mq_flush_plug_list+0x10/0x10
[ 31.023661] ? __pfx_mpage_readahead+0x10/0x10
[ 31.024386] __blk_flush_plug+0x234/0x430
[ 31.025028] ? __pfx___blk_flush_plug+0x10/0x10
[ 31.025761] blk_finish_plug+0x49/0x90
[ 31.026371] read_pages+0x368/0x7d0
[ 31.026936] ? __pfx_workingset_update_node+0x10/0x10
[ 31.027717] ? __pfx_read_pages+0x10/0x10
[ 31.028327] page_cache_ra_unbounded+0x2fd/0x660
[ 31.029054] force_page_cache_ra+0x1e3/0x300
[ 31.029732] filemap_get_pages+0x2c6/0x1310
[ 31.030412] ? __pfx__copy_to_iter+0x10/0x10
[ 31.031069] ? __pfx_filemap_get_pages+0x10/0x10
[ 31.031609] ? copy_page_to_iter+0xfc/0x170
[ 31.032106] filemap_read+0x2ec/0xa10
[ 31.032563] ? __pfx_filemap_read+0x10/0x10
[ 31.033060] ? __pfx_down_read+0x10/0x10
[ 31.033530] blkdev_read_iter+0x157/0x400
[ 31.034018] vfs_read+0x657/0x910
[ 31.034436] ? __pfx___handle_mm_fault+0x10/0x10
[ 31.034986] ? __pfx_vfs_read+0x10/0x10
[ 31.035443] ? __seccomp_filter+0xf4/0xe00
[ 31.035939] ? fdget_pos+0x53/0x4c0
[ 31.036360] ksys_read+0xee/0x1c0
[ 31.036756] ? __pfx_ksys_read+0x10/0x10
[ 31.037220] do_syscall_64+0x4d/0x200
[ 31.037661] entry_SYSCALL_64_after_hwframe+0x76/0x7e
recreate_nullblk_issue.sh:
---
#!/bin/bash
# Create a memory-backed, zoned null_blk device through configfs.
#
# Arguments:
#   $1  sector (logical block) size in bytes
#   $2  zone size in MB
#   $3  number of conventional zones
#   $4  number of sequential zones
#
# Output:
#   Prints the index N that was chosen for the nullbN configfs entry.
function create_zoned_nullb()
{
	local nid=0
	local bs=$1		# Sector size (bytes)
	local zs=$2		# Zone size (MB)
	local nr_conv=$3	# number of conventional zones
	local nr_seq=$4		# number of sequential zones
	# Keep the helpers local so they do not leak into the caller's scope.
	local cap dev

	# Total capacity in MB: zone size times the total number of zones.
	cap=$(( zs * (nr_conv + nr_seq) ))

	# Pick the first index for which no /dev/nullbN block device exists.
	while [ -b "/dev/nullb$nid" ]; do
		nid=$(( nid + 1 ))
	done

	dev="/sys/kernel/config/nullb/nullb$nid"
	mkdir "$dev" > /dev/null

	# Queue and completion settings.
	echo "$bs" > "$dev"/blocksize
	echo 0 > "$dev"/completion_nsec
	echo 0 > "$dev"/irqmode
	echo 2 > "$dev"/queue_mode
	echo 1024 > "$dev"/hw_queue_depth
	echo 1 > "$dev"/memory_backed

	# Zoned geometry.
	echo 1 > "$dev"/zoned
	echo 16 > "$dev"/zone_max_open
	echo "$cap" > "$dev"/size
	echo "$zs" > "$dev"/zone_size
	echo "$nr_conv" > "$dev"/zone_nr_conv

	# Written last, after every other attribute has been set
	# (presumably this is what brings the device online - confirm
	# against the null_blk documentation).
	echo 1 > "$dev"/power

	echo "$nid"
}
# Device geometry used for the reproduction.
BLK_SIZE=4096	# sector size in bytes
ZONE_SIZE=256	# zone size in MB
NR_CONV=0	# conventional zones
NR_SEQ=128	# sequential zones

# Create the device; the helper prints the index it picked.
nulldevid=$(create_zoned_nullb "$BLK_SIZE" "$ZONE_SIZE" "$NR_CONV" "$NR_SEQ")
nulldevpath="/dev/nullb${nulldevid}"

# Total size in MB: zone size times the number of zones.
SIZE_MB=$(( ZONE_SIZE * (NR_CONV + NR_SEQ) ))
echo "Created $nulldevpath size: $SIZE_MB MB"
--
I tried creating a conventional nullblk dev as well (bs=4096) and hit the following:
[ 645.329804] Oops: general protection fault, probably for non-canonical address 0xdffffc000000000a: 0000 [#1] SMP KASAN NOPTI
[ 645.332355] KASAN: null-ptr-deref in range [0x0000000000000050-0x0000000000000057]
[ 645.334082] CPU: 6 UID: 0 PID: 914 Comm: systemd-udevd Not tainted 6.18.0-rc4_keith_nullblk #146 PREEMPT(voluntary)
[ 645.336402] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
[ 645.338856] RIP: 0010:anon_vma_interval_tree_insert+0x141/0x4f0
[ 645.340237] Code: 04 4c 89 6b 18 48 8d 7b e0 48 89 f8 48 c1 e8 03 80 3c 28 00 0f 85 13 02 00 00 4c 8b 63 e0 49 8d 7c 24 50 48 89 f8 48 c1 e8 03 <80> 3c 28 00 0f 85 d4 01 00 00 4d 3b 7c 24 50 72 8a 4c 8d 63 08 31
[ 645.344390] RSP: 0018:ffffc90000fa7938 EFLAGS: 00010216
[ 645.345566] RAX: 000000000000000a RBX: ffff8881de72c308 RCX: ffff8881dea87210
[ 645.347161] RDX: 0000000000000001 RSI: ffff88811b7c7e58 RDI: 0000000000000050
[ 645.348781] RBP: dffffc0000000000 R08: ffff8881dea87220 R09: 1ffff1103ba49aaf
[ 645.350392] R10: ffff88811b7c7e1f R11: 000000000003ab05 R12: 0000000000000000
[ 645.352021] R13: 0000000000000112 R14: ffff8881dea87200 R15: 00000000000000f1
[ 645.353615] FS: 00007fea8d0e38c0(0000) GS:ffff8891d59f3000(0000) knlGS:0000000000000000
[ 645.355424] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 645.356744] CR2: 00007ffc49a34548 CR3: 000000014bf10000 CR4: 0000000000350ef0
[ 645.358354] Call Trace:
[ 645.358955] <TASK>
[ 645.359460] anon_vma_clone+0x1c3/0x4d0
[ 645.360373] ? kmem_cache_alloc_noprof+0x117/0x4f0
[ 645.361108] anon_vma_fork+0x70/0x5b0
[ 645.361651] dup_mmap+0xc85/0x14a0
[ 645.362170] ? __pfx_dup_mmap+0x10/0x10
[ 645.362745] ? mm_init.constprop.0+0xacb/0xfb0
[ 645.363416] ? __hrtimer_setup+0x30/0x1f0
[ 645.364024] copy_process+0x3596/0x61d0
[ 645.364594] ? init_file+0x86/0x4a0
[ 645.365121] ? alloc_empty_file+0x59/0x170
[ 645.365733] ? alloc_file_clone+0x52/0xe0
[ 645.366322] ? create_pipe_files+0x3d6/0x900
[ 645.366960] ? __pfx_copy_process+0x10/0x10
[ 645.367583] ? kvm_sched_clock_read+0x11/0x20
[ 645.368262] ? local_clock_noinstr+0xd/0xc0
[ 645.368883] ? local_clock+0x10/0x30
[ 645.369423] ? kasan_save_track+0x26/0x60
[ 645.370024] kernel_clone+0xb8/0x6c0
[ 645.370559] ? __pfx_kernel_clone+0x10/0x10
[ 645.371170] ? _raw_spin_lock_irq+0x80/0xe0
[ 645.371794] ? __pfx__raw_spin_lock_irq+0x10/0x10
[ 645.372513] ? __pfx_lockref_get+0x10/0x10
[ 645.373137] __do_sys_clone+0xb5/0x100
[ 645.373700] ? __pfx___do_sys_clone+0x10/0x10
[ 645.374341] ? syscall_trace_enter+0x8d/0x1c0
[ 645.374987] do_syscall_64+0x4d/0x200
[ 645.375547] entry_SYSCALL_64_after_hwframe+0x76/0x7e
[ 645.376306] RIP: 0033:0x7fea8d7a9b57
[ 645.376844] Code: ba 04 00 f3 0f 1e fa 64 48 8b 04 25 10 00 00 00 45 31 c0 31 d2 31 f6 bf 11 00 20 01 4c 8d 90 d0 02 00 00 b8 38 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 41 89 c0 85 c0 75 2c 64 48 8b 04 25 10 00
[ 645.379510] RSP: 002b:00007ffc49a32c98 EFLAGS: 00000246 ORIG_RAX: 0000000000000038
[ 645.380616] RAX: ffffffffffffffda RBX: 00007fea8d9ca040 RCX: 00007fea8d7a9b57
[ 645.381653] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011
[ 645.382710] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[ 645.383826] R10: 00007fea8d0e3b90 R11: 0000000000000246 R12: 0000000000000001
[ 645.384921] R13: 00007ffc49a32e70 R14: 0000000000000040 R15: 0000000000000001
[ 645.386020] </TASK>
[ 645.386377] Modules linked in:
[ 645.387059] ---[ end trace 0000000000000000 ]---
[ 645.387796] RIP: 0010:anon_vma_interval_tree_insert+0x141/0x4f0
[ 645.388609] Code: 04 4c 89 6b 18 48 8d 7b e0 48 89 f8 48 c1 e8 03 80 3c 28 00 0f 85 13 02 00 00 4c 8b 63 e0 49 8d 7c 24 50 48 89 f8 48 c1 e8 03 <80> 3c 28 00 0f 85 d4 01 00 00 4d 3b 7c 24 50 72 8a 4c 8d 63 08 31
[ 645.391996] RSP: 0018:ffffc90000fa7938 EFLAGS: 00010216
[ 645.392593] RAX: 000000000000000a RBX: ffff8881de72c308 RCX: ffff8881dea87210
[ 645.393527] RDX: 0000000000000001 RSI: ffff88811b7c7e58 RDI: 0000000000000050
[ 645.394382] RBP: dffffc0000000000 R08: ffff8881dea87220 R09: 1ffff1103ba49aaf
[ 645.395330] R10: ffff88811b7c7e1f R11: 000000000003ab05 R12: 0000000000000000
[ 645.396231] R13: 0000000000000112 R14: ffff8881dea87200 R15: 00000000000000f1
[ 645.397155] FS: 00007fea8d0e38c0(0000) GS:ffff8891d59f3000(0000) knlGS:0000000000000000
[ 645.398040] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 645.398638] CR2: 00007ffc49a34548 CR3: 000000014bf10000 CR4: 0000000000350ef0
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] null_blk: allow byte aligned memory offsets
2025-11-03 17:28 [PATCH] null_blk: allow byte aligned memory offsets Keith Busch
2025-11-04 1:48 ` Damien Le Moal
2025-11-04 9:15 ` Hans Holmberg
@ 2025-11-04 11:24 ` Christoph Hellwig
2 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2025-11-04 11:24 UTC (permalink / raw)
To: Keith Busch; +Cc: linux-block, hch, axboe, dlemoal, hans.holmberg, Keith Busch
> + offset = pos & (PAGE_SIZE - 1);
This is an open coded offset_in_page()
> + offset = pos & (PAGE_SIZE - 1);
Same.
> +static int null_transfer(struct nullb *nullb, void *p,
> + unsigned int len, bool is_write, loff_t pos,
> bool is_fua)
Maybe fix the non-standard indentation here if you touch it anyway?
> + memset(p, 0xff, len);
> + flush_dcache_page(virt_to_page(p));
> } else {
> - flush_dcache_page(page);
> - err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
> + flush_dcache_page(virt_to_page(p));
> + err = copy_to_nullb(nullb, p, pos, len, is_fua);
virt_to_page does not work when kmap actually had to map, i.e. for
highmem.
> spin_lock_irq(&nullb->lock);
> rq_for_each_segment(bvec, rq, iter) {
> + void *p = bvec_kmap_local(&bvec);;
> +
> len = bvec.bv_len;
> + if (len > nr_bytes)
> + len = nr_bytes;
> + err = null_transfer(nullb, p, nr_bytes, op_is_write(req_op(rq)),
> + pos, rq->cmd_flags & REQ_FUA);
> + kunmap_local(p);
Any reason to not keep the kmap local to null_transfer (or even the low-level
operation below it) and pass the bvec to it?
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] null_blk: allow byte aligned memory offsets
2025-11-04 9:15 ` Hans Holmberg
@ 2025-11-05 18:47 ` Keith Busch
0 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2025-11-05 18:47 UTC (permalink / raw)
To: Hans Holmberg
Cc: Keith Busch, linux-block@vger.kernel.org, hch, axboe@kernel.dk,
dlemoal@kernel.org
On Tue, Nov 04, 2025 at 09:15:15AM +0000, Hans Holmberg wrote:
> I applied this on top of 6.18-rc4 and hit the following when attempting to create a zoned
> nullblk dev(see bash script for recreating below the crash):
Thanks for checking. I spotted the mistake in the code, causing a
potential buffer overrun. Got it fixed up for the next version.
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] null_blk: allow byte aligned memory offsets
2025-11-04 1:48 ` Damien Le Moal
@ 2025-11-05 18:57 ` Keith Busch
0 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2025-11-05 18:57 UTC (permalink / raw)
To: Damien Le Moal; +Cc: Keith Busch, linux-block, hch, axboe, hans.holmberg
On Tue, Nov 04, 2025 at 10:48:25AM +0900, Damien Le Moal wrote:
> > if (valid_len) {
> > - err = copy_from_nullb(nullb, page, off,
> > - sector, valid_len);
> > - off += valid_len;
> > + err = copy_from_nullb(nullb, p, pos, valid_len);
>
> Not your fault, but if this fails, we will still do the nullb_fill_pattern()
> below which I do not think is correct... ? May be we should have:
>
> if (err)
> return err;
>
> here ? But not sure if we should still call flush_dcache_page() even on error
> though.
It does look odd. copy_from_nullb() only returns success though, so
maybe we just drop the return value entirely.
> > @@ -1276,25 +1275,26 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
> > struct nullb *nullb = cmd->nq->dev->nullb;
> > int err = 0;
> > unsigned int len;
> > - sector_t sector = blk_rq_pos(rq);
> > - unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
> > - unsigned int transferred_bytes = 0;
> > + loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
> > + unsigned int nr_bytes = nr_sectors << SECTOR_SHIFT;
>
> Overflow potential here ?
Should be okay: nr_sectors comes from blk_rq_sectors().
The same calculation already exists just above, I just changed the name.
Actually, I don't know why I changed it, so I'll leave it alone in the
next version.
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2025-11-05 18:57 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2025-11-03 17:28 [PATCH] null_blk: allow byte aligned memory offsets Keith Busch
2025-11-04 1:48 ` Damien Le Moal
2025-11-05 18:57 ` Keith Busch
2025-11-04 9:15 ` Hans Holmberg
2025-11-05 18:47 ` Keith Busch
2025-11-04 11:24 ` Christoph Hellwig
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).