linux-block.vger.kernel.org archive mirror
* [PATCH] null_blk: allow byte aligned memory offsets
@ 2025-11-03 17:28 Keith Busch
  2025-11-04  1:48 ` Damien Le Moal
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Keith Busch @ 2025-11-03 17:28 UTC (permalink / raw)
  To: linux-block, hch, axboe, dlemoal, hans.holmberg; +Cc: Keith Busch

From: Keith Busch <kbusch@kernel.org>

Allowing byte aligned memory offsets provides a nice testing ground for
direct-io. This has the added benefit of a single kmap/kunmap per bio
segment rather than one per page for each multi-page segment.

Signed-off-by: Keith Busch <kbusch@kernel.org>
---
 drivers/block/null_blk/main.c  | 84 +++++++++++++++++-----------------
 drivers/block/null_blk/zoned.c |  2 +-
 2 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 0ee55f889cfdd..2227f6db5d3d5 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -1129,40 +1129,42 @@ static int null_make_cache_space(struct nullb *nullb, unsigned long n)
 	return 0;
 }
 
-static int copy_to_nullb(struct nullb *nullb, struct page *source,
-	unsigned int off, sector_t sector, size_t n, bool is_fua)
+static int copy_to_nullb(struct nullb *nullb, void *source, loff_t pos,
+			 size_t n, bool is_fua)
 {
 	size_t temp, count = 0;
 	unsigned int offset;
 	struct nullb_page *t_page;
+	sector_t sector;
 
 	while (count < n) {
+		sector = pos >> SECTOR_SHIFT;
 		temp = min_t(size_t, nullb->dev->blocksize, n - count);
 
 		if (null_cache_active(nullb) && !is_fua)
 			null_make_cache_space(nullb, PAGE_SIZE);
 
-		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+		offset = pos & (PAGE_SIZE - 1);
 		t_page = null_insert_page(nullb, sector,
 			!null_cache_active(nullb) || is_fua);
 		if (!t_page)
 			return -ENOSPC;
 
-		memcpy_page(t_page->page, offset, source, off + count, temp);
+		memcpy_to_page(t_page->page, offset, source, temp);
 
 		__set_bit(sector & SECTOR_MASK, t_page->bitmap);
 
 		if (is_fua)
 			null_free_sector(nullb, sector, true);
 
+		source += temp;
 		count += temp;
-		sector += temp >> SECTOR_SHIFT;
+		pos += temp;
 	}
 	return 0;
 }
 
-static int copy_from_nullb(struct nullb *nullb, struct page *dest,
-	unsigned int off, sector_t sector, size_t n)
+static int copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos, size_t n)
 {
 	size_t temp, count = 0;
 	unsigned int offset;
@@ -1171,28 +1173,22 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest,
 	while (count < n) {
 		temp = min_t(size_t, nullb->dev->blocksize, n - count);
 
-		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
-		t_page = null_lookup_page(nullb, sector, false,
+		offset = pos & (PAGE_SIZE - 1);
+		t_page = null_lookup_page(nullb, pos >> SECTOR_SHIFT, false,
 			!null_cache_active(nullb));
 
 		if (t_page)
-			memcpy_page(dest, off + count, t_page->page, offset,
-				    temp);
+			memcpy_from_page(dest, t_page->page, offset, temp);
 		else
-			memzero_page(dest, off + count, temp);
+			memset(dest, 0, temp);
 
+		dest += temp;
 		count += temp;
-		sector += temp >> SECTOR_SHIFT;
+		pos += temp;
 	}
 	return 0;
 }
 
-static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
-			       unsigned int len, unsigned int off)
-{
-	memset_page(page, off, 0xff, len);
-}
-
 blk_status_t null_handle_discard(struct nullb_device *dev,
 				 sector_t sector, sector_t nr_sectors)
 {
@@ -1234,8 +1230,8 @@ static blk_status_t null_handle_flush(struct nullb *nullb)
 	return errno_to_blk_status(err);
 }
 
-static int null_transfer(struct nullb *nullb, struct page *page,
-	unsigned int len, unsigned int off, bool is_write, sector_t sector,
+static int null_transfer(struct nullb *nullb, void *p,
+	unsigned int len, bool is_write, loff_t pos,
 	bool is_fua)
 {
 	struct nullb_device *dev = nullb->dev;
@@ -1243,23 +1239,26 @@ static int null_transfer(struct nullb *nullb, struct page *page,
 	int err = 0;
 
 	if (!is_write) {
-		if (dev->zoned)
+		if (dev->zoned) {
 			valid_len = null_zone_valid_read_len(nullb,
-				sector, len);
+				pos >> SECTOR_SHIFT, len);
+
+			if (valid_len && valid_len != len)
+				valid_len -= (pos & (SECTOR_SIZE - 1));
+		}
 
 		if (valid_len) {
-			err = copy_from_nullb(nullb, page, off,
-				sector, valid_len);
-			off += valid_len;
+			err = copy_from_nullb(nullb, p, pos, valid_len);
+			p += valid_len;
 			len -= valid_len;
 		}
 
 		if (len)
-			nullb_fill_pattern(nullb, page, len, off);
-		flush_dcache_page(page);
+			memset(p, 0xff, len);
+		flush_dcache_page(virt_to_page(p));
 	} else {
-		flush_dcache_page(page);
-		err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
+		flush_dcache_page(virt_to_page(p));
+		err = copy_to_nullb(nullb, p, pos, len, is_fua);
 	}
 
 	return err;
@@ -1276,25 +1275,26 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
 	struct nullb *nullb = cmd->nq->dev->nullb;
 	int err = 0;
 	unsigned int len;
-	sector_t sector = blk_rq_pos(rq);
-	unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
-	unsigned int transferred_bytes = 0;
+	loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
+	unsigned int nr_bytes = nr_sectors << SECTOR_SHIFT;
 	struct req_iterator iter;
 	struct bio_vec bvec;
 
 	spin_lock_irq(&nullb->lock);
 	rq_for_each_segment(bvec, rq, iter) {
+		void *p = bvec_kmap_local(&bvec);
+
 		len = bvec.bv_len;
-		if (transferred_bytes + len > max_bytes)
-			len = max_bytes - transferred_bytes;
-		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
-				     op_is_write(req_op(rq)), sector,
-				     rq->cmd_flags & REQ_FUA);
+		if (len > nr_bytes)
+			len = nr_bytes;
+		err = null_transfer(nullb, p, nr_bytes, op_is_write(req_op(rq)),
+				    pos, rq->cmd_flags & REQ_FUA);
+		kunmap_local(p);
 		if (err)
 			break;
-		sector += len >> SECTOR_SHIFT;
-		transferred_bytes += len;
-		if (transferred_bytes >= max_bytes)
+		pos += len;
+		nr_bytes -= len;
+		if (!nr_bytes)
 			break;
 	}
 	spin_unlock_irq(&nullb->lock);
@@ -1949,7 +1949,7 @@ static int null_add_dev(struct nullb_device *dev)
 		.logical_block_size	= dev->blocksize,
 		.physical_block_size	= dev->blocksize,
 		.max_hw_sectors		= dev->max_sectors,
-		.dma_alignment		= dev->blocksize - 1,
+		.dma_alignment		= 1,
 	};
 
 	struct nullb *nullb;
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 4e5728f459899..8e9648f87f7c8 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -242,7 +242,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
 {
 	struct nullb_device *dev = nullb->dev;
 	struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
-	unsigned int nr_sectors = len >> SECTOR_SHIFT;
+	unsigned int nr_sectors = DIV_ROUND_UP(len, SECTOR_SIZE);
 
 	/* Read must be below the write pointer position */
 	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
-- 
2.47.3


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH] null_blk: allow byte aligned memory offsets
  2025-11-03 17:28 [PATCH] null_blk: allow byte aligned memory offsets Keith Busch
@ 2025-11-04  1:48 ` Damien Le Moal
  2025-11-05 18:57   ` Keith Busch
  2025-11-04  9:15 ` Hans Holmberg
  2025-11-04 11:24 ` Christoph Hellwig
  2 siblings, 1 reply; 6+ messages in thread
From: Damien Le Moal @ 2025-11-04  1:48 UTC (permalink / raw)
  To: Keith Busch, linux-block, hch, axboe, hans.holmberg; +Cc: Keith Busch

On 11/4/25 02:28, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Allowing byte aligned memory offsets provides a nice testing ground for
> direct-io. This has the added benefit of a single kmap/kunmap per bio
> segment rather than one per page for each multi-page segment.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>

Overall looks good to me. A few nits below.

> +static int copy_to_nullb(struct nullb *nullb, void *source, loff_t pos,
> +			 size_t n, bool is_fua)
>  {
>  	size_t temp, count = 0;
>  	unsigned int offset;
>  	struct nullb_page *t_page;
> +	sector_t sector;
>  
>  	while (count < n) {
> +		sector = pos >> SECTOR_SHIFT;
>  		temp = min_t(size_t, nullb->dev->blocksize, n - count);
>  
>  		if (null_cache_active(nullb) && !is_fua)
>  			null_make_cache_space(nullb, PAGE_SIZE);
>  
> -		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
> +		offset = pos & (PAGE_SIZE - 1);

Offset is only used in the memcpy_to_page() call below, so maybe move this line
down, or just completely remove that local variable since it has little value?
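
For example, just dropping it entirely (untested):

		memcpy_to_page(t_page->page, pos & (PAGE_SIZE - 1), source, temp);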

>  		t_page = null_insert_page(nullb, sector,
>  			!null_cache_active(nullb) || is_fua);
>  		if (!t_page)
>  			return -ENOSPC;

[...]

> -static int copy_from_nullb(struct nullb *nullb, struct page *dest,
> -	unsigned int off, sector_t sector, size_t n)
> +static int copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos, size_t n)
>  {
>  	size_t temp, count = 0;
>  	unsigned int offset;
> @@ -1171,28 +1173,22 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest,
>  	while (count < n) {
>  		temp = min_t(size_t, nullb->dev->blocksize, n - count);
>  
> -		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
> -		t_page = null_lookup_page(nullb, sector, false,
> +		offset = pos & (PAGE_SIZE - 1);

Same comment here.

> +		t_page = null_lookup_page(nullb, pos >> SECTOR_SHIFT, false,
>  			!null_cache_active(nullb));
>  
>  		if (t_page)
> -			memcpy_page(dest, off + count, t_page->page, offset,
> -				    temp);
> +			memcpy_from_page(dest, t_page->page, offset, temp);
>  		else
> -			memzero_page(dest, off + count, temp);
> +			memset(dest, 0, temp);
>  
> +		dest += temp;
>  		count += temp;
> -		sector += temp >> SECTOR_SHIFT;
> +		pos += temp;
>  	}
>  	return 0;
>  }

[...]

> -static int null_transfer(struct nullb *nullb, struct page *page,
> -	unsigned int len, unsigned int off, bool is_write, sector_t sector,
> +static int null_transfer(struct nullb *nullb, void *p,
> +	unsigned int len, bool is_write, loff_t pos,
>  	bool is_fua)
>  {
>  	struct nullb_device *dev = nullb->dev;
> @@ -1243,23 +1239,26 @@ static int null_transfer(struct nullb *nullb, struct page *page,
>  	int err = 0;
>  
>  	if (!is_write) {
> -		if (dev->zoned)
> +		if (dev->zoned) {
>  			valid_len = null_zone_valid_read_len(nullb,
> -				sector, len);
> +				pos >> SECTOR_SHIFT, len);
> +
> +			if (valid_len && valid_len != len)
> +				valid_len -= (pos & (SECTOR_SIZE - 1));

I do not think you need the outer parentheses here.
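
That is, just:

			valid_len -= pos & (SECTOR_SIZE - 1);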

> +		}
>  
>  		if (valid_len) {
> -			err = copy_from_nullb(nullb, page, off,
> -				sector, valid_len);
> -			off += valid_len;
> +			err = copy_from_nullb(nullb, p, pos, valid_len);

Not your fault, but if this fails, we will still do the nullb_fill_pattern()
below, which I do not think is correct. Maybe we should have:

			if (err)
				return err;

here? But I am not sure if we should still call flush_dcache_page() on error,
though.

> +			p += valid_len;
>  			len -= valid_len;
>  		}
>  
>  		if (len)
> -			nullb_fill_pattern(nullb, page, len, off);
> -		flush_dcache_page(page);
> +			memset(p, 0xff, len);
> +		flush_dcache_page(virt_to_page(p));
>  	} else {
> -		flush_dcache_page(page);
> -		err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
> +		flush_dcache_page(virt_to_page(p));
> +		err = copy_to_nullb(nullb, p, pos, len, is_fua);

Nit: this could be "return copy_to_nullb();"

>  	}
>  
>  	return err;
> @@ -1276,25 +1275,26 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
>  	struct nullb *nullb = cmd->nq->dev->nullb;
>  	int err = 0;
>  	unsigned int len;
> -	sector_t sector = blk_rq_pos(rq);
> -	unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
> -	unsigned int transferred_bytes = 0;
> +	loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
> +	unsigned int nr_bytes = nr_sectors << SECTOR_SHIFT;

Overflow potential here?

>  	struct req_iterator iter;
>  	struct bio_vec bvec;
>  
>  	spin_lock_irq(&nullb->lock);
>  	rq_for_each_segment(bvec, rq, iter) {
> +		void *p = bvec_kmap_local(&bvec);
> +
>  		len = bvec.bv_len;
> -		if (transferred_bytes + len > max_bytes)
> -			len = max_bytes - transferred_bytes;
> -		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
> -				     op_is_write(req_op(rq)), sector,
> -				     rq->cmd_flags & REQ_FUA);
> +		if (len > nr_bytes)
> +			len = nr_bytes;
> +		err = null_transfer(nullb, p, nr_bytes, op_is_write(req_op(rq)),
> +				    pos, rq->cmd_flags & REQ_FUA);
> +		kunmap_local(p);
>  		if (err)
>  			break;
> -		sector += len >> SECTOR_SHIFT;
> -		transferred_bytes += len;
> -		if (transferred_bytes >= max_bytes)
> +		pos += len;
> +		nr_bytes -= len;
> +		if (!nr_bytes)
>  			break;
>  	}
>  	spin_unlock_irq(&nullb->lock);



-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] null_blk: allow byte aligned memory offsets
  2025-11-03 17:28 [PATCH] null_blk: allow byte aligned memory offsets Keith Busch
  2025-11-04  1:48 ` Damien Le Moal
@ 2025-11-04  9:15 ` Hans Holmberg
  2025-11-05 18:47   ` Keith Busch
  2025-11-04 11:24 ` Christoph Hellwig
  2 siblings, 1 reply; 6+ messages in thread
From: Hans Holmberg @ 2025-11-04  9:15 UTC (permalink / raw)
  To: Keith Busch, linux-block@vger.kernel.org, hch, axboe@kernel.dk,
	dlemoal@kernel.org
  Cc: Keith Busch

On 03/11/2025 18:29, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> Allowing byte aligned memory offsets provides a nice testing ground for
> direct-io. This has the added benefit of a single kmap/kunmap per bio
> segment rather than one per page for each multi-page segment.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>  drivers/block/null_blk/main.c  | 84 +++++++++++++++++-----------------
>  drivers/block/null_blk/zoned.c |  2 +-
>  2 files changed, 43 insertions(+), 43 deletions(-)
> 
> diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
> index 0ee55f889cfdd..2227f6db5d3d5 100644
> --- a/drivers/block/null_blk/main.c
> +++ b/drivers/block/null_blk/main.c
> @@ -1129,40 +1129,42 @@ static int null_make_cache_space(struct nullb *nullb, unsigned long n)
>  	return 0;
>  }
>  
> -static int copy_to_nullb(struct nullb *nullb, struct page *source,
> -	unsigned int off, sector_t sector, size_t n, bool is_fua)
> +static int copy_to_nullb(struct nullb *nullb, void *source, loff_t pos,
> +			 size_t n, bool is_fua)
>  {
>  	size_t temp, count = 0;
>  	unsigned int offset;
>  	struct nullb_page *t_page;
> +	sector_t sector;
>  
>  	while (count < n) {
> +		sector = pos >> SECTOR_SHIFT;
>  		temp = min_t(size_t, nullb->dev->blocksize, n - count);
>  
>  		if (null_cache_active(nullb) && !is_fua)
>  			null_make_cache_space(nullb, PAGE_SIZE);
>  
> -		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
> +		offset = pos & (PAGE_SIZE - 1);
>  		t_page = null_insert_page(nullb, sector,
>  			!null_cache_active(nullb) || is_fua);
>  		if (!t_page)
>  			return -ENOSPC;
>  
> -		memcpy_page(t_page->page, offset, source, off + count, temp);
> +		memcpy_to_page(t_page->page, offset, source, temp);
>  
>  		__set_bit(sector & SECTOR_MASK, t_page->bitmap);
>  
>  		if (is_fua)
>  			null_free_sector(nullb, sector, true);
>  
> +		source += temp;
>  		count += temp;
> -		sector += temp >> SECTOR_SHIFT;
> +		pos += temp;
>  	}
>  	return 0;
>  }
>  
> -static int copy_from_nullb(struct nullb *nullb, struct page *dest,
> -	unsigned int off, sector_t sector, size_t n)
> +static int copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos, size_t n)
>  {
>  	size_t temp, count = 0;
>  	unsigned int offset;
> @@ -1171,28 +1173,22 @@ static int copy_from_nullb(struct nullb *nullb, struct page *dest,
>  	while (count < n) {
>  		temp = min_t(size_t, nullb->dev->blocksize, n - count);
>  
> -		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
> -		t_page = null_lookup_page(nullb, sector, false,
> +		offset = pos & (PAGE_SIZE - 1);
> +		t_page = null_lookup_page(nullb, pos >> SECTOR_SHIFT, false,
>  			!null_cache_active(nullb));
>  
>  		if (t_page)
> -			memcpy_page(dest, off + count, t_page->page, offset,
> -				    temp);
> +			memcpy_from_page(dest, t_page->page, offset, temp);
>  		else
> -			memzero_page(dest, off + count, temp);
> +			memset(dest, 0, temp);
>  
> +		dest += temp;
>  		count += temp;
> -		sector += temp >> SECTOR_SHIFT;
> +		pos += temp;
>  	}
>  	return 0;
>  }
>  
> -static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
> -			       unsigned int len, unsigned int off)
> -{
> -	memset_page(page, off, 0xff, len);
> -}
> -
>  blk_status_t null_handle_discard(struct nullb_device *dev,
>  				 sector_t sector, sector_t nr_sectors)
>  {
> @@ -1234,8 +1230,8 @@ static blk_status_t null_handle_flush(struct nullb *nullb)
>  	return errno_to_blk_status(err);
>  }
>  
> -static int null_transfer(struct nullb *nullb, struct page *page,
> -	unsigned int len, unsigned int off, bool is_write, sector_t sector,
> +static int null_transfer(struct nullb *nullb, void *p,
> +	unsigned int len, bool is_write, loff_t pos,
>  	bool is_fua)
>  {
>  	struct nullb_device *dev = nullb->dev;
> @@ -1243,23 +1239,26 @@ static int null_transfer(struct nullb *nullb, struct page *page,
>  	int err = 0;
>  
>  	if (!is_write) {
> -		if (dev->zoned)
> +		if (dev->zoned) {
>  			valid_len = null_zone_valid_read_len(nullb,
> -				sector, len);
> +				pos >> SECTOR_SHIFT, len);
> +
> +			if (valid_len && valid_len != len)
> +				valid_len -= (pos & (SECTOR_SIZE - 1));
> +		}
>  
>  		if (valid_len) {
> -			err = copy_from_nullb(nullb, page, off,
> -				sector, valid_len);
> -			off += valid_len;
> +			err = copy_from_nullb(nullb, p, pos, valid_len);
> +			p += valid_len;
>  			len -= valid_len;
>  		}
>  
>  		if (len)
> -			nullb_fill_pattern(nullb, page, len, off);
> -		flush_dcache_page(page);
> +			memset(p, 0xff, len);
> +		flush_dcache_page(virt_to_page(p));
>  	} else {
> -		flush_dcache_page(page);
> -		err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
> +		flush_dcache_page(virt_to_page(p));
> +		err = copy_to_nullb(nullb, p, pos, len, is_fua);
>  	}
>  
>  	return err;
> @@ -1276,25 +1275,26 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
>  	struct nullb *nullb = cmd->nq->dev->nullb;
>  	int err = 0;
>  	unsigned int len;
> -	sector_t sector = blk_rq_pos(rq);
> -	unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
> -	unsigned int transferred_bytes = 0;
> +	loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
> +	unsigned int nr_bytes = nr_sectors << SECTOR_SHIFT;
>  	struct req_iterator iter;
>  	struct bio_vec bvec;
>  
>  	spin_lock_irq(&nullb->lock);
>  	rq_for_each_segment(bvec, rq, iter) {
> +		void *p = bvec_kmap_local(&bvec);
> +
>  		len = bvec.bv_len;
> -		if (transferred_bytes + len > max_bytes)
> -			len = max_bytes - transferred_bytes;
> -		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
> -				     op_is_write(req_op(rq)), sector,
> -				     rq->cmd_flags & REQ_FUA);
> +		if (len > nr_bytes)
> +			len = nr_bytes;
> +		err = null_transfer(nullb, p, nr_bytes, op_is_write(req_op(rq)),
> +				    pos, rq->cmd_flags & REQ_FUA);
> +		kunmap_local(p);
>  		if (err)
>  			break;
> -		sector += len >> SECTOR_SHIFT;
> -		transferred_bytes += len;
> -		if (transferred_bytes >= max_bytes)
> +		pos += len;
> +		nr_bytes -= len;
> +		if (!nr_bytes)
>  			break;
>  	}
>  	spin_unlock_irq(&nullb->lock);
> @@ -1949,7 +1949,7 @@ static int null_add_dev(struct nullb_device *dev)
>  		.logical_block_size	= dev->blocksize,
>  		.physical_block_size	= dev->blocksize,
>  		.max_hw_sectors		= dev->max_sectors,
> -		.dma_alignment		= dev->blocksize - 1,
> +		.dma_alignment		= 1,
>  	};
>  
>  	struct nullb *nullb;
> diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
> index 4e5728f459899..8e9648f87f7c8 100644
> --- a/drivers/block/null_blk/zoned.c
> +++ b/drivers/block/null_blk/zoned.c
> @@ -242,7 +242,7 @@ size_t null_zone_valid_read_len(struct nullb *nullb,
>  {
>  	struct nullb_device *dev = nullb->dev;
>  	struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
> -	unsigned int nr_sectors = len >> SECTOR_SHIFT;
> +	unsigned int nr_sectors = DIV_ROUND_UP(len, SECTOR_SIZE);
>  
>  	/* Read must be below the write pointer position */
>  	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||

I applied this on top of 6.18-rc4 and hit the following when attempting to
create a zoned nullblk dev (the bash script for reproducing is below the crash):

[   30.982725] BUG: unable to handle page fault for address: ffff88811f310000
[   30.984349] #PF: supervisor write access in kernel mode
[   30.985518] #PF: error_code(0x0003) - permissions violation
[   30.987063] PGD 3c4e01067 P4D 3c4e01067 PUD 101022063 PMD 11f341063 PTE 800000011f310121
[   30.989295] Oops: Oops: 0003 [#1] SMP KASAN NOPTI
[   30.990646] CPU: 14 UID: 0 PID: 801 Comm: probe-bcache Not tainted 6.18.0-rc4_keith_nullblk #146 PREEMPT(voluntary)
[   30.993519] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
[   30.996599] RIP: 0010:memset_orig+0x33/0xb0
[   30.997780] Code: 01 01 01 01 01 01 01 01 48 0f af c1 41 89 f9 41 83 e1 07 75 74 48 89 d1 48 c1 e9 06 74 39 66 0f 1f 84 00 00 00 00 00 48 ff c9 <48> 89 07 48 89 47 08 48 89 47 10 48 89 47 18 48 89 47 20 48 87
[   31.002259] RSP: 0018:ffffc900014c7278 EFLAGS: 00010012
[   31.003160] RAX: ffffffffffffffff RBX: 0000000000024000 RCX: 000000000000047f
[   31.004309] RDX: 000000000001c000 RSI: 00000000000000ff RDI: ffff88811f310000
[   31.005450] RBP: 000000000001c000 R08: 0000000000000001 R09: 0000000000000000
[   31.006544] R10: ffff88811f306000 R11: ffff88810675e748 R12: ffff88818da63260
[   31.007631] R13: ffff8881c8a41400 R14: ffff8881c8a41a00 R15: dffffc0000000000
[   31.008778] FS:  00007f67fe22e780(0000) GS:ffff8892017f3000(0000) knlGS:0000000000000000
[   31.010076] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   31.011006] CR2: ffff88811f310000 CR3: 0000000179af6000 CR4: 0000000000350ef0
[   31.012110] Call Trace:
[   31.012518]  <TASK>
[   31.012862]  null_handle_data_transfer+0x6b1/0xb70
[   31.013612]  ? blk_mq_submit_bio+0x118a/0x1e30
[   31.014322]  null_process_cmd+0x1bc/0x260
[   31.014964]  ? __pfx_null_process_cmd+0x10/0x10
[   31.015741]  ? __pfx_mutex_lock+0x10/0x10
[   31.016392]  null_process_zoned_cmd+0x1a2/0x1070
[   31.017144]  ? blk_mq_start_request+0xaf/0x6b0
[   31.017860]  null_queue_rq+0x68d/0xc60
[   31.018469]  null_queue_rqs+0xd7/0x280
[   31.019059]  ? __pfx_null_queue_rqs+0x10/0x10
[   31.019764]  ? __pfx_submit_bio_noacct_nocheck+0x10/0x10
[   31.020612]  blk_mq_dispatch_queue_requests+0x147/0x440
[   31.021443]  blk_mq_flush_plug_list+0x184/0x670
[   31.022186]  ? mpage_readahead+0x282/0x3d0
[   31.022847]  ? __pfx_blk_mq_flush_plug_list+0x10/0x10
[   31.023661]  ? __pfx_mpage_readahead+0x10/0x10
[   31.024386]  __blk_flush_plug+0x234/0x430
[   31.025028]  ? __pfx___blk_flush_plug+0x10/0x10
[   31.025761]  blk_finish_plug+0x49/0x90
[   31.026371]  read_pages+0x368/0x7d0
[   31.026936]  ? __pfx_workingset_update_node+0x10/0x10
[   31.027717]  ? __pfx_read_pages+0x10/0x10
[   31.028327]  page_cache_ra_unbounded+0x2fd/0x660
[   31.029054]  force_page_cache_ra+0x1e3/0x300
[   31.029732]  filemap_get_pages+0x2c6/0x1310
[   31.030412]  ? __pfx__copy_to_iter+0x10/0x10
[   31.031069]  ? __pfx_filemap_get_pages+0x10/0x10
[   31.031609]  ? copy_page_to_iter+0xfc/0x170
[   31.032106]  filemap_read+0x2ec/0xa10
[   31.032563]  ? __pfx_filemap_read+0x10/0x10
[   31.033060]  ? __pfx_down_read+0x10/0x10
[   31.033530]  blkdev_read_iter+0x157/0x400
[   31.034018]  vfs_read+0x657/0x910
[   31.034436]  ? __pfx___handle_mm_fault+0x10/0x10
[   31.034986]  ? __pfx_vfs_read+0x10/0x10
[   31.035443]  ? __seccomp_filter+0xf4/0xe00
[   31.035939]  ? fdget_pos+0x53/0x4c0
[   31.036360]  ksys_read+0xee/0x1c0
[   31.036756]  ? __pfx_ksys_read+0x10/0x10
[   31.037220]  do_syscall_64+0x4d/0x200
[   31.037661]  entry_SYSCALL_64_after_hwframe+0x76/0x7e

recreate_nullblk_issue.sh:
---
#!/bin/bash

function create_zoned_nullb()
{
        local nid=0
        local bs=$1      # sector size (bytes)
        local zs=$2      # zone size (MB)
        local nr_conv=$3 # number of conventional zones
        local nr_seq=$4  # number of sequential zones

        cap=$(( zs * (nr_conv + nr_seq) ))

        while [ 1 ]; do
                if [ ! -b "/dev/nullb$nid" ]; then
                        break
                fi
                nid=$(( nid + 1 ))
        done

        dev="/sys/kernel/config/nullb/nullb$nid"
        mkdir "$dev" > /dev/null

        echo $bs > "$dev"/blocksize
        echo 0 > "$dev"/completion_nsec
        echo 0 > "$dev"/irqmode
        echo 2 > "$dev"/queue_mode
        echo 1024 > "$dev"/hw_queue_depth
        echo 1 > "$dev"/memory_backed
        echo 1 > "$dev"/zoned
        echo 16 > "$dev"/zone_max_open
        echo $cap > "$dev"/size
        echo $zs > "$dev"/zone_size
        echo $nr_conv > "$dev"/zone_nr_conv
        echo 1 > "$dev"/power

        echo "$nid"
}


BLK_SIZE=4096
ZONE_SIZE=256
NR_CONV=0
NR_SEQ=128

nulldevid=$(create_zoned_nullb $BLK_SIZE $ZONE_SIZE $NR_CONV $NR_SEQ)
nulldevpath=/dev/nullb$nulldevid

SIZE_MB=$(( $ZONE_SIZE * ($NR_CONV + $NR_SEQ) ))
echo "Created $nulldevpath size: $SIZE_MB MB"
--

I tried creating a conventional nullblk dev as well (bs=4096) and hit the following:


[  645.329804] Oops: general protection fault, probably for non-canonical address 0xdffffc000000000a: 0000 [#1] SMP KASAN NOPTI
[  645.332355] KASAN: null-ptr-deref in range [0x0000000000000050-0x0000000000000057]
[  645.334082] CPU: 6 UID: 0 PID: 914 Comm: systemd-udevd Not tainted 6.18.0-rc4_keith_nullblk #146 PREEMPT(voluntary) 
[  645.336402] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014
[  645.338856] RIP: 0010:anon_vma_interval_tree_insert+0x141/0x4f0
[  645.340237] Code: 04 4c 89 6b 18 48 8d 7b e0 48 89 f8 48 c1 e8 03 80 3c 28 00 0f 85 13 02 00 00 4c 8b 63 e0 49 8d 7c 24 50 48 89 f8 48 c1 e8 03 <80> 3c 28 00 0f 85 d4 01 00 00 4d 3b 7c 24 50 72 8a 4c 8d 63 08 31
[  645.344390] RSP: 0018:ffffc90000fa7938 EFLAGS: 00010216
[  645.345566] RAX: 000000000000000a RBX: ffff8881de72c308 RCX: ffff8881dea87210
[  645.347161] RDX: 0000000000000001 RSI: ffff88811b7c7e58 RDI: 0000000000000050
[  645.348781] RBP: dffffc0000000000 R08: ffff8881dea87220 R09: 1ffff1103ba49aaf
[  645.350392] R10: ffff88811b7c7e1f R11: 000000000003ab05 R12: 0000000000000000
[  645.352021] R13: 0000000000000112 R14: ffff8881dea87200 R15: 00000000000000f1
[  645.353615] FS:  00007fea8d0e38c0(0000) GS:ffff8891d59f3000(0000) knlGS:0000000000000000
[  645.355424] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  645.356744] CR2: 00007ffc49a34548 CR3: 000000014bf10000 CR4: 0000000000350ef0
[  645.358354] Call Trace:
[  645.358955]  <TASK>
[  645.359460]  anon_vma_clone+0x1c3/0x4d0
[  645.360373]  ? kmem_cache_alloc_noprof+0x117/0x4f0
[  645.361108]  anon_vma_fork+0x70/0x5b0
[  645.361651]  dup_mmap+0xc85/0x14a0
[  645.362170]  ? __pfx_dup_mmap+0x10/0x10
[  645.362745]  ? mm_init.constprop.0+0xacb/0xfb0
[  645.363416]  ? __hrtimer_setup+0x30/0x1f0
[  645.364024]  copy_process+0x3596/0x61d0
[  645.364594]  ? init_file+0x86/0x4a0
[  645.365121]  ? alloc_empty_file+0x59/0x170
[  645.365733]  ? alloc_file_clone+0x52/0xe0
[  645.366322]  ? create_pipe_files+0x3d6/0x900
[  645.366960]  ? __pfx_copy_process+0x10/0x10
[  645.367583]  ? kvm_sched_clock_read+0x11/0x20
[  645.368262]  ? local_clock_noinstr+0xd/0xc0
[  645.368883]  ? local_clock+0x10/0x30
[  645.369423]  ? kasan_save_track+0x26/0x60
[  645.370024]  kernel_clone+0xb8/0x6c0
[  645.370559]  ? __pfx_kernel_clone+0x10/0x10
[  645.371170]  ? _raw_spin_lock_irq+0x80/0xe0
[  645.371794]  ? __pfx__raw_spin_lock_irq+0x10/0x10
[  645.372513]  ? __pfx_lockref_get+0x10/0x10
[  645.373137]  __do_sys_clone+0xb5/0x100
[  645.373700]  ? __pfx___do_sys_clone+0x10/0x10
[  645.374341]  ? syscall_trace_enter+0x8d/0x1c0
[  645.374987]  do_syscall_64+0x4d/0x200
[  645.375547]  entry_SYSCALL_64_after_hwframe+0x76/0x7e
[  645.376306] RIP: 0033:0x7fea8d7a9b57
[  645.376844] Code: ba 04 00 f3 0f 1e fa 64 48 8b 04 25 10 00 00 00 45 31 c0 31 d2 31 f6 bf 11 00 20 01 4c 8d 90 d0 02 00 00 b8 38 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 41 41 89 c0 85 c0 75 2c 64 48 8b 04 25 10 00
[  645.379510] RSP: 002b:00007ffc49a32c98 EFLAGS: 00000246 ORIG_RAX: 0000000000000038
[  645.380616] RAX: ffffffffffffffda RBX: 00007fea8d9ca040 RCX: 00007fea8d7a9b57
[  645.381653] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000001200011
[  645.382710] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
[  645.383826] R10: 00007fea8d0e3b90 R11: 0000000000000246 R12: 0000000000000001
[  645.384921] R13: 00007ffc49a32e70 R14: 0000000000000040 R15: 0000000000000001
[  645.386020]  </TASK>
[  645.386377] Modules linked in:
[  645.387059] ---[ end trace 0000000000000000 ]---
[  645.387796] RIP: 0010:anon_vma_interval_tree_insert+0x141/0x4f0
[  645.388609] Code: 04 4c 89 6b 18 48 8d 7b e0 48 89 f8 48 c1 e8 03 80 3c 28 00 0f 85 13 02 00 00 4c 8b 63 e0 49 8d 7c 24 50 48 89 f8 48 c1 e8 03 <80> 3c 28 00 0f 85 d4 01 00 00 4d 3b 7c 24 50 72 8a 4c 8d 63 08 31
[  645.391996] RSP: 0018:ffffc90000fa7938 EFLAGS: 00010216
[  645.392593] RAX: 000000000000000a RBX: ffff8881de72c308 RCX: ffff8881dea87210
[  645.393527] RDX: 0000000000000001 RSI: ffff88811b7c7e58 RDI: 0000000000000050
[  645.394382] RBP: dffffc0000000000 R08: ffff8881dea87220 R09: 1ffff1103ba49aaf
[  645.395330] R10: ffff88811b7c7e1f R11: 000000000003ab05 R12: 0000000000000000
[  645.396231] R13: 0000000000000112 R14: ffff8881dea87200 R15: 00000000000000f1
[  645.397155] FS:  00007fea8d0e38c0(0000) GS:ffff8891d59f3000(0000) knlGS:0000000000000000
[  645.398040] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  645.398638] CR2: 00007ffc49a34548 CR3: 000000014bf10000 CR4: 0000000000350ef0

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] null_blk: allow byte aligned memory offsets
  2025-11-03 17:28 [PATCH] null_blk: allow byte aligned memory offsets Keith Busch
  2025-11-04  1:48 ` Damien Le Moal
  2025-11-04  9:15 ` Hans Holmberg
@ 2025-11-04 11:24 ` Christoph Hellwig
  2 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2025-11-04 11:24 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, hch, axboe, dlemoal, hans.holmberg, Keith Busch

> +		offset = pos & (PAGE_SIZE - 1);

This is an open-coded offset_in_page().
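
i.e. just use the existing helper:

		offset = offset_in_page(pos);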

> +		offset = pos & (PAGE_SIZE - 1);

Same.

> +static int null_transfer(struct nullb *nullb, void *p,
> +	unsigned int len, bool is_write, loff_t pos,
>  	bool is_fua)

Maybe fix the non-standard indentation here if you touch it anyway?
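
E.g. something like this, just to illustrate the usual style:

static int null_transfer(struct nullb *nullb, void *p, unsigned int len,
		bool is_write, loff_t pos, bool is_fua)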

> +			memset(p, 0xff, len);
> +		flush_dcache_page(virt_to_page(p));
>  	} else {
> -		flush_dcache_page(page);
> -		err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
> +		flush_dcache_page(virt_to_page(p));
> +		err = copy_to_nullb(nullb, p, pos, len, is_fua);

virt_to_page does not work when kmap actually had to map, i.e. for
highmem.
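
The flush wants the actual struct page, so e.g. keep flushing via the bvec
(untested sketch, assuming the caller still has it around):

		flush_dcache_page(bvec.bv_page);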


>  	spin_lock_irq(&nullb->lock);
>  	rq_for_each_segment(bvec, rq, iter) {
> +		void *p = bvec_kmap_local(&bvec);
> +
>  		len = bvec.bv_len;
> +		if (len > nr_bytes)
> +			len = nr_bytes;
> +		err = null_transfer(nullb, p, nr_bytes, op_is_write(req_op(rq)),
> +				    pos, rq->cmd_flags & REQ_FUA);
> +		kunmap_local(p);

Any reason to not keep the kmap local to null_transfer (or even the low-level
operation below it) and pass the bvec to it?
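
Something like this, as an untested sketch:

static int null_transfer(struct nullb *nullb, struct bio_vec *bvec,
		bool is_write, loff_t pos, bool is_fua)
{
	void *p = bvec_kmap_local(bvec);
	...
	kunmap_local(p);
	return err;
}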


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] null_blk: allow byte aligned memory offsets
  2025-11-04  9:15 ` Hans Holmberg
@ 2025-11-05 18:47   ` Keith Busch
  0 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2025-11-05 18:47 UTC (permalink / raw)
  To: Hans Holmberg
  Cc: Keith Busch, linux-block@vger.kernel.org, hch, axboe@kernel.dk,
	dlemoal@kernel.org

On Tue, Nov 04, 2025 at 09:15:15AM +0000, Hans Holmberg wrote:
> I applied this on top of 6.18-rc4 and hit the following when attempting to
> create a zoned nullblk dev (the bash script for reproducing is below the crash):

Thanks for checking. I spotted the mistake in the code that caused the
potential buffer overrun. It's fixed up for the next version.

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH] null_blk: allow byte aligned memory offsets
  2025-11-04  1:48 ` Damien Le Moal
@ 2025-11-05 18:57   ` Keith Busch
  0 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2025-11-05 18:57 UTC (permalink / raw)
  To: Damien Le Moal; +Cc: Keith Busch, linux-block, hch, axboe, hans.holmberg

On Tue, Nov 04, 2025 at 10:48:25AM +0900, Damien Le Moal wrote:
> >  		if (valid_len) {
> > -			err = copy_from_nullb(nullb, page, off,
> > -				sector, valid_len);
> > -			off += valid_len;
> > +			err = copy_from_nullb(nullb, p, pos, valid_len);
> 
> Not your fault, but if this fails, we will still do the nullb_fill_pattern()
> below which I do not think is correct... ? May be we should have:
> 
> 			if (err)
> 				return err;
> 
> here ? But not sure if we should still call flush_dcache_page() even on error
> though.

It does look odd. copy_from_nullb() only ever returns success, though, so maybe
we should just drop the return value entirely.
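
i.e. (sketch):

static void copy_from_nullb(struct nullb *nullb, void *dest, loff_t pos,
			    size_t n)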

> > @@ -1276,25 +1275,26 @@ static blk_status_t null_handle_data_transfer(struct nullb_cmd *cmd,
> >  	struct nullb *nullb = cmd->nq->dev->nullb;
> >  	int err = 0;
> >  	unsigned int len;
> > -	sector_t sector = blk_rq_pos(rq);
> > -	unsigned int max_bytes = nr_sectors << SECTOR_SHIFT;
> > -	unsigned int transferred_bytes = 0;
> > +	loff_t pos = blk_rq_pos(rq) << SECTOR_SHIFT;
> > +	unsigned int nr_bytes = nr_sectors << SECTOR_SHIFT;
> 
> Overflow potential here ?

Should be okay: nr_sectors comes from blk_rq_sectors().

The same calculation already exists just above; I just changed the name.
Actually, I don't know why I changed it, so I'll leave it alone in the next
version.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2025-11-05 18:57 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2025-11-03 17:28 [PATCH] null_blk: allow byte aligned memory offsets Keith Busch
2025-11-04  1:48 ` Damien Le Moal
2025-11-05 18:57   ` Keith Busch
2025-11-04  9:15 ` Hans Holmberg
2025-11-05 18:47   ` Keith Busch
2025-11-04 11:24 ` Christoph Hellwig
