linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/5] enable bs > ps for block devices
@ 2024-05-10 10:29 hare
  2024-05-10 10:29 ` [PATCH 1/5] fs/mpage: use blocks_per_folio instead of blocks_per_page hare
                   ` (6 more replies)
  0 siblings, 7 replies; 35+ messages in thread
From: hare @ 2024-05-10 10:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block, Hannes Reinecke

From: Hannes Reinecke <hare@kernel.org>

Hi all,

based on the patch series from Pankaj '[PATCH v5 00/11] enable bs > ps in XFS'
it's now quite simple to enable support for block devices with block sizes
larger than page size even without having to disable CONFIG_BUFFER_HEAD.
The patchset really is just two rather trivial patches to fs/mpage,
and two patches to remove hardcoded restrictions on the block size.

As usual, comments and reviews are welcome.

Hannes Reinecke (4):
  fs/mpage: use blocks_per_folio instead of blocks_per_page
  fs/mpage: avoid negative shift for large blocksize
  block/bdev: lift restrictions on supported blocksize
  block/bdev: enable large folio support for large logical block sizes

Pankaj Raghav (1):
  nvme: enable logical block size > PAGE_SIZE

 block/bdev.c             | 10 ++++++---
 drivers/nvme/host/core.c |  7 +++----
 fs/mpage.c               | 44 ++++++++++++++++++++--------------------
 3 files changed, 32 insertions(+), 29 deletions(-)

-- 
2.35.3


^ permalink raw reply	[flat|nested] 35+ messages in thread

* [PATCH 1/5] fs/mpage: use blocks_per_folio instead of blocks_per_page
  2024-05-10 10:29 [PATCH 0/5] enable bs > ps for block devices hare
@ 2024-05-10 10:29 ` hare
  2024-05-10 22:19   ` Matthew Wilcox
  2024-05-10 10:29 ` [PATCH 2/5] fs/mpage: avoid negative shift for large blocksize hare
                   ` (5 subsequent siblings)
  6 siblings, 1 reply; 35+ messages in thread
From: hare @ 2024-05-10 10:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block, Hannes Reinecke

From: Hannes Reinecke <hare@suse.de>

Convert mpage to folios and associate the number of blocks with
a folio and not a page.

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 fs/mpage.c | 40 ++++++++++++++++++++--------------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/fs/mpage.c b/fs/mpage.c
index fa8b99a199fa..379a71475c42 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -114,7 +114,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
 		 * don't make any buffers if there is only one buffer on
 		 * the folio and the folio just needs to be set up to date
 		 */
-		if (inode->i_blkbits == PAGE_SHIFT &&
+		if (inode->i_blkbits == folio_shift(folio) &&
 		    buffer_uptodate(bh)) {
 			folio_mark_uptodate(folio);
 			return;
@@ -160,7 +160,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 	struct folio *folio = args->folio;
 	struct inode *inode = folio->mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
-	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+	const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
 	const unsigned blocksize = 1 << blkbits;
 	struct buffer_head *map_bh = &args->map_bh;
 	sector_t block_in_file;
@@ -168,7 +168,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 	sector_t last_block_in_file;
 	sector_t first_block;
 	unsigned page_block;
-	unsigned first_hole = blocks_per_page;
+	unsigned first_hole = blocks_per_folio;
 	struct block_device *bdev = NULL;
 	int length;
 	int fully_mapped = 1;
@@ -189,7 +189,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 		goto confused;
 
 	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
-	last_block = block_in_file + args->nr_pages * blocks_per_page;
+	last_block = block_in_file + args->nr_pages * blocks_per_folio;
 	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
 	if (last_block > last_block_in_file)
 		last_block = last_block_in_file;
@@ -211,7 +211,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 				clear_buffer_mapped(map_bh);
 				break;
 			}
-			if (page_block == blocks_per_page)
+			if (page_block == blocks_per_folio)
 				break;
 			page_block++;
 			block_in_file++;
@@ -223,7 +223,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 	 * Then do more get_blocks calls until we are done with this folio.
 	 */
 	map_bh->b_folio = folio;
-	while (page_block < blocks_per_page) {
+	while (page_block < blocks_per_folio) {
 		map_bh->b_state = 0;
 		map_bh->b_size = 0;
 
@@ -236,7 +236,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 
 		if (!buffer_mapped(map_bh)) {
 			fully_mapped = 0;
-			if (first_hole == blocks_per_page)
+			if (first_hole == blocks_per_folio)
 				first_hole = page_block;
 			page_block++;
 			block_in_file++;
@@ -254,7 +254,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 			goto confused;
 		}
 	
-		if (first_hole != blocks_per_page)
+		if (first_hole != blocks_per_folio)
 			goto confused;		/* hole -> non-hole */
 
 		/* Contiguous blocks? */
@@ -267,7 +267,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 			if (relative_block == nblocks) {
 				clear_buffer_mapped(map_bh);
 				break;
-			} else if (page_block == blocks_per_page)
+			} else if (page_block == blocks_per_folio)
 				break;
 			page_block++;
 			block_in_file++;
@@ -275,7 +275,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 		bdev = map_bh->b_bdev;
 	}
 
-	if (first_hole != blocks_per_page) {
+	if (first_hole != blocks_per_folio) {
 		folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
 		if (first_hole == 0) {
 			folio_mark_uptodate(folio);
@@ -310,10 +310,10 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 	relative_block = block_in_file - args->first_logical_block;
 	nblocks = map_bh->b_size >> blkbits;
 	if ((buffer_boundary(map_bh) && relative_block == nblocks) ||
-	    (first_hole != blocks_per_page))
+	    (first_hole != blocks_per_folio))
 		args->bio = mpage_bio_submit_read(args->bio);
 	else
-		args->last_block_in_bio = first_block + blocks_per_page - 1;
+		args->last_block_in_bio = first_block + blocks_per_folio - 1;
 out:
 	return args->bio;
 
@@ -392,7 +392,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block)
 {
 	struct mpage_readpage_args args = {
 		.folio = folio,
-		.nr_pages = 1,
+		.nr_pages = folio_nr_pages(folio),
 		.get_block = get_block,
 	};
 
@@ -463,12 +463,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
 	struct address_space *mapping = folio->mapping;
 	struct inode *inode = mapping->host;
 	const unsigned blkbits = inode->i_blkbits;
-	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
+	const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
 	sector_t last_block;
 	sector_t block_in_file;
 	sector_t first_block;
 	unsigned page_block;
-	unsigned first_unmapped = blocks_per_page;
+	unsigned first_unmapped = blocks_per_folio;
 	struct block_device *bdev = NULL;
 	int boundary = 0;
 	sector_t boundary_block = 0;
@@ -493,12 +493,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
 				 */
 				if (buffer_dirty(bh))
 					goto confused;
-				if (first_unmapped == blocks_per_page)
+				if (first_unmapped == blocks_per_folio)
 					first_unmapped = page_block;
 				continue;
 			}
 
-			if (first_unmapped != blocks_per_page)
+			if (first_unmapped != blocks_per_folio)
 				goto confused;	/* hole -> non-hole */
 
 			if (!buffer_dirty(bh) || !buffer_uptodate(bh))
@@ -543,7 +543,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
 		goto page_is_mapped;
 	last_block = (i_size - 1) >> blkbits;
 	map_bh.b_folio = folio;
-	for (page_block = 0; page_block < blocks_per_page; ) {
+	for (page_block = 0; page_block < blocks_per_folio; ) {
 
 		map_bh.b_state = 0;
 		map_bh.b_size = 1 << blkbits;
@@ -625,14 +625,14 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
 	BUG_ON(folio_test_writeback(folio));
 	folio_start_writeback(folio);
 	folio_unlock(folio);
-	if (boundary || (first_unmapped != blocks_per_page)) {
+	if (boundary || (first_unmapped != blocks_per_folio)) {
 		bio = mpage_bio_submit_write(bio);
 		if (boundary_block) {
 			write_boundary_block(boundary_bdev,
 					boundary_block, 1 << blkbits);
 		}
 	} else {
-		mpd->last_block_in_bio = first_block + blocks_per_page - 1;
+		mpd->last_block_in_bio = first_block + blocks_per_folio - 1;
 	}
 	goto out;
 
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 2/5] fs/mpage: avoid negative shift for large blocksize
  2024-05-10 10:29 [PATCH 0/5] enable bs > ps for block devices hare
  2024-05-10 10:29 ` [PATCH 1/5] fs/mpage: use blocks_per_folio instead of blocks_per_page hare
@ 2024-05-10 10:29 ` hare
  2024-05-10 22:24   ` Matthew Wilcox
  2024-05-10 10:29 ` [PATCH 3/5] block/bdev: lift restrictions on supported blocksize hare
                   ` (4 subsequent siblings)
  6 siblings, 1 reply; 35+ messages in thread
From: hare @ 2024-05-10 10:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block, Hannes Reinecke

From: Hannes Reinecke <hare@kernel.org>

For large blocksizes the number of block bits is larger than PAGE_SHIFT,
so the shift 'PAGE_SHIFT - blkbits' would become negative. Shift the byte
offset up by PAGE_SHIFT first and then down by blkbits instead to
calculate the sector number from the page cache index.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 fs/mpage.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/mpage.c b/fs/mpage.c
index 379a71475c42..e3732686e65f 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -188,7 +188,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 	if (folio_buffers(folio))
 		goto confused;
 
-	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
+	block_in_file = (sector_t)(((loff_t)folio->index << PAGE_SHIFT) >> blkbits);
 	last_block = block_in_file + args->nr_pages * blocks_per_folio;
 	last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
 	if (last_block > last_block_in_file)
@@ -534,7 +534,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc,
 	 * The page has no buffers: map it to disk
 	 */
 	BUG_ON(!folio_test_uptodate(folio));
-	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
+	block_in_file = (sector_t)(((loff_t)folio->index << PAGE_SHIFT) >> blkbits);
 	/*
 	 * Whole page beyond EOF? Skip allocating blocks to avoid leaking
 	 * space.
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 3/5] block/bdev: lift restrictions on supported blocksize
  2024-05-10 10:29 [PATCH 0/5] enable bs > ps for block devices hare
  2024-05-10 10:29 ` [PATCH 1/5] fs/mpage: use blocks_per_folio instead of blocks_per_page hare
  2024-05-10 10:29 ` [PATCH 2/5] fs/mpage: avoid negative shift for large blocksize hare
@ 2024-05-10 10:29 ` hare
  2024-05-10 22:25   ` Matthew Wilcox
  2024-05-11 22:54   ` Luis Chamberlain
  2024-05-10 10:29 ` [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes hare
                   ` (3 subsequent siblings)
  6 siblings, 2 replies; 35+ messages in thread
From: hare @ 2024-05-10 10:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block, Hannes Reinecke

From: Hannes Reinecke <hare@suse.de>

We now can support blocksizes larger than PAGE_SIZE, so lift
the restriction.

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 block/bdev.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index b8e32d933a63..d37aa51b99ed 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -146,8 +146,8 @@ static void set_init_blocksize(struct block_device *bdev)
 
 int set_blocksize(struct block_device *bdev, int size)
 {
-	/* Size must be a power of two, and between 512 and PAGE_SIZE */
-	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
+	/* Size must be a power of two, and larger than 512 */
+	if (size < 512 || !is_power_of_2(size))
 		return -EINVAL;
 
 	/* Size cannot be smaller than the size supported by the device */
@@ -170,7 +170,7 @@ int sb_set_blocksize(struct super_block *sb, int size)
 	if (set_blocksize(sb->s_bdev, size))
 		return 0;
 	/* If we get here, we know size is power of two
-	 * and it's value is between 512 and PAGE_SIZE */
+	 * and it's value is larger than 512 */
 	sb->s_blocksize = size;
 	sb->s_blocksize_bits = blksize_bits(size);
 	return sb->s_blocksize;
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes
  2024-05-10 10:29 [PATCH 0/5] enable bs > ps for block devices hare
                   ` (2 preceding siblings ...)
  2024-05-10 10:29 ` [PATCH 3/5] block/bdev: lift restrictions on supported blocksize hare
@ 2024-05-10 10:29 ` hare
  2024-05-10 22:35   ` Matthew Wilcox
                     ` (2 more replies)
  2024-05-10 10:29 ` [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE hare
                   ` (2 subsequent siblings)
  6 siblings, 3 replies; 35+ messages in thread
From: hare @ 2024-05-10 10:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block, Hannes Reinecke

From: Hannes Reinecke <hare@suse.de>

Call mapping_set_folio_min_order() when modifying the logical block
size to ensure folios are allocated with the correct size.

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 block/bdev.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/block/bdev.c b/block/bdev.c
index d37aa51b99ed..a0c0ecc886e8 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -142,6 +142,8 @@ static void set_init_blocksize(struct block_device *bdev)
 		bsize <<= 1;
 	}
 	bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+	mapping_set_folio_min_order(bdev->bd_inode->i_mapping,
+				    get_order(bsize));
 }
 
 int set_blocksize(struct block_device *bdev, int size)
@@ -158,6 +160,8 @@ int set_blocksize(struct block_device *bdev, int size)
 	if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
 		sync_blockdev(bdev);
 		bdev->bd_inode->i_blkbits = blksize_bits(size);
+		mapping_set_folio_min_order(bdev->bd_inode->i_mapping,
+					    get_order(size));
 		kill_bdev(bdev);
 	}
 	return 0;
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-10 10:29 [PATCH 0/5] enable bs > ps for block devices hare
                   ` (3 preceding siblings ...)
  2024-05-10 10:29 ` [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes hare
@ 2024-05-10 10:29 ` hare
  2024-05-10 22:37   ` Matthew Wilcox
  2024-05-11 23:09   ` Luis Chamberlain
  2024-05-11 23:12 ` [PATCH 0/5] enable bs > ps for block devices Luis Chamberlain
  2024-05-30 13:37 ` Pankaj Raghav (Samsung)
  6 siblings, 2 replies; 35+ messages in thread
From: hare @ 2024-05-10 10:29 UTC (permalink / raw)
  To: Andrew Morton
  Cc: Matthew Wilcox, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block, Pankaj Raghav, Hannes Reinecke

From: Pankaj Raghav <p.raghav@samsung.com>

Don't set the capacity to zero when the logical block size > PAGE_SIZE,
as block devices with iomap aops now support allocating the block cache
with a minimum folio order.

Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 drivers/nvme/host/core.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 828c77fa13b7..5f1308daa74f 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1963,11 +1963,10 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
 	bool valid = true;
 
 	/*
-	 * The block layer can't support LBA sizes larger than the page size
-	 * or smaller than a sector size yet, so catch this early and don't
-	 * allow block I/O.
+	 * The block layer can't support LBA sizes smaller than a sector size,
+	 * so catch this early and don't allow block I/O.
 	 */
-	if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
+	if (head->lba_shift < SECTOR_SHIFT) {
 		bs = (1 << 9);
 		valid = false;
 	}
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/5] fs/mpage: use blocks_per_folio instead of blocks_per_page
  2024-05-10 10:29 ` [PATCH 1/5] fs/mpage: use blocks_per_folio instead of blocks_per_page hare
@ 2024-05-10 22:19   ` Matthew Wilcox
  2024-05-11  9:12     ` Hannes Reinecke
  0 siblings, 1 reply; 35+ messages in thread
From: Matthew Wilcox @ 2024-05-10 22:19 UTC (permalink / raw)
  To: hare
  Cc: Andrew Morton, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block, Hannes Reinecke

On Fri, May 10, 2024 at 12:29:02PM +0200, hare@kernel.org wrote:
> +++ b/fs/mpage.c
> @@ -114,7 +114,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
>  		 * don't make any buffers if there is only one buffer on
>  		 * the folio and the folio just needs to be set up to date
>  		 */
> -		if (inode->i_blkbits == PAGE_SHIFT &&
> +		if (inode->i_blkbits == folio_shift(folio) &&

yes

> @@ -160,7 +160,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
>  	struct folio *folio = args->folio;
>  	struct inode *inode = folio->mapping->host;
>  	const unsigned blkbits = inode->i_blkbits;
> -	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
> +	const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
>  	const unsigned blocksize = 1 << blkbits;
>  	struct buffer_head *map_bh = &args->map_bh;
>  	sector_t block_in_file;
> @@ -168,7 +168,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
>  	sector_t last_block_in_file;
>  	sector_t first_block;
>  	unsigned page_block;
> -	unsigned first_hole = blocks_per_page;
> +	unsigned first_hole = blocks_per_folio;

yes

> @@ -189,7 +189,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
>  		goto confused;
>  
>  	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
> -	last_block = block_in_file + args->nr_pages * blocks_per_page;
> +	last_block = block_in_file + args->nr_pages * blocks_per_folio;

no.  args->nr_pages really is the number of pages, so last_block is
block_in_file + nr_pages * blocks_per_page.  except that blocks_per_page
might now be 0.

so i think this needs to be rewritten as:

	last_block = block_in_file + (args->nr_pages * PAGE_SIZE) >> blkbits;

or have i confused myself?

> @@ -275,7 +275,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
>  		bdev = map_bh->b_bdev;
>  	}
>  
> -	if (first_hole != blocks_per_page) {
> +	if (first_hole != blocks_per_folio) {
>  		folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);

... doesn't that need to be folio_size(folio)?

there may be other problems, but let's settle these questions first.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/5] fs/mpage: avoid negative shift for large blocksize
  2024-05-10 10:29 ` [PATCH 2/5] fs/mpage: avoid negative shift for large blocksize hare
@ 2024-05-10 22:24   ` Matthew Wilcox
  2024-05-11  9:13     ` Hannes Reinecke
  0 siblings, 1 reply; 35+ messages in thread
From: Matthew Wilcox @ 2024-05-10 22:24 UTC (permalink / raw)
  To: hare
  Cc: Andrew Morton, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block

On Fri, May 10, 2024 at 12:29:03PM +0200, hare@kernel.org wrote:
> -	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
> +	block_in_file = (sector_t)(((loff_t)folio->index << PAGE_SHIFT) >> blkbits);

	block_in_file = folio_pos(folio) >> blkbits;


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/5] block/bdev: lift restrictions on supported blocksize
  2024-05-10 10:29 ` [PATCH 3/5] block/bdev: lift restrictions on supported blocksize hare
@ 2024-05-10 22:25   ` Matthew Wilcox
  2024-05-11  9:15     ` Hannes Reinecke
  2024-05-11 22:57     ` Luis Chamberlain
  2024-05-11 22:54   ` Luis Chamberlain
  1 sibling, 2 replies; 35+ messages in thread
From: Matthew Wilcox @ 2024-05-10 22:25 UTC (permalink / raw)
  To: hare
  Cc: Andrew Morton, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block, Hannes Reinecke

On Fri, May 10, 2024 at 12:29:04PM +0200, hare@kernel.org wrote:
> From: Hannes Reinecke <hare@suse.de>
> 
> We now can support blocksizes larger than PAGE_SIZE, so lift
> the restriction.

Do we need to make this conditional on a CONFIG option?  Not all
architectures support THP yet ... which means they don't support
large folios.  We should fix that, but it is well below the cutoff line
on my todo list.

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes
  2024-05-10 10:29 ` [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes hare
@ 2024-05-10 22:35   ` Matthew Wilcox
  2024-05-11  9:18     ` Hannes Reinecke
  2024-05-11  5:07   ` kernel test robot
  2024-05-11  7:04   ` kernel test robot
  2 siblings, 1 reply; 35+ messages in thread
From: Matthew Wilcox @ 2024-05-10 22:35 UTC (permalink / raw)
  To: hare
  Cc: Andrew Morton, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block, Hannes Reinecke

On Fri, May 10, 2024 at 12:29:05PM +0200, hare@kernel.org wrote:
> From: Hannes Reinecke <hare@suse.de>
> 
> Call mapping_set_folio_min_order() when modifying the logical block
> size to ensure folios are allocated with the correct size.

This makes me nervous.  It lets the pagecache allocate folios larger
than min_order (all the way up to PMD_ORDER).  Filesystems may not
cope well with seeing tail pages.

I'd _like_ to be able to do this.  But I think for now we need to
call mapping_set_folio_order_range(mapping, order, order);


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-10 10:29 ` [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE hare
@ 2024-05-10 22:37   ` Matthew Wilcox
  2024-05-11  9:18     ` Hannes Reinecke
  2024-05-11 23:11     ` Luis Chamberlain
  2024-05-11 23:09   ` Luis Chamberlain
  1 sibling, 2 replies; 35+ messages in thread
From: Matthew Wilcox @ 2024-05-10 22:37 UTC (permalink / raw)
  To: hare
  Cc: Andrew Morton, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block, Pankaj Raghav

On Fri, May 10, 2024 at 12:29:06PM +0200, hare@kernel.org wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
> 
> Don't set the capacity to zero for when logical block size > PAGE_SIZE
> as the block device with iomap aops support allocating block cache with
> a minimum folio order.

It feels like this should be something the block layer does rather than
something an individual block driver does.  Is there similar code to
rip out of sd.c?  Or other block drivers?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes
  2024-05-10 10:29 ` [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes hare
  2024-05-10 22:35   ` Matthew Wilcox
@ 2024-05-11  5:07   ` kernel test robot
  2024-05-11  7:04   ` kernel test robot
  2 siblings, 0 replies; 35+ messages in thread
From: kernel test robot @ 2024-05-11  5:07 UTC (permalink / raw)
  To: hare, Andrew Morton
  Cc: oe-kbuild-all, Linux Memory Management List, Matthew Wilcox,
	Pankaj Raghav, Luis Chamberlain, linux-nvme, linux-block,
	Hannes Reinecke

Hi,

kernel test robot noticed the following build errors:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on akpm-mm/mm-everything linus/master v6.9-rc7]
[cannot apply to next-20240510]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/hare-kernel-org/fs-mpage-use-blocks_per_folio-instead-of-blocks_per_page/20240510-183146
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20240510102906.51844-5-hare%40kernel.org
patch subject: [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes
config: arc-allnoconfig (https://download.01.org/0day-ci/archive/20240511/202405111234.wXN5f0fB-lkp@intel.com/config)
compiler: arc-elf-gcc (GCC) 13.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240511/202405111234.wXN5f0fB-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202405111234.wXN5f0fB-lkp@intel.com/

All errors (new ones prefixed by >>):

   block/bdev.c: In function 'set_init_blocksize':
>> block/bdev.c:145:9: error: implicit declaration of function 'mapping_set_folio_min_order' [-Werror=implicit-function-declaration]
     145 |         mapping_set_folio_min_order(bdev->bd_inode->i_mapping,
         |         ^~~~~~~~~~~~~~~~~~~~~~~~~~~
   cc1: some warnings being treated as errors


vim +/mapping_set_folio_min_order +145 block/bdev.c

   133	
   134	static void set_init_blocksize(struct block_device *bdev)
   135	{
   136		unsigned int bsize = bdev_logical_block_size(bdev);
   137		loff_t size = i_size_read(bdev->bd_inode);
   138	
   139		while (bsize < PAGE_SIZE) {
   140			if (size & bsize)
   141				break;
   142			bsize <<= 1;
   143		}
   144		bdev->bd_inode->i_blkbits = blksize_bits(bsize);
 > 145		mapping_set_folio_min_order(bdev->bd_inode->i_mapping,
   146					    get_order(bsize));
   147	}
   148	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes
  2024-05-10 10:29 ` [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes hare
  2024-05-10 22:35   ` Matthew Wilcox
  2024-05-11  5:07   ` kernel test robot
@ 2024-05-11  7:04   ` kernel test robot
  2 siblings, 0 replies; 35+ messages in thread
From: kernel test robot @ 2024-05-11  7:04 UTC (permalink / raw)
  To: hare, Andrew Morton
  Cc: llvm, oe-kbuild-all, Linux Memory Management List, Matthew Wilcox,
	Pankaj Raghav, Luis Chamberlain, linux-nvme, linux-block,
	Hannes Reinecke

Hi,

kernel test robot noticed the following build errors:

[auto build test ERROR on axboe-block/for-next]
[also build test ERROR on akpm-mm/mm-everything linus/master v6.9-rc7]
[cannot apply to next-20240510]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/hare-kernel-org/fs-mpage-use-blocks_per_folio-instead-of-blocks_per_page/20240510-183146
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
patch link:    https://lore.kernel.org/r/20240510102906.51844-5-hare%40kernel.org
patch subject: [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes
config: x86_64-rhel-8.3-rust (https://download.01.org/0day-ci/archive/20240511/202405111438.WaVytRae-lkp@intel.com/config)
compiler: clang version 18.1.5 (https://github.com/llvm/llvm-project 617a15a9eac96088ae5e9134248d8236e34b91b1)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20240511/202405111438.WaVytRae-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202405111438.WaVytRae-lkp@intel.com/

All errors (new ones prefixed by >>):

>> block/bdev.c:145:2: error: call to undeclared function 'mapping_set_folio_min_order'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     145 |         mapping_set_folio_min_order(bdev->bd_inode->i_mapping,
         |         ^
   block/bdev.c:163:3: error: call to undeclared function 'mapping_set_folio_min_order'; ISO C99 and later do not support implicit function declarations [-Wimplicit-function-declaration]
     163 |                 mapping_set_folio_min_order(bdev->bd_inode->i_mapping,
         |                 ^
   2 errors generated.


vim +/mapping_set_folio_min_order +145 block/bdev.c

   133	
   134	static void set_init_blocksize(struct block_device *bdev)
   135	{
   136		unsigned int bsize = bdev_logical_block_size(bdev);
   137		loff_t size = i_size_read(bdev->bd_inode);
   138	
   139		while (bsize < PAGE_SIZE) {
   140			if (size & bsize)
   141				break;
   142			bsize <<= 1;
   143		}
   144		bdev->bd_inode->i_blkbits = blksize_bits(bsize);
 > 145		mapping_set_folio_min_order(bdev->bd_inode->i_mapping,
   146					    get_order(bsize));
   147	}
   148	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/5] fs/mpage: use blocks_per_folio instead of blocks_per_page
  2024-05-10 22:19   ` Matthew Wilcox
@ 2024-05-11  9:12     ` Hannes Reinecke
  0 siblings, 0 replies; 35+ messages in thread
From: Hannes Reinecke @ 2024-05-11  9:12 UTC (permalink / raw)
  To: Matthew Wilcox, hare
  Cc: Andrew Morton, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block

On 5/11/24 00:19, Matthew Wilcox wrote:
> On Fri, May 10, 2024 at 12:29:02PM +0200, hare@kernel.org wrote:
>> +++ b/fs/mpage.c
>> @@ -114,7 +114,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh,
>>   		 * don't make any buffers if there is only one buffer on
>>   		 * the folio and the folio just needs to be set up to date
>>   		 */
>> -		if (inode->i_blkbits == PAGE_SHIFT &&
>> +		if (inode->i_blkbits == folio_shift(folio) &&
> 
> yes
> 
>> @@ -160,7 +160,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
>>   	struct folio *folio = args->folio;
>>   	struct inode *inode = folio->mapping->host;
>>   	const unsigned blkbits = inode->i_blkbits;
>> -	const unsigned blocks_per_page = PAGE_SIZE >> blkbits;
>> +	const unsigned blocks_per_folio = folio_size(folio) >> blkbits;
>>   	const unsigned blocksize = 1 << blkbits;
>>   	struct buffer_head *map_bh = &args->map_bh;
>>   	sector_t block_in_file;
>> @@ -168,7 +168,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
>>   	sector_t last_block_in_file;
>>   	sector_t first_block;
>>   	unsigned page_block;
>> -	unsigned first_hole = blocks_per_page;
>> +	unsigned first_hole = blocks_per_folio;
> 
> yes
> 
>> @@ -189,7 +189,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
>>   		goto confused;
>>   
>>   	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
>> -	last_block = block_in_file + args->nr_pages * blocks_per_page;
>> +	last_block = block_in_file + args->nr_pages * blocks_per_folio;
> 
> no.  args->nr_pages really is the number of pages, so last_block is
> block_in_file + nr_pages * blocks_per_page.  except that blocks_per_page
> might now be 0.
> 
> so i think this needs to be rewritten as:
> 
> 	last_block = block_in_file + (args->nr_pages * PAGE_SIZE) >> blkbits;
> 
> or have i confused myself?
> 
Aw. No. Missed that line.

Will be fixing it up.


>> @@ -275,7 +275,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
>>   		bdev = map_bh->b_bdev;
>>   	}
>>   
>> -	if (first_hole != blocks_per_page) {
>> +	if (first_hole != blocks_per_folio) {
>>   		folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE);
> 
> ... doesn't that need to be folio_size(folio)?
> 
> there may be other problems, but let's settle these questions first.
> 

Thanks for the review.
I'll be sure to pester you next week :-)

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 2/5] fs/mpage: avoid negative shift for large blocksize
  2024-05-10 22:24   ` Matthew Wilcox
@ 2024-05-11  9:13     ` Hannes Reinecke
  0 siblings, 0 replies; 35+ messages in thread
From: Hannes Reinecke @ 2024-05-11  9:13 UTC (permalink / raw)
  To: Matthew Wilcox, hare
  Cc: Andrew Morton, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block

On 5/11/24 00:24, Matthew Wilcox wrote:
> On Fri, May 10, 2024 at 12:29:03PM +0200, hare@kernel.org wrote:
>> -	block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits);
>> +	block_in_file = (sector_t)(((loff_t)folio->index << PAGE_SHIFT) >> blkbits);
> 
> 	block_in_file = folio_pos(folio) >> blkbits;
> 
> 
I know there was a better way. Will be fixing it up.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/5] block/bdev: lift restrictions on supported blocksize
  2024-05-10 22:25   ` Matthew Wilcox
@ 2024-05-11  9:15     ` Hannes Reinecke
  2024-05-11 22:57     ` Luis Chamberlain
  1 sibling, 0 replies; 35+ messages in thread
From: Hannes Reinecke @ 2024-05-11  9:15 UTC (permalink / raw)
  To: Matthew Wilcox, hare
  Cc: Andrew Morton, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block

On 5/11/24 00:25, Matthew Wilcox wrote:
> On Fri, May 10, 2024 at 12:29:04PM +0200, hare@kernel.org wrote:
>> From: Hannes Reinecke <hare@suse.de>
>>
>> We now can support blocksizes larger than PAGE_SIZE, so lift
>> the restriction.
> 
> Do we need to make this conditional on a CONFIG option?  Not all
> architectures support THP yet ... which means they don't support
> large folios.  We should fix that, but it is well below the cutoff line
> on my todo list.

Well. We will only ever trigger this if we _really_ encounter a block 
device with a block size larger than PAGE_SIZE.
Of which there currently are none (note to self: check NFS).

And the overall idea is that the logical blocksize is mandatory, ie the 
device _cannot_ handle requests with shorter blocksizes.
We might be setting min_order == max_order, but again this can be 
discussed once we get min_order working :-)

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes
  2024-05-10 22:35   ` Matthew Wilcox
@ 2024-05-11  9:18     ` Hannes Reinecke
  0 siblings, 0 replies; 35+ messages in thread
From: Hannes Reinecke @ 2024-05-11  9:18 UTC (permalink / raw)
  To: Matthew Wilcox, hare
  Cc: Andrew Morton, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block

On 5/11/24 00:35, Matthew Wilcox wrote:
> On Fri, May 10, 2024 at 12:29:05PM +0200, hare@kernel.org wrote:
>> From: Hannes Reinecke <hare@suse.de>
>>
>> Call mapping_set_folio_min_order() when modifying the logical block
>> size to ensure folios are allocated with the correct size.
> 
> This makes me nervous.  It lets the pagecache allocate folios larger
> than min_order (all the way up to PMD_ORDER).  Filesystems may not
> cope well with seeing tail pages.
> 
> I'd _like_ to be able to do this.  But I think for now we need to
> call mapping_set_folio_order_range(mapping, order, order);
> 
(Misrouted reply to the previous mail; airports make me jumpy ...)

The whole point is that these devices cannot support smaller 
allocations. We might restrict ourselves to min_order == max_order,
but we should be focussing on getting min_order working first ...

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-10 22:37   ` Matthew Wilcox
@ 2024-05-11  9:18     ` Hannes Reinecke
  2024-05-11 23:11     ` Luis Chamberlain
  1 sibling, 0 replies; 35+ messages in thread
From: Hannes Reinecke @ 2024-05-11  9:18 UTC (permalink / raw)
  To: Matthew Wilcox, hare
  Cc: Andrew Morton, Pankaj Raghav, Luis Chamberlain, linux-nvme,
	linux-block, Pankaj Raghav

On 5/11/24 00:37, Matthew Wilcox wrote:
> On Fri, May 10, 2024 at 12:29:06PM +0200, hare@kernel.org wrote:
>> From: Pankaj Raghav <p.raghav@samsung.com>
>>
>> Don't set the capacity to zero for when logical block size > PAGE_SIZE
>> as the block device with iomap aops support allocating block cache with
>> a minimum folio order.
> 
> It feels like this should be something the block layer does rather than
> something an individual block driver does.  Is there similar code to
> rip out of sd.c?  Or other block drivers?

Correct, this restriction should never have been there. I'll be checking 
sd.c, too.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/5] block/bdev: lift restrictions on supported blocksize
  2024-05-10 10:29 ` [PATCH 3/5] block/bdev: lift restrictions on supported blocksize hare
  2024-05-10 22:25   ` Matthew Wilcox
@ 2024-05-11 22:54   ` Luis Chamberlain
  1 sibling, 0 replies; 35+ messages in thread
From: Luis Chamberlain @ 2024-05-11 22:54 UTC (permalink / raw)
  To: hare
  Cc: Andrew Morton, Matthew Wilcox, Pankaj Raghav, linux-nvme,
	linux-block, Hannes Reinecke

On Fri, May 10, 2024 at 12:29:04PM +0200, hare@kernel.org wrote:
> From: Hannes Reinecke <hare@suse.de>
> 
> We now can support blocksizes larger than PAGE_SIZE, so lift
> the restriction.
> 
> Signed-off-by: Hannes Reinecke <hare@suse.de>
> ---
>  block/bdev.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/block/bdev.c b/block/bdev.c
> index b8e32d933a63..d37aa51b99ed 100644
> --- a/block/bdev.c
> +++ b/block/bdev.c
> @@ -146,8 +146,8 @@ static void set_init_blocksize(struct block_device *bdev)
>  
>  int set_blocksize(struct block_device *bdev, int size)
>  {
> -	/* Size must be a power of two, and between 512 and PAGE_SIZE */
> -	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
> +	/* Size must be a power of two, and larger than 512 */
> +	if (size < 512 || !is_power_of_2(size))

That needs to just be replaced with:

1 << (PAGE_SHIFT + MAX_PAGECACHE_ORDER)

We're not magically shooting to the sky all of a sudden. The large folio
world still has a limit. While that's 2 MiB today, yes, we can go higher
later, but even then, we still have a cap.

  Luis

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 3/5] block/bdev: lift restrictions on supported blocksize
  2024-05-10 22:25   ` Matthew Wilcox
  2024-05-11  9:15     ` Hannes Reinecke
@ 2024-05-11 22:57     ` Luis Chamberlain
  1 sibling, 0 replies; 35+ messages in thread
From: Luis Chamberlain @ 2024-05-11 22:57 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: hare, Andrew Morton, Pankaj Raghav, linux-nvme, linux-block,
	Hannes Reinecke

On Fri, May 10, 2024 at 11:25:59PM +0100, Matthew Wilcox wrote:
> Not all architectures support THP yet

I noted to Hannes the existing page cache limit, should we have a helper
which also then uses CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE to fallback
to PAGE_SIZE otherwise ?

 Luis

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-10 10:29 ` [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE hare
  2024-05-10 22:37   ` Matthew Wilcox
@ 2024-05-11 23:09   ` Luis Chamberlain
  2024-05-11 23:30     ` Matthew Wilcox
  1 sibling, 1 reply; 35+ messages in thread
From: Luis Chamberlain @ 2024-05-11 23:09 UTC (permalink / raw)
  To: hare
  Cc: Andrew Morton, Matthew Wilcox, Pankaj Raghav, linux-nvme,
	linux-block, Pankaj Raghav

On Fri, May 10, 2024 at 12:29:06PM +0200, hare@kernel.org wrote:
> From: Pankaj Raghav <p.raghav@samsung.com>
> 
> Don't set the capacity to zero for when logical block size > PAGE_SIZE
> as the block device with iomap aops support allocating block cache with
> a minimum folio order.
> 
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> Signed-off-by: Hannes Reinecke <hare@kernel.org>
> ---
>  drivers/nvme/host/core.c | 7 +++----
>  1 file changed, 3 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 828c77fa13b7..5f1308daa74f 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -1963,11 +1963,10 @@ static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
>  	bool valid = true;
>  
>  	/*
> -	 * The block layer can't support LBA sizes larger than the page size
> -	 * or smaller than a sector size yet, so catch this early and don't
> -	 * allow block I/O.
> +	 * The block layer can't support LBA sizes smaller than a sector size,
> +	 * so catch this early and don't allow block I/O.
>  	 */
> -	if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
> +	if (head->lba_shift < SECTOR_SHIFT) {

We can't just do this, we need to consider the actual nvme cap (test it,
and if it crashes and below what the page cache supports, then we have
to go below) and so to make the enablement easier. So we could just move
this to helper [0]. Then when the bdev cache patch goes through the
check for CONFIG_BUFFER_HEAD can be removed, if this goes first.

We crash if we go above 1 MiB today, we should be able to go up to 2
MiB but that requires some review to see what stupid thing is getting
in the way.

[0] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git/commit/?h=20240408-lbs-scsi-kludge&id=1f7f4dce548cc11872e977939a872b107c68ad53

  Luis

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-10 22:37   ` Matthew Wilcox
  2024-05-11  9:18     ` Hannes Reinecke
@ 2024-05-11 23:11     ` Luis Chamberlain
  1 sibling, 0 replies; 35+ messages in thread
From: Luis Chamberlain @ 2024-05-11 23:11 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: hare, Andrew Morton, Pankaj Raghav, linux-nvme, linux-block,
	Pankaj Raghav

On Fri, May 10, 2024 at 11:37:22PM +0100, Matthew Wilcox wrote:
> On Fri, May 10, 2024 at 12:29:06PM +0200, hare@kernel.org wrote:
> > From: Pankaj Raghav <p.raghav@samsung.com>
> > 
> > Don't set the capacity to zero for when logical block size > PAGE_SIZE
> > as the block device with iomap aops support allocating block cache with
> > a minimum folio order.
> 
> It feels like this should be something the block layer does rather than
> something an individual block driver does.  Is there similar code to
> rip out of sd.c?  Or other block drivers?

Each block driver must be tested and set a max if it's below what the page
cache supports.

  Luis

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 0/5] enable bs > ps for block devices
  2024-05-10 10:29 [PATCH 0/5] enable bs > ps for block devices hare
                   ` (4 preceding siblings ...)
  2024-05-10 10:29 ` [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE hare
@ 2024-05-11 23:12 ` Luis Chamberlain
  2024-05-30 13:37 ` Pankaj Raghav (Samsung)
  6 siblings, 0 replies; 35+ messages in thread
From: Luis Chamberlain @ 2024-05-11 23:12 UTC (permalink / raw)
  To: hare; +Cc: Andrew Morton, Matthew Wilcox, Pankaj Raghav, linux-nvme,
	linux-block

On Fri, May 10, 2024 at 12:29:01PM +0200, hare@kernel.org wrote:
> From: Hannes Reinecke <hare@kernel.org>
> 
> Hi all,
> 
> based on the patch series from Pankaj '[PATCH v5 00/11] enable bs > ps in XFS'
> it's now quite simple to enable support for block devices with block sizes
> larger than page size even without having to disable CONFIG_BUFFER_HEAD.
> The patchset really is just two rather trivial patches to fs/mpage,
> and two patches to remove hardcoded restrictions on the block size.
> 
> As usual, comments and reviews are welcome.

What's the test plan?

  Luis

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-11 23:09   ` Luis Chamberlain
@ 2024-05-11 23:30     ` Matthew Wilcox
  2024-05-11 23:51       ` Luis Chamberlain
  0 siblings, 1 reply; 35+ messages in thread
From: Matthew Wilcox @ 2024-05-11 23:30 UTC (permalink / raw)
  To: Luis Chamberlain
  Cc: hare, Andrew Morton, Pankaj Raghav, linux-nvme, linux-block,
	Pankaj Raghav

On Sat, May 11, 2024 at 04:09:48PM -0700, Luis Chamberlain wrote:
> We can't just do this, we need to consider the actual nvme cap (test it,
> and if it crashes and below what the page cache supports, then we have
> to go below) and so to make the enablment easier. So we could just move
> this to helper [0]. Then when the bdev cache patch goes through the
> check for CONFIG_BUFFER_HEAD can be removed, if this goes first.
> 
> We crash if we go above 1 MiB today, we should be able to go up to 2
> MiB but that requires some review to see what stupid thing is getting
> in the way.
> 
> [0] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git/commit/?h=20240408-lbs-scsi-kludge&id=1f7f4dce548cc11872e977939a872b107c68ad53

This is overengineered garbage.  What's the crash?

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-11 23:30     ` Matthew Wilcox
@ 2024-05-11 23:51       ` Luis Chamberlain
  2024-05-12  2:43         ` Luis Chamberlain
  0 siblings, 1 reply; 35+ messages in thread
From: Luis Chamberlain @ 2024-05-11 23:51 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: hare, Andrew Morton, Pankaj Raghav, linux-nvme, linux-block,
	Pankaj Raghav

On Sun, May 12, 2024 at 12:30:40AM +0100, Matthew Wilcox wrote:
> On Sat, May 11, 2024 at 04:09:48PM -0700, Luis Chamberlain wrote:
> > We can't just do this, we need to consider the actual nvme cap (test it,
> > and if it crashes and below what the page cache supports, then we have
> > to go below) and so to make the enablment easier. So we could just move
> > this to helper [0]. Then when the bdev cache patch goes through the
> > check for CONFIG_BUFFER_HEAD can be removed, if this goes first.
> > 
> > We crash if we go above 1 MiB today, we should be able to go up to 2
> > MiB but that requires some review to see what stupid thing is getting
> > in the way.
> > 
> > [0] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git/commit/?h=20240408-lbs-scsi-kludge&id=1f7f4dce548cc11872e977939a872b107c68ad53
> 
> This is overengineered garbage.  What's the crash?

I had only tested it with iomap, I had not tested it with buffer-heads,
and so it would require re-testing. It's Saturday 5pm, I should be doing
something else other than being on my computer.

  Luis

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-11 23:51       ` Luis Chamberlain
@ 2024-05-12  2:43         ` Luis Chamberlain
  2024-05-12  9:16           ` Luis Chamberlain
  2024-05-13 13:43           ` Hannes Reinecke
  0 siblings, 2 replies; 35+ messages in thread
From: Luis Chamberlain @ 2024-05-12  2:43 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: hare, Andrew Morton, Pankaj Raghav, linux-nvme, linux-block,
	Pankaj Raghav

On Sat, May 11, 2024 at 04:51:32PM -0700, Luis Chamberlain wrote:
> On Sun, May 12, 2024 at 12:30:40AM +0100, Matthew Wilcox wrote:
> > On Sat, May 11, 2024 at 04:09:48PM -0700, Luis Chamberlain wrote:
> > > We can't just do this, we need to consider the actual nvme cap (test it,
> > > and if it crashes and below what the page cache supports, then we have
> > > to go below) and so to make the enablment easier. So we could just move
> > > this to helper [0]. Then when the bdev cache patch goes through the
> > > check for CONFIG_BUFFER_HEAD can be removed, if this goes first.
> > > 
> > > We crash if we go above 1 MiB today, we should be able to go up to 2
> > > MiB but that requires some review to see what stupid thing is getting
> > > in the way.
> > > 
> > > [0] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git/commit/?h=20240408-lbs-scsi-kludge&id=1f7f4dce548cc11872e977939a872b107c68ad53
> > 
> > This is overengineered garbage.  What's the crash?
> 
> I had only tested it with iomap, I had not tested it with buffer-heads,
> and so it would require re-testing. It's Saturday 5pm, I should be doing
> something else other than being on my computer.

It crashes because we forgot two things in this series below, so the
change below lets us *at least* boot up to a 64k LBA format on
NVMe.

One can reproduce this with kdevops with:

make defconfig-lbs-xfs-bdev-nvme
make bringup
make linux

I've added another defconfig which bumps the LBA format up to 512 KiB to
see if bootup blows up, that has another defconfig:

make lbs-xfs-bdev-large-nvme
make bringup
make linux

That at least booted. Note that the above defconfigs use this thread's
message ID, so it applies this series on top of the min order branch.
The patch below is just needed.

I'll try next going above 512 KiB.
 
diff --git a/fs/buffer.c b/fs/buffer.c 
index 4f73d23c2c46..fa88e300a946 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2360,8 +2360,6 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
 	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
 		limit = inode->i_sb->s_maxbytes;
 
-	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
-
 	head = folio_create_buffers(folio, inode, 0);
 	blocksize = head->b_size;
 
diff --git a/fs/mpage.c b/fs/mpage.c
index e3732686e65f..e124c924b2e7 100644
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -178,7 +178,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
 	gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
 
 	/* MAX_BUF_PER_PAGE, for example */
-	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
 
 	if (args->is_readahead) {
 		opf |= REQ_RAHEAD;

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-12  2:43         ` Luis Chamberlain
@ 2024-05-12  9:16           ` Luis Chamberlain
  2024-05-13 16:07             ` Hannes Reinecke
  2024-05-13 13:43           ` Hannes Reinecke
  1 sibling, 1 reply; 35+ messages in thread
From: Luis Chamberlain @ 2024-05-12  9:16 UTC (permalink / raw)
  To: Matthew Wilcox, Kent Overstreet
  Cc: hare, Andrew Morton, Pankaj Raghav, linux-nvme, linux-block,
	Pankaj Raghav

On Sat, May 11, 2024 at 07:43:26PM -0700, Luis Chamberlain wrote:
> I'll try next going above 512 KiB.

At 1 MiB NVMe LBA format we crash with the BUG_ON(sectors <= 0) on bio_split().

[   13.401651] ------------[ cut here ]------------
[   13.403298] kernel BUG at block/bio.c:1626!
[   13.404708] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
[   13.406390] CPU: 5 PID: 87 Comm: kworker/u38:1 Not tainted 6.9.0-rc6+ #2
[   13.408480] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[   13.411304] Workqueue: nvme-wq nvme_scan_work [nvme_core]
[   13.412928] RIP: 0010:bio_split (block/bio.c:1626 (discriminator 1)) 
[ 13.414148] Code: 5b 4c 89 e0 5d 41 5c 41 5d c3 cc cc cc cc c7 43 28 00 00 00 00 eb db 0f 0b 45 31 e4 5b 5d 4c 89 e0 41 5c 41 5d c3 cc cc cc cc <0f> 0b 0f 0b 4c 89 e7 e8 bf ee ff ff eb e1 66 66 2e 0f 1f 84 00 00
All code
========
   0:	5b                   	pop    %rbx
   1:	4c 89 e0             	mov    %r12,%rax
   4:	5d                   	pop    %rbp
   5:	41 5c                	pop    %r12
   7:	41 5d                	pop    %r13
   9:	c3                   	ret
   a:	cc                   	int3
   b:	cc                   	int3
   c:	cc                   	int3
   d:	cc                   	int3
   e:	c7 43 28 00 00 00 00 	movl   $0x0,0x28(%rbx)
  15:	eb db                	jmp    0xfffffffffffffff2
  17:	0f 0b                	ud2
  19:	45 31 e4             	xor    %r12d,%r12d
  1c:	5b                   	pop    %rbx
  1d:	5d                   	pop    %rbp
  1e:	4c 89 e0             	mov    %r12,%rax
  21:	41 5c                	pop    %r12
  23:	41 5d                	pop    %r13
  25:	c3                   	ret
  26:	cc                   	int3
  27:	cc                   	int3
  28:	cc                   	int3
  29:	cc                   	int3
  2a:*	0f 0b                	ud2		<-- trapping instruction
  2c:	0f 0b                	ud2
  2e:	4c 89 e7             	mov    %r12,%rdi
  31:	e8 bf ee ff ff       	call   0xffffffffffffeef5
  36:	eb e1                	jmp    0x19
  38:	66                   	data16
  39:	66                   	data16
  3a:	2e                   	cs
  3b:	0f                   	.byte 0xf
  3c:	1f                   	(bad)
  3d:	84 00                	test   %al,(%rax)
	...

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2
   2:	0f 0b                	ud2
   4:	4c 89 e7             	mov    %r12,%rdi
   7:	e8 bf ee ff ff       	call   0xffffffffffffeecb
   c:	eb e1                	jmp    0xffffffffffffffef
   e:	66                   	data16
   f:	66                   	data16
  10:	2e                   	cs
  11:	0f                   	.byte 0xf
  12:	1f                   	(bad)
  13:	84 00                	test   %al,(%rax)
	...
[   13.420639] RSP: 0018:ffffc056407bb8e0 EFLAGS: 00010246
[   13.422140] RAX: 0000000000000001 RBX: ffff9d97c165fa80 RCX: ffff9d97e65ce860
[   13.424128] RDX: 0000000000000c00 RSI: 0000000000000000 RDI: ffff9d97c165fa80
[   13.426123] RBP: 0000000000000000 R08: 0000000000000080 R09: 0000000000000000
[   13.428372] R10: ffff9d97c165fa80 R11: ffff9d97c165faf8 R12: ffff9d97df3f7250
[   13.430636] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
[   13.432510] FS:  0000000000000000(0000) GS:ffff9d983bd40000(0000) knlGS:0000000000000000
[   13.434539] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   13.435987] CR2: 000055b34c8d6188 CR3: 000000011dfb8002 CR4: 0000000000770ef0
[   13.437664] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   13.439354] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
[   13.440911] PKRU: 55555554
[   13.441605] Call Trace:
[   13.442250]  <TASK>
[   13.442836] ? die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434 arch/x86/kernel/dumpstack.c:447) 
[   13.443585] ? do_trap (arch/x86/kernel/traps.c:114 arch/x86/kernel/traps.c:155) 
[   13.444356] ? bio_split (block/bio.c:1626 (discriminator 1)) 
[   13.445127] ? do_error_trap (./arch/x86/include/asm/traps.h:58 arch/x86/kernel/traps.c:176) 
[   13.445975] ? bio_split (block/bio.c:1626 (discriminator 1)) 
[   13.446760] ? exc_invalid_op (arch/x86/kernel/traps.c:267) 
[   13.447635] ? bio_split (block/bio.c:1626 (discriminator 1)) 
[   13.448325] ? asm_exc_invalid_op (./arch/x86/include/asm/idtentry.h:621) 
[   13.449137] ? bio_split (block/bio.c:1626 (discriminator 1)) 
[   13.449819] __bio_split_to_limits (block/blk-merge.c:366) 
[   13.450675] blk_mq_submit_bio (block/blk-mq.c:2973) 
[   13.451490] ? kmem_cache_alloc (./include/linux/kmemleak.h:42 mm/slub.c:3802 mm/slub.c:3845 mm/slub.c:3852) 
[   13.452253] ? __mod_node_page_state (mm/vmstat.c:403 (discriminator 2)) 
[   13.453050] submit_bio_noacct_nocheck (./include/linux/bio.h:639 block/blk-core.c:701 block/blk-core.c:729) 
[   13.453897] ? submit_bio_noacct (block/blk-core.c:758 (discriminator 1)) 
[   13.454658] block_read_full_folio (fs/buffer.c:2429 (discriminator 1)) 
[   13.455467] ? __pfx_blkdev_get_block (block/fops.c:409) 
[   13.456240] ? __pfx_blkdev_read_folio (block/fops.c:437) 
[   13.457014] ? __pfx_blkdev_read_folio (block/fops.c:437) 
[   13.457792] filemap_read_folio (mm/filemap.c:2335) 
[   13.458484] do_read_cache_folio (mm/filemap.c:3763) 
[   13.459220] ? __pfx_adfspart_check_ICS (block/partitions/acorn.c:351) 
[   13.459967] read_part_sector (./include/linux/pagemap.h:970 block/partitions/core.c:715) 
[   13.460602] adfspart_check_ICS (block/partitions/acorn.c:361) 
[   13.461269] ? snprintf (lib/vsprintf.c:2963) 
[   13.461844] ? __pfx_adfspart_check_ICS (block/partitions/acorn.c:351) 
[   13.462601] bdev_disk_changed (block/partitions/core.c:138 block/partitions/core.c:582 block/partitions/core.c:686 block/partitions/core.c:635) 
[   13.463268] blkdev_get_whole (block/bdev.c:684) 
[   13.463871] bdev_open (block/bdev.c:893) 
[   13.464386] bdev_file_open_by_dev (block/bdev.c:995 block/bdev.c:969) 
[   13.465006] disk_scan_partitions (block/genhd.c:369 (discriminator 1)) 
[   13.465621] device_add_disk (block/genhd.c:512) 
[   13.466199] nvme_scan_ns (drivers/nvme/host/core.c:3807 (discriminator 1) drivers/nvme/host/core.c:3961 (discriminator 1)) nvme_core
[   13.466885] nvme_scan_work (drivers/nvme/host/core.c:4015 drivers/nvme/host/core.c:4105) nvme_core
[   13.467598] process_one_work (kernel/workqueue.c:3254) 
[   13.468186] worker_thread (kernel/workqueue.c:3329 (discriminator 2) kernel/workqueue.c:3416 (discriminator 2)) 
[   13.468723] ? _raw_spin_lock_irqsave (./arch/x86/include/asm/atomic.h:115 (discriminator 4) ./include/linux/atomic/atomic-arch-fallback.h:2170 (discriminator 4) ./include/linux/atomic/atomic-instrumented.h:1302 (discriminator 4) ./include/asm-generic/qspinlock.h:111 (discriminator 4) ./include/linux/spinlock.h:187 (discriminator 4) ./include/linux/spinlock_api_smp.h:111 (discriminator 4) kernel/locking/spinlock.c:162 (discriminator 4)) 
[   13.469343] ? __pfx_worker_thread (kernel/workqueue.c:3362) 
[   13.469933] kthread (kernel/kthread.c:388) 
[   13.470395] ? __pfx_kthread (kernel/kthread.c:341) 
[   13.470928] ret_from_fork (arch/x86/kernel/process.c:147) 
[   13.471457] ? __pfx_kthread (kernel/kthread.c:341) 
[   13.472008] ret_from_fork_asm (arch/x86/entry/entry_64.S:257) 
[   13.472540]  </TASK>
[   13.472865] Modules linked in: crc32c_intel psmouse nvme nvme_core t10_pi crc64_rocksoft crc64 virtio_pci virtio_pci_legacy_dev virtio_pci_modern_dev virtio virtio_ring
[   13.474629] ---[ end trace 0000000000000000 ]---
[   13.475229] RIP: 0010:bio_split (block/bio.c:1626 (discriminator 1)) 
[ 13.475742] Code: 5b 4c 89 e0 5d 41 5c 41 5d c3 cc cc cc cc c7 43 28 00 00 00 00 eb db 0f 0b 45 31 e4 5b 5d 4c 89 e0 41 5c 41 5d c3 cc cc cc cc <0f> 0b 0f 0b 4c 89 e7 e8 bf ee ff ff eb e1 66 66 2e 0f 1f 84 00 00
All code
========
   0:	5b                   	pop    %rbx
   1:	4c 89 e0             	mov    %r12,%rax
   4:	5d                   	pop    %rbp
   5:	41 5c                	pop    %r12
   7:	41 5d                	pop    %r13
   9:	c3                   	ret
   a:	cc                   	int3
   b:	cc                   	int3
   c:	cc                   	int3
   d:	cc                   	int3
   e:	c7 43 28 00 00 00 00 	movl   $0x0,0x28(%rbx)
  15:	eb db                	jmp    0xfffffffffffffff2
  17:	0f 0b                	ud2
  19:	45 31 e4             	xor    %r12d,%r12d
  1c:	5b                   	pop    %rbx
  1d:	5d                   	pop    %rbp
  1e:	4c 89 e0             	mov    %r12,%rax
  21:	41 5c                	pop    %r12
  23:	41 5d                	pop    %r13
  25:	c3                   	ret
  26:	cc                   	int3
  27:	cc                   	int3
  28:	cc                   	int3
  29:	cc                   	int3
  2a:*	0f 0b                	ud2		<-- trapping instruction
  2c:	0f 0b                	ud2
  2e:	4c 89 e7             	mov    %r12,%rdi
  31:	e8 bf ee ff ff       	call   0xffffffffffffeef5
  36:	eb e1                	jmp    0x19
  38:	66                   	data16
  39:	66                   	data16
  3a:	2e                   	cs
  3b:	0f                   	.byte 0xf
  3c:	1f                   	(bad)
  3d:	84 00                	test   %al,(%rax)
	...

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2
   2:	0f 0b                	ud2
   4:	4c 89 e7             	mov    %r12,%rdi
   7:	e8 bf ee ff ff       	call   0xffffffffffffeecb
   c:	eb e1                	jmp    0xffffffffffffffef
   e:	66                   	data16
   f:	66                   	data16
  10:	2e                   	cs
  11:	0f                   	.byte 0xf
  12:	1f                   	(bad)
  13:	84 00                	test   %al,(%rax)
	...
[   13.477749] RSP: 0018:ffffc056407bb8e0 EFLAGS: 00010246
[   13.478405] RAX: 0000000000000001 RBX: ffff9d97c165fa80 RCX: ffff9d97e65ce860
[   13.479230] RDX: 0000000000000c00 RSI: 0000000000000000 RDI: ffff9d97c165fa80
[   13.480026] RBP: 0000000000000000 R08: 0000000000000080 R09: 0000000000000000
[   13.480802] R10: ffff9d97c165fa80 R11: ffff9d97c165faf8 R12: ffff9d97df3f7250
[   13.481568] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
[   13.482337] FS:  0000000000000000(0000) GS:ffff9d983bd40000(0000) knlGS:0000000000000000
[   13.483218] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   13.483853] CR2: 000055b34c8d6188 CR3: 000000011dfb8002 CR4: 0000000000770ef0
[   13.484584] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   13.485327] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
[   13.486070] PKRU: 55555554

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-12  2:43         ` Luis Chamberlain
  2024-05-12  9:16           ` Luis Chamberlain
@ 2024-05-13 13:43           ` Hannes Reinecke
  2024-05-13 14:32             ` Luis Chamberlain
  1 sibling, 1 reply; 35+ messages in thread
From: Hannes Reinecke @ 2024-05-13 13:43 UTC (permalink / raw)
  To: Luis Chamberlain, Matthew Wilcox
  Cc: hare, Andrew Morton, Pankaj Raghav, linux-nvme, linux-block,
	Pankaj Raghav

On 5/12/24 04:43, Luis Chamberlain wrote:
> On Sat, May 11, 2024 at 04:51:32PM -0700, Luis Chamberlain wrote:
>> On Sun, May 12, 2024 at 12:30:40AM +0100, Matthew Wilcox wrote:
>>> On Sat, May 11, 2024 at 04:09:48PM -0700, Luis Chamberlain wrote:
>>>> We can't just do this, we need to consider the actual nvme cap (test it,
>>>> and if it crashes and below what the page cache supports, then we have
>>>> to go below) and so to make the enablment easier. So we could just move
>>>> this to helper [0]. Then when the bdev cache patch goes through the
>>>> check for CONFIG_BUFFER_HEAD can be removed, if this goes first.
>>>>
>>>> We crash if we go above 1 MiB today, we should be able to go up to 2
>>>> MiB but that requires some review to see what stupid thing is getting
>>>> in the way.
>>>>
>>>> [0] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git/commit/?h=20240408-lbs-scsi-kludge&id=1f7f4dce548cc11872e977939a872b107c68ad53
>>>
>>> This is overengineered garbage.  What's the crash?
>>
>> I had only tested it with iomap, I had not tested it with buffer-heads,
>> and so it would require re-testing. It's Saturday 5pm, I should be doing
>> something else other than being on my computer.
> 
> It crashes because we forgot two things in this series below, so the
> change below its us to enable to *at least* boot up to 64k LBA format on
> NVMe.
> 
> One can reproduce this with kdevops with:
> 
> make defconfig-lbs-xfs-bdev-nvme
> make bringup
> make linux
> 
> I've added another defconfig which bumps the LBA format up to 512 KiB to
> see if bootup blows up, that has another defconfig:
> 
> make lbs-xfs-bdev-large-nvme
> make bringup
> make linux
> 
> That at least booted. Note that the above defconfigs use this thread's
> message ID, so it applies this series on top of the min order branch.
> The patch below is just needed.
> 
> I'll try next going above 512 KiB.
>   
> diff --git a/fs/buffer.c b/fs/buffer.c
> index 4f73d23c2c46..fa88e300a946 100644
> --- a/fs/buffer.c
> +++ b/fs/buffer.c
> @@ -2360,8 +2360,6 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
>   	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
>   		limit = inode->i_sb->s_maxbytes;
>   
> -	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
> -
>   	head = folio_create_buffers(folio, inode, 0);
>   	blocksize = head->b_size;
>   
> diff --git a/fs/mpage.c b/fs/mpage.c
> index e3732686e65f..e124c924b2e7 100644
> --- a/fs/mpage.c
> +++ b/fs/mpage.c
> @@ -178,7 +178,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
>   	gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
>   
>   	/* MAX_BUF_PER_PAGE, for example */
> -	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
>   
>   	if (args->is_readahead) {
>   		opf |= REQ_RAHEAD;
> 
Thanks. Will be including that in the next round.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-13 13:43           ` Hannes Reinecke
@ 2024-05-13 14:32             ` Luis Chamberlain
  0 siblings, 0 replies; 35+ messages in thread
From: Luis Chamberlain @ 2024-05-13 14:32 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Matthew Wilcox, hare, Andrew Morton, Pankaj Raghav, linux-nvme,
	linux-block, Pankaj Raghav, Pankaj Raghav

On Mon, May 13, 2024 at 03:43:38PM +0200, Hannes Reinecke wrote:
> On 5/12/24 04:43, Luis Chamberlain wrote:
> > On Sat, May 11, 2024 at 04:51:32PM -0700, Luis Chamberlain wrote:
> > > On Sun, May 12, 2024 at 12:30:40AM +0100, Matthew Wilcox wrote:
> > > > On Sat, May 11, 2024 at 04:09:48PM -0700, Luis Chamberlain wrote:
> > > > > We can't just do this, we need to consider the actual nvme cap (test it,
> > > > > and if it crashes and below what the page cache supports, then we have
> > > > > to go below) and so to make the enablment easier. So we could just move
> > > > > this to helper [0]. Then when the bdev cache patch goes through the
> > > > > check for CONFIG_BUFFER_HEAD can be removed, if this goes first.
> > > > > 
> > > > > We crash if we go above 1 MiB today, we should be able to go up to 2
> > > > > MiB but that requires some review to see what stupid thing is getting
> > > > > in the way.
> > > > > 
> > > > > [0] https://git.kernel.org/pub/scm/linux/kernel/git/mcgrof/linux.git/commit/?h=20240408-lbs-scsi-kludge&id=1f7f4dce548cc11872e977939a872b107c68ad53
> > > > 
> > > > This is overengineered garbage.  What's the crash?
> > > 
> > > I had only tested it with iomap, I had not tested it with buffer-heads,
> > > and so it would require re-testing. It's Saturday 5pm, I should be doing
> > > something else other than being on my computer.
> > 
> > It crashes because we forgot two things in this series below, so the
> > change below lets us *at least* boot up to 64k LBA format on
> > NVMe.
> > 
> > One can reproduce this with kdevops with:
> > 
> > make defconfig-lbs-xfs-bdev-nvme
> > make bringup
> > make linux
> > 
> > I've added another defconfig which bumps the LBA format up to 512 KiB to
> > see if bootup blows up, that has another defconfig:
> > 
> > make lbs-xfs-bdev-large-nvme
> > make bringup
> > make linux
> > 
> > That at least booted. Note that the above defconfigs use this thread's
> > message ID, so it applies this series on top of the min order branch.
> > The patch below is just needed.
> > 
> > I'll try next going above 512 KiB.
> > diff --git a/fs/buffer.c b/fs/buffer.c
> > index 4f73d23c2c46..fa88e300a946 100644
> > --- a/fs/buffer.c
> > +++ b/fs/buffer.c
> > @@ -2360,8 +2360,6 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
> >   	if (IS_ENABLED(CONFIG_FS_VERITY) && IS_VERITY(inode))
> >   		limit = inode->i_sb->s_maxbytes;
> > -	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
> > -
> >   	head = folio_create_buffers(folio, inode, 0);
> >   	blocksize = head->b_size;
> > diff --git a/fs/mpage.c b/fs/mpage.c
> > index e3732686e65f..e124c924b2e7 100644
> > --- a/fs/mpage.c
> > +++ b/fs/mpage.c
> > @@ -178,7 +178,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args)
> >   	gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL);
> >   	/* MAX_BUF_PER_PAGE, for example */
> > -	VM_BUG_ON_FOLIO(folio_test_large(folio), folio);
> >   	if (args->is_readahead) {
> >   		opf |= REQ_RAHEAD;
> > 
> Thanks. Will be including that in the next round.

I gave it a spin with the above changes, results:

https://github.com/linux-kdevops/kdevops/commit/f2641efe7d2037892e0477fdc8daf66af59c1f01
https://github.com/linux-kdevops/kdevops-results-archive/commit/c3ac880e909a30aa2f9b24231d0f7297dfdf4e8e

  Luis

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-12  9:16           ` Luis Chamberlain
@ 2024-05-13 16:07             ` Hannes Reinecke
  2024-05-13 21:05               ` Luis Chamberlain
  2024-05-14  5:08               ` Matthew Wilcox
  0 siblings, 2 replies; 35+ messages in thread
From: Hannes Reinecke @ 2024-05-13 16:07 UTC (permalink / raw)
  To: Luis Chamberlain, Matthew Wilcox, Kent Overstreet
  Cc: hare, Andrew Morton, Pankaj Raghav, linux-nvme, linux-block,
	Pankaj Raghav

[-- Attachment #1: Type: text/plain, Size: 11497 bytes --]

On 5/12/24 11:16, Luis Chamberlain wrote:
> On Sat, May 11, 2024 at 07:43:26PM -0700, Luis Chamberlain wrote:
>> I'll try next going above 512 KiB.
> 
> At 1 MiB NVMe LBA format we crash with the BUG_ON(sectors <= 0) on bio_split().
> 
> [   13.401651] ------------[ cut here ]------------
> [   13.403298] kernel BUG at block/bio.c:1626!
> [   13.404708] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> [   13.406390] CPU: 5 PID: 87 Comm: kworker/u38:1 Not tainted 6.9.0-rc6+ #2
> [   13.408480] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> [   13.411304] Workqueue: nvme-wq nvme_scan_work [nvme_core]
> [   13.412928] RIP: 0010:bio_split (block/bio.c:1626 (discriminator 1))
> [ 13.414148] Code: 5b 4c 89 e0 5d 41 5c 41 5d c3 cc cc cc cc c7 43 28 00 00 00 00 eb db 0f 0b 45 31 e4 5b 5d 4c 89 e0 41 5c 41 5d c3 cc cc cc cc <0f> 0b 0f 0b 4c 89 e7 e8 bf ee ff ff eb e1 66 66 2e 0f 1f 84 00 00
> All code
> ========
>     0:	5b                   	pop    %rbx
>     1:	4c 89 e0             	mov    %r12,%rax
>     4:	5d                   	pop    %rbp
>     5:	41 5c                	pop    %r12
>     7:	41 5d                	pop    %r13
>     9:	c3                   	ret
>     a:	cc                   	int3
>     b:	cc                   	int3
>     c:	cc                   	int3
>     d:	cc                   	int3
>     e:	c7 43 28 00 00 00 00 	movl   $0x0,0x28(%rbx)
>    15:	eb db                	jmp    0xfffffffffffffff2
>    17:	0f 0b                	ud2
>    19:	45 31 e4             	xor    %r12d,%r12d
>    1c:	5b                   	pop    %rbx
>    1d:	5d                   	pop    %rbp
>    1e:	4c 89 e0             	mov    %r12,%rax
>    21:	41 5c                	pop    %r12
>    23:	41 5d                	pop    %r13
>    25:	c3                   	ret
>    26:	cc                   	int3
>    27:	cc                   	int3
>    28:	cc                   	int3
>    29:	cc                   	int3
>    2a:*	0f 0b                	ud2		<-- trapping instruction
>    2c:	0f 0b                	ud2
>    2e:	4c 89 e7             	mov    %r12,%rdi
>    31:	e8 bf ee ff ff       	call   0xffffffffffffeef5
>    36:	eb e1                	jmp    0x19
>    38:	66                   	data16
>    39:	66                   	data16
>    3a:	2e                   	cs
>    3b:	0f                   	.byte 0xf
>    3c:	1f                   	(bad)
>    3d:	84 00                	test   %al,(%rax)
> 	...
> 
> Code starting with the faulting instruction
> ===========================================
>     0:	0f 0b                	ud2
>     2:	0f 0b                	ud2
>     4:	4c 89 e7             	mov    %r12,%rdi
>     7:	e8 bf ee ff ff       	call   0xffffffffffffeecb
>     c:	eb e1                	jmp    0xffffffffffffffef
>     e:	66                   	data16
>     f:	66                   	data16
>    10:	2e                   	cs
>    11:	0f                   	.byte 0xf
>    12:	1f                   	(bad)
>    13:	84 00                	test   %al,(%rax)
> 	...
> [   13.420639] RSP: 0018:ffffc056407bb8e0 EFLAGS: 00010246
> [   13.422140] RAX: 0000000000000001 RBX: ffff9d97c165fa80 RCX: ffff9d97e65ce860
> [   13.424128] RDX: 0000000000000c00 RSI: 0000000000000000 RDI: ffff9d97c165fa80
> [   13.426123] RBP: 0000000000000000 R08: 0000000000000080 R09: 0000000000000000
> [   13.428372] R10: ffff9d97c165fa80 R11: ffff9d97c165faf8 R12: ffff9d97df3f7250
> [   13.430636] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
> [   13.432510] FS:  0000000000000000(0000) GS:ffff9d983bd40000(0000) knlGS:0000000000000000
> [   13.434539] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   13.435987] CR2: 000055b34c8d6188 CR3: 000000011dfb8002 CR4: 0000000000770ef0
> [   13.437664] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   13.439354] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
> [   13.440911] PKRU: 55555554
> [   13.441605] Call Trace:
> [   13.442250]  <TASK>
> [   13.442836] ? die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434 arch/x86/kernel/dumpstack.c:447)
> [   13.443585] ? do_trap (arch/x86/kernel/traps.c:114 arch/x86/kernel/traps.c:155)
> [   13.444356] ? bio_split (block/bio.c:1626 (discriminator 1))
> [   13.445127] ? do_error_trap (./arch/x86/include/asm/traps.h:58 arch/x86/kernel/traps.c:176)
> [   13.445975] ? bio_split (block/bio.c:1626 (discriminator 1))
> [   13.446760] ? exc_invalid_op (arch/x86/kernel/traps.c:267)
> [   13.447635] ? bio_split (block/bio.c:1626 (discriminator 1))
> [   13.448325] ? asm_exc_invalid_op (./arch/x86/include/asm/idtentry.h:621)
> [   13.449137] ? bio_split (block/bio.c:1626 (discriminator 1))
> [   13.449819] __bio_split_to_limits (block/blk-merge.c:366)
> [   13.450675] blk_mq_submit_bio (block/blk-mq.c:2973)
> [   13.451490] ? kmem_cache_alloc (./include/linux/kmemleak.h:42 mm/slub.c:3802 mm/slub.c:3845 mm/slub.c:3852)
> [   13.452253] ? __mod_node_page_state (mm/vmstat.c:403 (discriminator 2))
> [   13.453050] submit_bio_noacct_nocheck (./include/linux/bio.h:639 block/blk-core.c:701 block/blk-core.c:729)
> [   13.453897] ? submit_bio_noacct (block/blk-core.c:758 (discriminator 1))
> [   13.454658] block_read_full_folio (fs/buffer.c:2429 (discriminator 1))
> [   13.455467] ? __pfx_blkdev_get_block (block/fops.c:409)
> [   13.456240] ? __pfx_blkdev_read_folio (block/fops.c:437)
> [   13.457014] ? __pfx_blkdev_read_folio (block/fops.c:437)
> [   13.457792] filemap_read_folio (mm/filemap.c:2335)
> [   13.458484] do_read_cache_folio (mm/filemap.c:3763)
> [   13.459220] ? __pfx_adfspart_check_ICS (block/partitions/acorn.c:351)
> [   13.459967] read_part_sector (./include/linux/pagemap.h:970 block/partitions/core.c:715)
> [   13.460602] adfspart_check_ICS (block/partitions/acorn.c:361)
> [   13.461269] ? snprintf (lib/vsprintf.c:2963)
> [   13.461844] ? __pfx_adfspart_check_ICS (block/partitions/acorn.c:351)
> [   13.462601] bdev_disk_changed (block/partitions/core.c:138 block/partitions/core.c:582 block/partitions/core.c:686 block/partitions/core.c:635)
> [   13.463268] blkdev_get_whole (block/bdev.c:684)
> [   13.463871] bdev_open (block/bdev.c:893)
> [   13.464386] bdev_file_open_by_dev (block/bdev.c:995 block/bdev.c:969)
> [   13.465006] disk_scan_partitions (block/genhd.c:369 (discriminator 1))
> [   13.465621] device_add_disk (block/genhd.c:512)
> [   13.466199] nvme_scan_ns (drivers/nvme/host/core.c:3807 (discriminator 1) drivers/nvme/host/core.c:3961 (discriminator 1)) nvme_core
> [   13.466885] nvme_scan_work (drivers/nvme/host/core.c:4015 drivers/nvme/host/core.c:4105) nvme_core
> [   13.467598] process_one_work (kernel/workqueue.c:3254)
> [   13.468186] worker_thread (kernel/workqueue.c:3329 (discriminator 2) kernel/workqueue.c:3416 (discriminator 2))
> [   13.468723] ? _raw_spin_lock_irqsave (./arch/x86/include/asm/atomic.h:115 (discriminator 4) ./include/linux/atomic/atomic-arch-fallback.h:2170 (discriminator 4) ./include/linux/atomic/atomic-instrumented.h:1302 (discriminator 4) ./include/asm-generic/qspinlock.h:111 (discriminator 4) ./include/linux/spinlock.h:187 (discriminator 4) ./include/linux/spinlock_api_smp.h:111 (discriminator 4) kernel/locking/spinlock.c:162 (discriminator 4))
> [   13.469343] ? __pfx_worker_thread (kernel/workqueue.c:3362)
> [   13.469933] kthread (kernel/kthread.c:388)
> [   13.470395] ? __pfx_kthread (kernel/kthread.c:341)
> [   13.470928] ret_from_fork (arch/x86/kernel/process.c:147)
> [   13.471457] ? __pfx_kthread (kernel/kthread.c:341)
> [   13.472008] ret_from_fork_asm (arch/x86/entry/entry_64.S:257)
> [   13.472540]  </TASK>
> [   13.472865] Modules linked in: crc32c_intel psmouse nvme nvme_core t10_pi crc64_rocksoft crc64 virtio_pci virtio_pci_legacy_dev virtio_pci_modern_dev virtio virtio_ring
> [   13.474629] ---[ end trace 0000000000000000 ]---
> [   13.475229] RIP: 0010:bio_split (block/bio.c:1626 (discriminator 1))
> [ 13.475742] Code: 5b 4c 89 e0 5d 41 5c 41 5d c3 cc cc cc cc c7 43 28 00 00 00 00 eb db 0f 0b 45 31 e4 5b 5d 4c 89 e0 41 5c 41 5d c3 cc cc cc cc <0f> 0b 0f 0b 4c 89 e7 e8 bf ee ff ff eb e1 66 66 2e 0f 1f 84 00 00
> All code
> ========
>     0:	5b                   	pop    %rbx
>     1:	4c 89 e0             	mov    %r12,%rax
>     4:	5d                   	pop    %rbp
>     5:	41 5c                	pop    %r12
>     7:	41 5d                	pop    %r13
>     9:	c3                   	ret
>     a:	cc                   	int3
>     b:	cc                   	int3
>     c:	cc                   	int3
>     d:	cc                   	int3
>     e:	c7 43 28 00 00 00 00 	movl   $0x0,0x28(%rbx)
>    15:	eb db                	jmp    0xfffffffffffffff2
>    17:	0f 0b                	ud2
>    19:	45 31 e4             	xor    %r12d,%r12d
>    1c:	5b                   	pop    %rbx
>    1d:	5d                   	pop    %rbp
>    1e:	4c 89 e0             	mov    %r12,%rax
>    21:	41 5c                	pop    %r12
>    23:	41 5d                	pop    %r13
>    25:	c3                   	ret
>    26:	cc                   	int3
>    27:	cc                   	int3
>    28:	cc                   	int3
>    29:	cc                   	int3
>    2a:*	0f 0b                	ud2		<-- trapping instruction
>    2c:	0f 0b                	ud2
>    2e:	4c 89 e7             	mov    %r12,%rdi
>    31:	e8 bf ee ff ff       	call   0xffffffffffffeef5
>    36:	eb e1                	jmp    0x19
>    38:	66                   	data16
>    39:	66                   	data16
>    3a:	2e                   	cs
>    3b:	0f                   	.byte 0xf
>    3c:	1f                   	(bad)
>    3d:	84 00                	test   %al,(%rax)
> 	...
> 
> Code starting with the faulting instruction
> ===========================================
>     0:	0f 0b                	ud2
>     2:	0f 0b                	ud2
>     4:	4c 89 e7             	mov    %r12,%rdi
>     7:	e8 bf ee ff ff       	call   0xffffffffffffeecb
>     c:	eb e1                	jmp    0xffffffffffffffef
>     e:	66                   	data16
>     f:	66                   	data16
>    10:	2e                   	cs
>    11:	0f                   	.byte 0xf
>    12:	1f                   	(bad)
>    13:	84 00                	test   %al,(%rax)
> 	...
> [   13.477749] RSP: 0018:ffffc056407bb8e0 EFLAGS: 00010246
> [   13.478405] RAX: 0000000000000001 RBX: ffff9d97c165fa80 RCX: ffff9d97e65ce860
> [   13.479230] RDX: 0000000000000c00 RSI: 0000000000000000 RDI: ffff9d97c165fa80
> [   13.480026] RBP: 0000000000000000 R08: 0000000000000080 R09: 0000000000000000
> [   13.480802] R10: ffff9d97c165fa80 R11: ffff9d97c165faf8 R12: ffff9d97df3f7250
> [   13.481568] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
> [   13.482337] FS:  0000000000000000(0000) GS:ffff9d983bd40000(0000) knlGS:0000000000000000
> [   13.483218] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   13.483853] CR2: 000055b34c8d6188 CR3: 000000011dfb8002 CR4: 0000000000770ef0
> [   13.484584] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   13.485327] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
> [   13.486070] PKRU: 55555554
> 

Ah. MAX_BUF_PER_PAGE getting in the way.

Can you test with the attached patch?

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich

[-- Attachment #2: 0001-fs-buffer-restart-block_read_full_folio-to-avoid-arr.patch --]
[-- Type: text/x-patch, Size: 2347 bytes --]

From 550f8f3b556cfd1f9190e3895511cf46659521d1 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Wed, 14 Feb 2024 09:20:21 +0100
Subject: [PATCH] fs/buffer: restart block_read_full_folio() to avoid array
 overflow

block_read_full_folio() uses an on-stack array to hold any buffer_heads
which should be updated. The array is sized for the number of buffer_heads
per PAGE_SIZE, which of course will overflow for large folios.
So instead of increasing the size of the array (and thereby incurring
a possible stack overflow for really large folios) stop the iteration
when the array is filled up, submit these buffer_heads, and restart
the iteration with the remaining buffer_heads.

Signed-off-by: Hannes Reinecke <hare@suse.de>
---
 fs/buffer.c | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/fs/buffer.c b/fs/buffer.c
index fa88e300a946..41511193ae5c 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2349,7 +2349,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
 {
 	struct inode *inode = folio->mapping->host;
 	sector_t iblock, lblock;
-	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
+	struct buffer_head *bh, *head, *restart_bh = NULL, *arr[MAX_BUF_PER_PAGE];
 	size_t blocksize;
 	int nr, i;
 	int fully_mapped = 1;
@@ -2366,6 +2366,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
 	iblock = div_u64(folio_pos(folio), blocksize);
 	lblock = div_u64(limit + blocksize - 1, blocksize);
 	bh = head;
+restart:
 	nr = 0;
 	i = 0;
 
@@ -2400,7 +2401,12 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
 				continue;
 		}
 		arr[nr++] = bh;
-	} while (i++, iblock++, (bh = bh->b_this_page) != head);
+	} while (i++, iblock++, (bh = bh->b_this_page) != head && nr < MAX_BUF_PER_PAGE);
+
+	if (nr == MAX_BUF_PER_PAGE && bh != head)
+		restart_bh = bh;
+	else
+		restart_bh = NULL;
 
 	if (fully_mapped)
 		folio_set_mappedtodisk(folio);
@@ -2433,6 +2439,15 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block)
 		else
 			submit_bh(REQ_OP_READ, bh);
 	}
+	/*
+	 * Found more buffers than 'arr' could hold,
+	 * restart to submit the remaining ones.
+	 */
+	if (restart_bh) {
+		bh = restart_bh;
+		goto restart;
+	}
+
 	return 0;
 }
 EXPORT_SYMBOL(block_read_full_folio);
-- 
2.35.3


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-13 16:07             ` Hannes Reinecke
@ 2024-05-13 21:05               ` Luis Chamberlain
  2024-05-13 23:07                 ` Hannes Reinecke
  2024-05-24 10:04                 ` Hannes Reinecke
  2024-05-14  5:08               ` Matthew Wilcox
  1 sibling, 2 replies; 35+ messages in thread
From: Luis Chamberlain @ 2024-05-13 21:05 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Matthew Wilcox, Kent Overstreet, hare, Andrew Morton,
	Pankaj Raghav, linux-nvme, linux-block, Pankaj Raghav

On Mon, May 13, 2024 at 06:07:55PM +0200, Hannes Reinecke wrote:
> On 5/12/24 11:16, Luis Chamberlain wrote:
> > On Sat, May 11, 2024 at 07:43:26PM -0700, Luis Chamberlain wrote:
> > > I'll try next going above 512 KiB.
> > 
> > At 1 MiB NVMe LBA format we crash with the BUG_ON(sectors <= 0) on bio_split().
> > 
> > [   13.401651] ------------[ cut here ]------------
> > [   13.403298] kernel BUG at block/bio.c:1626!
> Ah. MAX_BUF_PER_PAGE getting in the way.
> 
> Can you test with the attached patch?

Nope same crash:

I've enabled you to easily test with NVMe on libvirt with kdevops,
please test.

 Luis

[   14.972734] ------------[ cut here ]------------
[   14.974731] kernel BUG at block/bio.c:1626!
[   14.976906] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
[   14.978899] CPU: 3 PID: 59 Comm: kworker/u36:0 Not tainted 6.9.0-rc6+ #4
[   14.981005] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
[   14.983782] Workqueue: nvme-wq nvme_scan_work [nvme_core]
[   14.985431] RIP: 0010:bio_split+0xd5/0xf0
[   14.986627] Code: 5b 4c 89 e0 5d 41 5c 41 5d c3 cc cc cc cc c7 43 28 00 00 00 00 eb db 0f 0b 45 31 e4 5b 5d 4c 89 e0 41 5c 41 5d c3 cc cc cc cc <0f> 0b 0f 0b 4c 89 e7 e8 bf ee ff ff eb e1 66 66 2e 0f 1f 84 00 00
[   14.992063] RSP: 0018:ffffbecc002378d0 EFLAGS: 00010246
[   14.993416] RAX: 0000000000000001 RBX: ffff9e2fe8583e40 RCX: ffff9e2fdcb73060
[   14.995181] RDX: 0000000000000c00 RSI: 0000000000000000 RDI: ffff9e2fe8583e40
[   14.996960] RBP: 0000000000000000 R08: 0000000000000080 R09: 0000000000000000
[   14.998715] R10: ffff9e2fe8583e40 R11: ffff9e2fe8583eb8 R12: ffff9e2fe884b750
[   15.000510] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
[   15.002128] FS:  0000000000000000(0000) GS:ffff9e303bcc0000(0000) knlGS:0000000000000000
[   15.003956] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[   15.005294] CR2: 0000561b2b5ce478 CR3: 0000000102484002 CR4: 0000000000770ef0
[   15.006921] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[   15.008509] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
[   15.010001] PKRU: 55555554
[   15.010672] Call Trace:
[   15.011297]  <TASK>
[   15.011868]  ? die+0x32/0x80
[   15.012572]  ? do_trap+0xd9/0x100
[   15.013306]  ? bio_split+0xd5/0xf0
[   15.014051]  ? do_error_trap+0x6a/0x90
[   15.014854]  ? bio_split+0xd5/0xf0
[   15.015597]  ? exc_invalid_op+0x4c/0x60
[   15.016419]  ? bio_split+0xd5/0xf0
[   15.017113]  ? asm_exc_invalid_op+0x16/0x20
[   15.017932]  ? bio_split+0xd5/0xf0
[   15.018624]  __bio_split_to_limits+0x90/0x2d0
[   15.019474]  blk_mq_submit_bio+0x111/0x6a0
[   15.020280]  ? kmem_cache_alloc+0x254/0x2e0
[   15.021040]  submit_bio_noacct_nocheck+0x2f1/0x3d0
[   15.021893]  ? submit_bio_noacct+0x42/0x5b0
[   15.022658]  block_read_full_folio+0x2b7/0x350
[   15.023457]  ? __pfx_blkdev_get_block+0x10/0x10
[   15.024284]  ? __pfx_blkdev_read_folio+0x10/0x10
[   15.025073]  ? __pfx_blkdev_read_folio+0x10/0x10
[   15.025851]  filemap_read_folio+0x32/0xb0
[   15.026540]  do_read_cache_folio+0x108/0x200
[   15.027271]  ? __pfx_adfspart_check_ICS+0x10/0x10
[   15.028066]  read_part_sector+0x32/0xe0
[   15.028701]  adfspart_check_ICS+0x32/0x480
[   15.029334]  ? snprintf+0x49/0x70
[   15.029875]  ? __pfx_adfspart_check_ICS+0x10/0x10
[   15.030592]  bdev_disk_changed+0x2a2/0x6e0
[   15.031226]  blkdev_get_whole+0x5f/0xa0
[   15.031827]  bdev_open+0x201/0x3c0
[   15.032360]  bdev_file_open_by_dev+0xb5/0x110
[   15.032990]  disk_scan_partitions+0x65/0xe0
[   15.033598]  device_add_disk+0x3e0/0x3f0
[   15.034172]  nvme_scan_ns+0x5f0/0xe50 [nvme_core]
[   15.034862]  nvme_scan_work+0x26f/0x5a0 [nvme_core]
[   15.035568]  process_one_work+0x189/0x3b0
[   15.036168]  worker_thread+0x273/0x390
[   15.036713]  ? __pfx_worker_thread+0x10/0x10
[   15.037312]  kthread+0xda/0x110
[   15.037779]  ? __pfx_kthread+0x10/0x10
[   15.038316]  ret_from_fork+0x2d/0x50
[   15.038829]  ? __pfx_kthread+0x10/0x10
[   15.039364]  ret_from_fork_asm+0x1a/0x30
[   15.039924]  </TASK>


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-13 21:05               ` Luis Chamberlain
@ 2024-05-13 23:07                 ` Hannes Reinecke
  2024-05-24 10:04                 ` Hannes Reinecke
  1 sibling, 0 replies; 35+ messages in thread
From: Hannes Reinecke @ 2024-05-13 23:07 UTC (permalink / raw)
  To: Luis Chamberlain
  Cc: Matthew Wilcox, Kent Overstreet, hare, Andrew Morton,
	Pankaj Raghav, linux-nvme, linux-block, Pankaj Raghav

On 5/13/24 23:05, Luis Chamberlain wrote:
> On Mon, May 13, 2024 at 06:07:55PM +0200, Hannes Reinecke wrote:
>> On 5/12/24 11:16, Luis Chamberlain wrote:
>>> On Sat, May 11, 2024 at 07:43:26PM -0700, Luis Chamberlain wrote:
>>>> I'll try next going above 512 KiB.
>>>
>>> At 1 MiB NVMe LBA format we crash with the BUG_ON(sectors <= 0) on bio_split().
>>>
>>> [   13.401651] ------------[ cut here ]------------
>>> [   13.403298] kernel BUG at block/bio.c:1626!
>> Ah. MAX_BUFS_PER_PAGE getting in the way.
>>
>> Can you test with the attached patch?
> 
> Nope same crash:
> 
> I've enabled you to easily test with NVMe on libvirt with kdevops,
> please test.
> 
>   Luis
> 
> [   14.972734] ------------[ cut here ]------------
> [   14.974731] kernel BUG at block/bio.c:1626!
> [   14.976906] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> [   14.978899] CPU: 3 PID: 59 Comm: kworker/u36:0 Not tainted 6.9.0-rc6+ #4
> [   14.981005] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> [   14.983782] Workqueue: nvme-wq nvme_scan_work [nvme_core]
> [   14.985431] RIP: 0010:bio_split+0xd5/0xf0
> [   14.986627] Code: 5b 4c 89 e0 5d 41 5c 41 5d c3 cc cc cc cc c7 43 28 00 00 00 00 eb db 0f 0b 45 31 e4 5b 5d 4c 89 e0 41 5c 41 5d c3 cc cc cc cc <0f> 0b 0f 0b 4c 89 e7 e8 bf ee ff ff eb e1 66 66 2e 0f 1f 84 00 00
> [   14.992063] RSP: 0018:ffffbecc002378d0 EFLAGS: 00010246
> [   14.993416] RAX: 0000000000000001 RBX: ffff9e2fe8583e40 RCX: ffff9e2fdcb73060
> [   14.995181] RDX: 0000000000000c00 RSI: 0000000000000000 RDI: ffff9e2fe8583e40
> [   14.996960] RBP: 0000000000000000 R08: 0000000000000080 R09: 0000000000000000
> [   14.998715] R10: ffff9e2fe8583e40 R11: ffff9e2fe8583eb8 R12: ffff9e2fe884b750
> [   15.000510] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
> [   15.002128] FS:  0000000000000000(0000) GS:ffff9e303bcc0000(0000) knlGS:0000000000000000
> [   15.003956] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   15.005294] CR2: 0000561b2b5ce478 CR3: 0000000102484002 CR4: 0000000000770ef0
> [   15.006921] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   15.008509] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
> [   15.010001] PKRU: 55555554
> [   15.010672] Call Trace:
> [   15.011297]  <TASK>
> [   15.011868]  ? die+0x32/0x80
> [   15.012572]  ? do_trap+0xd9/0x100
> [   15.013306]  ? bio_split+0xd5/0xf0
> [   15.014051]  ? do_error_trap+0x6a/0x90
> [   15.014854]  ? bio_split+0xd5/0xf0
> [   15.015597]  ? exc_invalid_op+0x4c/0x60
> [   15.016419]  ? bio_split+0xd5/0xf0
> [   15.017113]  ? asm_exc_invalid_op+0x16/0x20
> [   15.017932]  ? bio_split+0xd5/0xf0
> [   15.018624]  __bio_split_to_limits+0x90/0x2d0
> [   15.019474]  blk_mq_submit_bio+0x111/0x6a0
> [   15.020280]  ? kmem_cache_alloc+0x254/0x2e0
> [   15.021040]  submit_bio_noacct_nocheck+0x2f1/0x3d0
> [   15.021893]  ? submit_bio_noacct+0x42/0x5b0
> [   15.022658]  block_read_full_folio+0x2b7/0x350
> [   15.023457]  ? __pfx_blkdev_get_block+0x10/0x10
> [   15.024284]  ? __pfx_blkdev_read_folio+0x10/0x10
> [   15.025073]  ? __pfx_blkdev_read_folio+0x10/0x10
> [   15.025851]  filemap_read_folio+0x32/0xb0
> [   15.026540]  do_read_cache_folio+0x108/0x200
> [   15.027271]  ? __pfx_adfspart_check_ICS+0x10/0x10
> [   15.028066]  read_part_sector+0x32/0xe0
> [   15.028701]  adfspart_check_ICS+0x32/0x480
> [   15.029334]  ? snprintf+0x49/0x70
> [   15.029875]  ? __pfx_adfspart_check_ICS+0x10/0x10
> [   15.030592]  bdev_disk_changed+0x2a2/0x6e0
> [   15.031226]  blkdev_get_whole+0x5f/0xa0
> [   15.031827]  bdev_open+0x201/0x3c0
> [   15.032360]  bdev_file_open_by_dev+0xb5/0x110
> [   15.032990]  disk_scan_partitions+0x65/0xe0
> [   15.033598]  device_add_disk+0x3e0/0x3f0
> [   15.034172]  nvme_scan_ns+0x5f0/0xe50 [nvme_core]
> [   15.034862]  nvme_scan_work+0x26f/0x5a0 [nvme_core]
> [   15.035568]  process_one_work+0x189/0x3b0
> [   15.036168]  worker_thread+0x273/0x390
> [   15.036713]  ? __pfx_worker_thread+0x10/0x10
> [   15.037312]  kthread+0xda/0x110
> [   15.037779]  ? __pfx_kthread+0x10/0x10
> [   15.038316]  ret_from_fork+0x2d/0x50
> [   15.038829]  ? __pfx_kthread+0x10/0x10
> [   15.039364]  ret_from_fork_asm+0x1a/0x30
> [   15.039924]  </TASK>
> 

Ah. So this should fix it:

diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4e3483a16b75..4fac11edd0c8 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -289,7 +289,7 @@ struct bio *bio_split_rw(struct bio *bio, const 
struct queue_limits *lim,

                 if (nsegs < lim->max_segments &&
                     bytes + bv.bv_len <= max_bytes &&
-                   bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
+                   bv.bv_offset + bv.bv_len <= lim->max_segment_size) {
                         nsegs++;
                         bytes += bv.bv_len;
                 } else {

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare@suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-13 16:07             ` Hannes Reinecke
  2024-05-13 21:05               ` Luis Chamberlain
@ 2024-05-14  5:08               ` Matthew Wilcox
  1 sibling, 0 replies; 35+ messages in thread
From: Matthew Wilcox @ 2024-05-14  5:08 UTC (permalink / raw)
  To: Hannes Reinecke
  Cc: Luis Chamberlain, Kent Overstreet, hare, Andrew Morton,
	Pankaj Raghav, linux-nvme, linux-block, Pankaj Raghav

On Mon, May 13, 2024 at 06:07:55PM +0200, Hannes Reinecke wrote:
> On 5/12/24 11:16, Luis Chamberlain wrote:
> > On Sat, May 11, 2024 at 07:43:26PM -0700, Luis Chamberlain wrote:
> > > I'll try next going above 512 KiB.
> > 
> > At 1 MiB NVMe LBA format we crash with the BUG_ON(sectors <= 0) on bio_split().
> 
> Ah. MAX_BUF_PER_PAGE getting in the way.

That doesn't make sense.  We will need something like this patch for
filesystems which use large folios with buffer_heads.  But for a 1MB
block size device, we should have one buffer_head per folio.


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE
  2024-05-13 21:05               ` Luis Chamberlain
  2024-05-13 23:07                 ` Hannes Reinecke
@ 2024-05-24 10:04                 ` Hannes Reinecke
  1 sibling, 0 replies; 35+ messages in thread
From: Hannes Reinecke @ 2024-05-24 10:04 UTC (permalink / raw)
  To: Luis Chamberlain
  Cc: Matthew Wilcox, Kent Overstreet, hare, Andrew Morton,
	Pankaj Raghav, linux-nvme, linux-block, Pankaj Raghav

On 5/13/24 23:05, Luis Chamberlain wrote:
> On Mon, May 13, 2024 at 06:07:55PM +0200, Hannes Reinecke wrote:
>> On 5/12/24 11:16, Luis Chamberlain wrote:
>>> On Sat, May 11, 2024 at 07:43:26PM -0700, Luis Chamberlain wrote:
>>>> I'll try next going above 512 KiB.
>>>
>>> At 1 MiB NVMe LBA format we crash with the BUG_ON(sectors <= 0) on bio_split().
>>>
>>> [   13.401651] ------------[ cut here ]------------
>>> [   13.403298] kernel BUG at block/bio.c:1626!
>> Ah. MAX_BUFS_PER_PAGE getting in the way.
>>
>> Can you test with the attached patch?
> 
> Nope same crash:
> 
> I've enabled you to easily test with NVMe on libvirt with kdevops,
> please test.
> 
>   Luis
> 
> [   14.972734] ------------[ cut here ]------------
> [   14.974731] kernel BUG at block/bio.c:1626!
> [   14.976906] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
> [   14.978899] CPU: 3 PID: 59 Comm: kworker/u36:0 Not tainted 6.9.0-rc6+ #4
> [   14.981005] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014
> [   14.983782] Workqueue: nvme-wq nvme_scan_work [nvme_core]
> [   14.985431] RIP: 0010:bio_split+0xd5/0xf0
> [   14.986627] Code: 5b 4c 89 e0 5d 41 5c 41 5d c3 cc cc cc cc c7 43 28 00 00 00 00 eb db 0f 0b 45 31 e4 5b 5d 4c 89 e0 41 5c 41 5d c3 cc cc cc cc <0f> 0b 0f 0b 4c 89 e7 e8 bf ee ff ff eb e1 66 66 2e 0f 1f 84 00 00
> [   14.992063] RSP: 0018:ffffbecc002378d0 EFLAGS: 00010246
> [   14.993416] RAX: 0000000000000001 RBX: ffff9e2fe8583e40 RCX: ffff9e2fdcb73060
> [   14.995181] RDX: 0000000000000c00 RSI: 0000000000000000 RDI: ffff9e2fe8583e40
> [   14.996960] RBP: 0000000000000000 R08: 0000000000000080 R09: 0000000000000000
> [   14.998715] R10: ffff9e2fe8583e40 R11: ffff9e2fe8583eb8 R12: ffff9e2fe884b750
> [   15.000510] R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000
> [   15.002128] FS:  0000000000000000(0000) GS:ffff9e303bcc0000(0000) knlGS:0000000000000000
> [   15.003956] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> [   15.005294] CR2: 0000561b2b5ce478 CR3: 0000000102484002 CR4: 0000000000770ef0
> [   15.006921] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
> [   15.008509] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400
> [   15.010001] PKRU: 55555554
> [   15.010672] Call Trace:
> [   15.011297]  <TASK>
> [   15.011868]  ? die+0x32/0x80
> [   15.012572]  ? do_trap+0xd9/0x100
> [   15.013306]  ? bio_split+0xd5/0xf0
> [   15.014051]  ? do_error_trap+0x6a/0x90
> [   15.014854]  ? bio_split+0xd5/0xf0
> [   15.015597]  ? exc_invalid_op+0x4c/0x60
> [   15.016419]  ? bio_split+0xd5/0xf0
> [   15.017113]  ? asm_exc_invalid_op+0x16/0x20
> [   15.017932]  ? bio_split+0xd5/0xf0
> [   15.018624]  __bio_split_to_limits+0x90/0x2d0
> [   15.019474]  blk_mq_submit_bio+0x111/0x6a0
> [   15.020280]  ? kmem_cache_alloc+0x254/0x2e0
> [   15.021040]  submit_bio_noacct_nocheck+0x2f1/0x3d0
> [   15.021893]  ? submit_bio_noacct+0x42/0x5b0
> [   15.022658]  block_read_full_folio+0x2b7/0x350
> [   15.023457]  ? __pfx_blkdev_get_block+0x10/0x10
> [   15.024284]  ? __pfx_blkdev_read_folio+0x10/0x10
> [   15.025073]  ? __pfx_blkdev_read_folio+0x10/0x10
> [   15.025851]  filemap_read_folio+0x32/0xb0
> [   15.026540]  do_read_cache_folio+0x108/0x200
> [   15.027271]  ? __pfx_adfspart_check_ICS+0x10/0x10
> [   15.028066]  read_part_sector+0x32/0xe0
> [   15.028701]  adfspart_check_ICS+0x32/0x480
> [   15.029334]  ? snprintf+0x49/0x70
> [   15.029875]  ? __pfx_adfspart_check_ICS+0x10/0x10
> [   15.030592]  bdev_disk_changed+0x2a2/0x6e0
> [   15.031226]  blkdev_get_whole+0x5f/0xa0
> [   15.031827]  bdev_open+0x201/0x3c0
> [   15.032360]  bdev_file_open_by_dev+0xb5/0x110
> [   15.032990]  disk_scan_partitions+0x65/0xe0
> [   15.033598]  device_add_disk+0x3e0/0x3f0
> [   15.034172]  nvme_scan_ns+0x5f0/0xe50 [nvme_core]
> [   15.034862]  nvme_scan_work+0x26f/0x5a0 [nvme_core]
> [   15.035568]  process_one_work+0x189/0x3b0
> [   15.036168]  worker_thread+0x273/0x390
> [   15.036713]  ? __pfx_worker_thread+0x10/0x10
> [   15.037312]  kthread+0xda/0x110
> [   15.037779]  ? __pfx_kthread+0x10/0x10
> [   15.038316]  ret_from_fork+0x2d/0x50
> [   15.038829]  ? __pfx_kthread+0x10/0x10
> [   15.039364]  ret_from_fork_asm+0x1a/0x30
> [   15.039924]  </TASK>
> 
So, finally nailed it down.

Problem was that qemu is setting a default mdts of '7', which obviously
is too small for 1M sector sizes.
And the next problem was that we don't validate the 'max_hw_sectors' 
setting against the 'logical_sector_size' setting, letting the block
layer happily accept limits which have a hw_sector_size smaller than
the logical block size.
For that I've meanwhile sent a patch, but if you want to continue testing
you need to set 'mdts=0' for the qemu nvme emulation.
We _might_ want to send a patch for qemu to set mdts to '0' as the
default value, but that probably is too much hassle.

Cheers,

Hannes


^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 0/5] enable bs > ps for block devices
  2024-05-10 10:29 [PATCH 0/5] enable bs > ps for block devices hare
                   ` (5 preceding siblings ...)
  2024-05-11 23:12 ` [PATCH 0/5] enable bs > ps for block devices Luis Chamberlain
@ 2024-05-30 13:37 ` Pankaj Raghav (Samsung)
  6 siblings, 0 replies; 35+ messages in thread
From: Pankaj Raghav (Samsung) @ 2024-05-30 13:37 UTC (permalink / raw)
  To: hare
  Cc: Andrew Morton, Matthew Wilcox, Luis Chamberlain, linux-nvme,
	linux-block

On Fri, May 10, 2024 at 12:29:01PM +0200, hare@kernel.org wrote:
> From: Hannes Reinecke <hare@kernel.org>
> 
> Hi all,
> 
> based on the patch series from Pankaj '[PATCH v5 00/11] enable bs > ps in XFS'
> it's now quite simple to enable support for block devices with block sizes
> larger than page size even without having to disable CONFIG_BUFFER_HEAD.
> The patchset really is just two rather trivial patches to fs/mpage,
> and two patches to remove hardcoded restrictions on the block size.
> 
> As usual, comments and reviews are welcome.
> 
> Hannes Reinecke (4):
>   fs/mpage: use blocks_per_folio instead of blocks_per_page
>   fs/mpage: avoid negative shift for large blocksize
>   block/bdev: lift restrictions on supported blocksize
>   block/bdev: enable large folio support for large logical block sizes
> 
> Pankaj Raghav (1):
>   nvme: enable logical block size > PAGE_SIZE
> 
>  block/bdev.c             | 10 ++++++---
>  drivers/nvme/host/core.c |  7 +++----
>  fs/mpage.c               | 44 ++++++++++++++++++++--------------------

What about fs/buffer.c? Do we need some changes there?

One obvious place I see is:

diff --git a/fs/buffer.c b/fs/buffer.c
index 8c19e705b9c3..ae248a10a467 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1121,7 +1121,7 @@ __getblk_slow(struct block_device *bdev, sector_t block,
 {
        /* Size must be multiple of hard sectorsize */
        if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
-                       (size < 512 || size > PAGE_SIZE))) {
+                       (size < 512))) {
                printk(KERN_ERR "getblk(): invalid block size %d requested\n",
                                        size);
                printk(KERN_ERR "logical block size: %d\n",


^ permalink raw reply related	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2024-05-30 13:37 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-05-10 10:29 [PATCH 0/5] enable bs > ps for block devices hare
2024-05-10 10:29 ` [PATCH 1/5] fs/mpage: use blocks_per_folio instead of blocks_per_page hare
2024-05-10 22:19   ` Matthew Wilcox
2024-05-11  9:12     ` Hannes Reinecke
2024-05-10 10:29 ` [PATCH 2/5] fs/mpage: avoid negative shift for large blocksize hare
2024-05-10 22:24   ` Matthew Wilcox
2024-05-11  9:13     ` Hannes Reinecke
2024-05-10 10:29 ` [PATCH 3/5] block/bdev: lift restrictions on supported blocksize hare
2024-05-10 22:25   ` Matthew Wilcox
2024-05-11  9:15     ` Hannes Reinecke
2024-05-11 22:57     ` Luis Chamberlain
2024-05-11 22:54   ` Luis Chamberlain
2024-05-10 10:29 ` [PATCH 4/5] block/bdev: enable large folio support for large logical block sizes hare
2024-05-10 22:35   ` Matthew Wilcox
2024-05-11  9:18     ` Hannes Reinecke
2024-05-11  5:07   ` kernel test robot
2024-05-11  7:04   ` kernel test robot
2024-05-10 10:29 ` [PATCH 5/5] nvme: enable logical block size > PAGE_SIZE hare
2024-05-10 22:37   ` Matthew Wilcox
2024-05-11  9:18     ` Hannes Reinecke
2024-05-11 23:11     ` Luis Chamberlain
2024-05-11 23:09   ` Luis Chamberlain
2024-05-11 23:30     ` Matthew Wilcox
2024-05-11 23:51       ` Luis Chamberlain
2024-05-12  2:43         ` Luis Chamberlain
2024-05-12  9:16           ` Luis Chamberlain
2024-05-13 16:07             ` Hannes Reinecke
2024-05-13 21:05               ` Luis Chamberlain
2024-05-13 23:07                 ` Hannes Reinecke
2024-05-24 10:04                 ` Hannes Reinecke
2024-05-14  5:08               ` Matthew Wilcox
2024-05-13 13:43           ` Hannes Reinecke
2024-05-13 14:32             ` Luis Chamberlain
2024-05-11 23:12 ` [PATCH 0/5] enable bs > ps for block devices Luis Chamberlain
2024-05-30 13:37 ` Pankaj Raghav (Samsung)

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).