Linux block layer

Linux block layer
 help / color / mirror / Atom feed

* Re: [PATCHv2 1/3] block/bio: remove duplicate append pages code
From: Christoph Hellwig @ 2022-05-19  7:32 UTC (permalink / raw)
  To: Keith Busch
  Cc: linux-fsdevel, linux-block, axboe, Kernel Team, hch, bvanassche,
	damien.lemoal, Keith Busch
In-Reply-To: <20220518171131.3525293-2-kbusch@fb.com>

On Wed, May 18, 2022 at 10:11:29AM -0700, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> The setup for getting pages are identical for zone append and normal IO.
> Use common code for each.

How about using even more common code and avoiding churn at the same
time?  Something like:

diff --git a/block/bio.c b/block/bio.c
index a3893d80dccc9..15da722ed26d1 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1158,6 +1158,37 @@ static void bio_put_pages(struct page **pages, size_t size, size_t off)
 		put_page(pages[i]);
 }
 
+static int bio_iov_add_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int offset)
+{
+	bool same_page = false;
+
+	if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
+		if (WARN_ON_ONCE(bio_full(bio, len)))
+			return -EINVAL;
+		__bio_add_page(bio, page, len, offset);
+		return 0;
+	}
+
+	if (same_page)
+		put_page(page);
+	return 0;
+}
+
+static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
+		unsigned int len, unsigned int offset)
+{
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	bool same_page = false;
+
+	if (bio_add_hw_page(q, bio, page, len, offset,
+			queue_max_zone_append_sectors(q), &same_page) != len)
+		return -EINVAL;
+	if (same_page)
+		put_page(page);
+	return 0;
+}
+
 #define PAGE_PTRS_PER_BVEC     (sizeof(struct bio_vec) / sizeof(struct page *))
 
 /**
@@ -1176,7 +1207,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
 	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
 	struct page **pages = (struct page **)bv;
-	bool same_page = false;
 	ssize_t size, left;
 	unsigned len, i;
 	size_t offset;
@@ -1195,18 +1225,18 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 
 	for (left = size, i = 0; left > 0; left -= len, i++) {
 		struct page *page = pages[i];
+		int ret;
 
 		len = min_t(size_t, PAGE_SIZE - offset, left);
+		if (bio_op(bio) == REQ_OP_ZONE_APPEND)	
+			ret = bio_iov_add_zone_append_page(bio, page, len,
+					offset);
+		else
+			ret = bio_iov_add_page(bio, page, len, offset);
 
-		if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
-			if (same_page)
-				put_page(page);
-		} else {
-			if (WARN_ON_ONCE(bio_full(bio, len))) {
-				bio_put_pages(pages + i, left, offset);
-				return -EINVAL;
-			}
-			__bio_add_page(bio, page, len, offset);
+		if (ret) {
+			bio_put_pages(pages + i, left, offset);
+			return ret;
 		}
 		offset = 0;
 	}
@@ -1215,54 +1245,6 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	return 0;
 }
 
-static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
-{
-	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
-	unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
-	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-	unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
-	struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
-	struct page **pages = (struct page **)bv;
-	ssize_t size, left;
-	unsigned len, i;
-	size_t offset;
-	int ret = 0;
-
-	if (WARN_ON_ONCE(!max_append_sectors))
-		return 0;
-
-	/*
-	 * Move page array up in the allocated memory for the bio vecs as far as
-	 * possible so that we can start filling biovecs from the beginning
-	 * without overwriting the temporary page array.
-	 */
-	BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
-	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
-
-	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
-	if (unlikely(size <= 0))
-		return size ? size : -EFAULT;
-
-	for (left = size, i = 0; left > 0; left -= len, i++) {
-		struct page *page = pages[i];
-		bool same_page = false;
-
-		len = min_t(size_t, PAGE_SIZE - offset, left);
-		if (bio_add_hw_page(q, bio, page, len, offset,
-				max_append_sectors, &same_page) != len) {
-			bio_put_pages(pages + i, left, offset);
-			ret = -EINVAL;
-			break;
-		}
-		if (same_page)
-			put_page(page);
-		offset = 0;
-	}
-
-	iov_iter_advance(iter, size - left);
-	return ret;
-}
-
 /**
  * bio_iov_iter_get_pages - add user or kernel pages to a bio
  * @bio: bio to add pages to
@@ -1297,10 +1279,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	}
 
 	do {
-		if (bio_op(bio) == REQ_OP_ZONE_APPEND)
-			ret = __bio_iov_append_get_pages(bio, iter);
-		else
-			ret = __bio_iov_iter_get_pages(bio, iter);
+		ret = __bio_iov_iter_get_pages(bio, iter);
 	} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
 
 	/* don't account direct I/O as memory stall */

^ permalink raw reply related

* Re: cleanup blk_execute_rq*
From: Christoph Hellwig @ 2022-05-19  7:09 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Christoph Hellwig, Ming Lei, linux-block, linux-nvme, linux-scsi,
	target-devel
In-Reply-To: <28682d9a-ac58-ea19-6d51-73fbd87bfb5e@kernel.dk>

On Wed, May 18, 2022 at 04:40:55PM -0600, Jens Axboe wrote:
> On 5/17/22 12:48 AM, Christoph Hellwig wrote:
> > Hi Jens,
> > 
> > this series cleans up the blk_execute_rq* helpers.  It simplifies the
> > plugging mess a bit, fixes the sparse __bitwise warnings and simplifies
> > the blk_execute_rq_nowait API a bit.
> 
> Looks good to me, but let's do this series post flushing out the
> initial bits. It ends up depending on the passthrough changes,
> yet also conflicts with the nvme changes on the driver side.

Ok.  I'll resend after the initial merged, the buildbot also complained
about a UFS hunk.

^ permalink raw reply

* Re: [PATCH -next v2 2/2] blk-throttle: fix io hung due to configuration updates
From: yukuai (C) @ 2022-05-19  7:06 UTC (permalink / raw)
  To: Geert Uytterhoeven
  Cc: kernel test robot, Tejun Heo, Jens Axboe, Ming Lei, kbuild-all,
	cgroups, linux-block, Linux Kernel Mailing List, yi.zhang
In-Reply-To: <CAMuHMdV6NysKKh+HZ-cgHh+=SVcydmxO6ic82+t3ySTgfkoEOg@mail.gmail.com>



在 2022/05/19 15:01, Geert Uytterhoeven 写道:
> Hi Yukuai,
> 
> On Thu, May 19, 2022 at 5:25 AM yukuai (C) <yukuai3@huawei.com> wrote:
>> 在 2022/05/19 10:11, yukuai (C) 写道:
>>> 在 2022/05/18 23:52, kernel test robot 写道:
>>>> Thank you for the patch! Yet something to improve:
>>>>
>>>> [auto build test ERROR on next-20220517]
>>>>
>>>> url:
>>>> https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/bugfix-for-blk-throttle/20220518-151713
>>>>
>>>> base:    47c1c54d1bcd0a69a56b49473bc20f17b70e5242
>>>> config: m68k-allyesconfig
>>>> (https://download.01.org/0day-ci/archive/20220518/202205182347.tMOOqyfL-lkp@intel.com/config)
>>>>
>>>> compiler: m68k-linux-gcc (GCC) 11.3.0
>>>> reproduce (this is a W=1 build):
>>>>           wget
>>>> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross
>>>> -O ~/bin/make.cross
>>>>           chmod +x ~/bin/make.cross
>>>>           #
>>>> https://github.com/intel-lab-lkp/linux/commit/f8345dbaf4ed491742aab29834aff66b4930c087
>>>>
>>>>           git remote add linux-review
>>>> https://github.com/intel-lab-lkp/linux
>>>>           git fetch --no-tags linux-review
>>>> Yu-Kuai/bugfix-for-blk-throttle/20220518-151713
>>>>           git checkout f8345dbaf4ed491742aab29834aff66b4930c087
>>>>           # save the config file
>>>>           mkdir build_dir && cp config build_dir/.config
>>>>           COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0
>>>> make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash
>>>>
>>>> If you fix the issue, kindly add following tag as appropriate
>>>> Reported-by: kernel test robot <lkp@intel.com>
>>>>
>>>> All errors (new ones prefixed by >>):
>>>>
>>>>      m68k-linux-ld: block/blk-throttle.o: in function `tg_conf_updated':
>>>>>> blk-throttle.c:(.text+0x25bc): undefined reference to `__udivdi3'
>>>>>> m68k-linux-ld: blk-throttle.c:(.text+0x2626): undefined reference to
>>>>>> `__udivdi3'
>>> Hi,
>>>
>>> I'm confused here, the only place that I can relate to this:
>>>
>>>       return dispatched * new_limit / old_limit;
>>>
>>> However, I don't understand yet why this is problematic...
>>>>      `.exit.text' referenced in section `.data' of
>>>> sound/soc/codecs/tlv320adc3xxx.o: defined in discarded section
>>>> `.exit.text' of sound/soc/codecs/tlv320adc3xxx.o
>>
>> + static u64 throtl_update_bytes_disp(u64 dispatched, u64 new_limit,
>> +                                    u64 old_limit)
>> + {
>> +        if (new_limit == old_limit)
>> +                return dispatched;
>> +
>> +        if (new_limit == U64_MAX)
>> +                return 0;
>> +
>> +        return dispatched * new_limit / old_limit;
>>
>> I understand it now. I'm doing (u64 / u64), I should use div64_u64
> 
> Better, use mul_u64_u64_div_u64(), as "dispatched * new_limit"
> may overflow?
Hi,

It's right that it  can overflow, I'll handle such case in next version.
> 
>> + }
>> +
>> + static u32 throtl_update_io_disp(u32 dispatched, u32 new_limit, u32 old_limit)
>> + {
>> +        if (new_limit == old_limit)
>> +                return dispatched;
>> +
>> +        if (new_limit == UINT_MAX)
>> +                return 0;
>> +
>> +        return dispatched * new_limit / old_limit;
> 
> This is the same as above, but now operating on u32s instead of u64s.
> Likewise, can the multiplication overflow?
same as above.

Thanks,
Kuai
> 
>> + }
> 
> Gr{oetje,eeting}s,
> 
>                          Geert
> 
> --
> Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org
> 
> In personal conversations with technical people, I call myself a hacker. But
> when I'm talking to journalists I just say "programmer" or something like that.
>                                  -- Linus Torvalds
> .
> 

^ permalink raw reply

* Re: [RFC PATCH v2 1/7] statx: add I/O alignment information
From: Christoph Hellwig @ 2022-05-19  7:05 UTC (permalink / raw)
  To: Eric Biggers
  Cc: linux-fsdevel, linux-ext4, linux-f2fs-devel, linux-xfs, linux-api,
	linux-fscrypt, linux-block, linux-kernel, Keith Busch
In-Reply-To: <20220518235011.153058-2-ebiggers@kernel.org>

Looks good:

Reviewed-by: Christoph Hellwig <hch@lst.de>

^ permalink raw reply

* Re: [PATCH -next v2 2/2] blk-throttle: fix io hung due to configuration updates
From: Geert Uytterhoeven @ 2022-05-19  7:01 UTC (permalink / raw)
  To: yukuai (C)
  Cc: kernel test robot, Tejun Heo, Jens Axboe, Ming Lei, kbuild-all,
	cgroups, linux-block, Linux Kernel Mailing List, yi.zhang
In-Reply-To: <3d6878f4-1902-633d-0af2-276831364a4f@huawei.com>

Hi Yukuai,

On Thu, May 19, 2022 at 5:25 AM yukuai (C) <yukuai3@huawei.com> wrote:
> 在 2022/05/19 10:11, yukuai (C) 写道:
> > 在 2022/05/18 23:52, kernel test robot 写道:
> >> Thank you for the patch! Yet something to improve:
> >>
> >> [auto build test ERROR on next-20220517]
> >>
> >> url:
> >> https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/bugfix-for-blk-throttle/20220518-151713
> >>
> >> base:    47c1c54d1bcd0a69a56b49473bc20f17b70e5242
> >> config: m68k-allyesconfig
> >> (https://download.01.org/0day-ci/archive/20220518/202205182347.tMOOqyfL-lkp@intel.com/config)
> >>
> >> compiler: m68k-linux-gcc (GCC) 11.3.0
> >> reproduce (this is a W=1 build):
> >>          wget
> >> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross
> >> -O ~/bin/make.cross
> >>          chmod +x ~/bin/make.cross
> >>          #
> >> https://github.com/intel-lab-lkp/linux/commit/f8345dbaf4ed491742aab29834aff66b4930c087
> >>
> >>          git remote add linux-review
> >> https://github.com/intel-lab-lkp/linux
> >>          git fetch --no-tags linux-review
> >> Yu-Kuai/bugfix-for-blk-throttle/20220518-151713
> >>          git checkout f8345dbaf4ed491742aab29834aff66b4930c087
> >>          # save the config file
> >>          mkdir build_dir && cp config build_dir/.config
> >>          COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0
> >> make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash
> >>
> >> If you fix the issue, kindly add following tag as appropriate
> >> Reported-by: kernel test robot <lkp@intel.com>
> >>
> >> All errors (new ones prefixed by >>):
> >>
> >>     m68k-linux-ld: block/blk-throttle.o: in function `tg_conf_updated':
> >>>> blk-throttle.c:(.text+0x25bc): undefined reference to `__udivdi3'
> >>>> m68k-linux-ld: blk-throttle.c:(.text+0x2626): undefined reference to
> >>>> `__udivdi3'
> > Hi,
> >
> > I'm confused here, the only place that I can relate to this:
> >
> >      return dispatched * new_limit / old_limit;
> >
> > However, I don't understand yet why this is problematic...
> >>     `.exit.text' referenced in section `.data' of
> >> sound/soc/codecs/tlv320adc3xxx.o: defined in discarded section
> >> `.exit.text' of sound/soc/codecs/tlv320adc3xxx.o
>
> + static u64 throtl_update_bytes_disp(u64 dispatched, u64 new_limit,
> +                                    u64 old_limit)
> + {
> +        if (new_limit == old_limit)
> +                return dispatched;
> +
> +        if (new_limit == U64_MAX)
> +                return 0;
> +
> +        return dispatched * new_limit / old_limit;
>
> I understand it now. I'm doing (u64 / u64), I should use div64_u64

Better, use mul_u64_u64_div_u64(), as "dispatched * new_limit"
may overflow?

> + }
> +
> + static u32 throtl_update_io_disp(u32 dispatched, u32 new_limit, u32 old_limit)
> + {
> +        if (new_limit == old_limit)
> +                return dispatched;
> +
> +        if (new_limit == UINT_MAX)
> +                return 0;
> +
> +        return dispatched * new_limit / old_limit;

This is the same as above, but now operating on u32s instead of u64s.
Likewise, can the multiplication overflow?

> + }

Gr{oetje,eeting}s,

                        Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- geert@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
                                -- Linus Torvalds

^ permalink raw reply

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
From: Damien Le Moal @ 2022-05-19  6:45 UTC (permalink / raw)
  To: Keith Busch, Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche
In-Reply-To: <YoXN5CpSGGe7+OJs@kbusch-mbp.dhcp.thefacebook.com>

On 2022/05/19 6:56, Keith Busch wrote:
> On Wed, May 18, 2022 at 08:27:31PM -0700, Eric Biggers wrote:
>>
>> So the bio ends up with a total length that is a multiple of the logical block
>> size, but the lengths of the individual bvecs in the bio are *not* necessarily
>> multiples of the logical block size.  That's the problem.
> 
> I'm surely missing something here. I know the bvecs are not necessarily lbs
> aligned, but why does that matter? Is there some driver that can only take
> exactly 1 bvec, but allows it to be unaligned? If so, we could take the segment
> queue limit into account, but I am not sure that we need to.

For direct IO, the first bvec will always be aligned to a logical block size.
See __blkdev_direct_IO() and __blkdev_direct_IO_simple():

        if ((pos | iov_iter_alignment(iter)) &
            (bdev_logical_block_size(bdev) - 1))
                return -EINVAL;

And given that, all bvecs should also be LBA aligned since the LBA size is
always a divisor of the page size. Since splitting is always done on an LBA
boundary, I do not see how we can ever get bvecs that are not LBA aligned.
Or I am missing something too...

>  
>> Note, there's also lots of code that assumes that bio_vec::bv_len is a multiple
>> of 512.  
> 
> Could you point me to some examples?
> 
>> That was implied by it being a multiple of the logical block size.  But
>> the DMA alignment can be much lower, like 8 bytes (see nvme_set_queue_limits()).
> 
> That's the driver this was tested on, though I just changed it to 4 bytes for
> 5.19.


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply

* Re: [RFC: kdevops] Standardizing on failure rate nomenclature for expunges
From: Amir Goldstein @ 2022-05-19  6:36 UTC (permalink / raw)
  To: Luis Chamberlain
  Cc: linux-fsdevel, linux-block, pankydev8, Theodore Tso, Josef Bacik,
	jmeneghi, Jan Kara, Davidlohr Bueso, Dan Williams, Jake Edge,
	Klaus Jensen, Zorro Lang, fstests
In-Reply-To: <YoW0ZC+zM27Pi0Us@bombadil.infradead.org>

[adding fstests and Zorro]

On Thu, May 19, 2022 at 6:07 AM Luis Chamberlain <mcgrof@kernel.org> wrote:
>
> I've been promoting the idea that running fstests once is nice,
> but things get interesting if you try to run fstests multiple
> times until a failure is found. It turns out at least kdevops has
> found tests which fail with a failure rate of typically 1/2 to
> 1/30 average failure rate. That is 1/2 means a failure can happen
> 50% of the time, whereas 1/30 means it takes 30 runs to find the
> failure.
>
> I have tried my best to annotate failure rates when I know what
> they might be on the test expunge list, as an example:
>
> workflows/fstests/expunges/5.17.0-rc7/xfs/unassigned/xfs_reflink.txt:generic/530 # failure rate about 1/15 https://gist.github.com/mcgrof/4129074db592c170e6bf748aa11d783d
>
> The term "failure rate 1/15" is 16 characters long, so I'd like
> to propose to standardize a way to represent this. How about
>
> generic/530 # F:1/15
>

I am not fond of the 1/15 annotation at all, because the only fact that you
are able to document is that the test failed after 15 runs.
Suggesting that this means failure rate of 1/15 is a very big step.

> Then we could extend the definition. F being current estimate, and this
> can be just how long it took to find the first failure. A more valuable
> figure would be failure rate avarage, so running the test multiple
> times, say 10, to see what the failure rate is and then averaging the
> failure out. So this could be a more accurate representation. For this
> how about:
>
> generic/530 # FA:1/15
>
> This would mean on average there failure rate has been found to be about
> 1/15, and this was determined based on 10 runs.
>
> We should also go extend check for fstests/blktests to run a test
> until a failure is found and report back the number of successes.
>
> Thoughts?
>

I have had a discussion about those tests with Zorro.

Those tests that some people refer to as "flaky" are valuable,
but they are not deterministic, they are stochastic.

I think MTBF is the standard way to describe reliability
of such tests, but I am having a hard time imagining how
the community can manage to document accurate annotations
of this sort, so I would stick with documenting the facts
(i.e. the test fails after N runs).

OTOH, we do have deterministic tests, maybe even the majority of
fstests are deterministic(?)

Considering that every auto test loop takes ~2 hours on our rig and that
I have been running over 100 loops over the past two weeks, if half
of fstests are deterministic, that is a lot of wait time and a lot of carbon
emission gone to waste.

It would have been nice if I was able to exclude a "deterministic" group.
The problem is - can a developer ever tag a test as being "deterministic"?

Thanks,
Amir.

^ permalink raw reply

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
From: Keith Busch @ 2022-05-19  4:56 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal
In-Reply-To: <YoW5Iy+Vbk4Rv3zT@sol.localdomain>

On Wed, May 18, 2022 at 08:27:31PM -0700, Eric Biggers wrote:
> 
> So the bio ends up with a total length that is a multiple of the logical block
> size, but the lengths of the individual bvecs in the bio are *not* necessarily
> multiples of the logical block size.  That's the problem.

I'm surely missing something here. I know the bvecs are not necessarily lbs
aligned, but why does that matter? Is there some driver that can only take
exactly 1 bvec, but allows it to be unaligned? If so, we could take the segment
queue limit into account, but I am not sure that we need to.

> Note, there's also lots of code that assumes that bio_vec::bv_len is a multiple
> of 512.  

Could you point me to some examples?

> That was implied by it being a multiple of the logical block size.  But
> the DMA alignment can be much lower, like 8 bytes (see nvme_set_queue_limits()).

That's the driver this was tested on, though I just changed it to 4 bytes for
5.19.

^ permalink raw reply

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
From: Bart Van Assche @ 2022-05-19  4:40 UTC (permalink / raw)
  To: Eric Biggers, Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	damien.lemoal
In-Reply-To: <YoW5Iy+Vbk4Rv3zT@sol.localdomain>

On 5/19/22 05:27, Eric Biggers wrote:
> But the DMA alignment can be much lower, like 8 bytes (see nvme_set_queue_limits()).

There are block drivers that support byte alignment of block layer data 
buffers. From the iSCSI over TCP driver:

	blk_queue_dma_alignment(sdev->request_queue, 0);

Thanks,

Bart.

^ permalink raw reply

* Re: [PATCHv2 2/3] block: export dma_alignment attribute
From: Bart Van Assche @ 2022-05-19  4:30 UTC (permalink / raw)
  To: Keith Busch, linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, damien.lemoal, Keith Busch
In-Reply-To: <20220518171131.3525293-3-kbusch@fb.com>

On 5/18/22 19:11, Keith Busch wrote:
> From: Keith Busch <kbusch@kernel.org>
> 
> User space may want to know how to align their buffers to avoid
> bouncing. Export the queue attribute.
> 
> Signed-off-by: Keith Busch <kbusch@kernel.org>
> ---
>   block/blk-sysfs.c | 7 +++++++
>   1 file changed, 7 insertions(+)
> 
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index 88bd41d4cb59..14607565d781 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -274,6 +274,11 @@ static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page
>   	return queue_var_show(q->limits.virt_boundary_mask, page);
>   }
>   
> +static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page)
> +{
> +	return queue_var_show(queue_dma_alignment(q), page);
> +}
> +
>   #define QUEUE_SYSFS_BIT_FNS(name, flag, neg)				\
>   static ssize_t								\
>   queue_##name##_show(struct request_queue *q, char *page)		\
> @@ -606,6 +611,7 @@ QUEUE_RO_ENTRY(queue_dax, "dax");
>   QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
>   QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
>   QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
> +QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
>   
>   #ifdef CONFIG_BLK_DEV_THROTTLING_LOW
>   QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
> @@ -667,6 +673,7 @@ static struct attribute *queue_attrs[] = {
>   	&blk_throtl_sample_time_entry.attr,
>   #endif
>   	&queue_virt_boundary_mask_entry.attr,
> +	&queue_dma_alignment_entry.attr,
>   	NULL,
>   };

Please add an entry for the new sysfs attribute in 
Documentation/ABI/stable/sysfs-block.

Thanks,

Bart.

^ permalink raw reply

* Re: [PATCHv2 1/3] block/bio: remove duplicate append pages code
From: Bart Van Assche @ 2022-05-19  4:28 UTC (permalink / raw)
  To: Keith Busch, linux-fsdevel, linux-block
  Cc: axboe, Kernel Team, hch, damien.lemoal, Keith Busch
In-Reply-To: <20220518171131.3525293-2-kbusch@fb.com>

On 5/18/22 19:11, Keith Busch wrote:
> +	for (left = size, i = 0; left > 0; left -= len, i++) {
> +		struct page *page = pages[i];
> +		bool same_page = false;
> +
> +		len = min_t(size_t, PAGE_SIZE - offset, left);
> +		if (bio_add_hw_page(q, bio, page, len, offset,
> +				max_append_sectors, &same_page) != len) {
> +			bio_put_pages(pages + i, left, offset);
> +			ret = -EINVAL;
> +			break;
> +		}
> +		if (same_page)
> +			put_page(page);
> +		offset = 0;
> +	}

Consider renaming 'same_page' into 'merged'. I think that name reflects 
much better the purpose of that variable.

Thanks,

Bart.

^ permalink raw reply

* Re: [PATCH v4 07/13] btrfs: zoned: use generic btrfs zone helpers to support npo2 zoned devices
From: Naohiro Aota @ 2022-05-19  4:13 UTC (permalink / raw)
  To: Pankaj Raghav
  Cc: axboe@kernel.dk, damien.lemoal@opensource.wdc.com,
	pankydev8@gmail.com, dsterba@suse.com, hch@lst.de,
	linux-nvme@lists.infradead.org, linux-fsdevel@vger.kernel.org,
	linux-btrfs@vger.kernel.org, jiangbo.365@bytedance.com,
	linux-block@vger.kernel.org, gost.dev@samsung.com,
	linux-kernel@vger.kernel.org, dm-devel@redhat.com,
	Luis Chamberlain
In-Reply-To: <20220516165416.171196-8-p.raghav@samsung.com>

On Mon, May 16, 2022 at 06:54:10PM +0200, Pankaj Raghav wrote:
> Add helpers to calculate alignment, round up and round down
> for zoned devices. These helpers encapsulates the necessary handling for
> power_of_2 and non-power_of_2 zone sizes. Optimized calculations are
> performed for zone sizes that are power_of_2 with log and shifts.
> 
> btrfs_zoned_is_aligned() is added instead of reusing bdev_zone_aligned()
> helper due to some use cases in btrfs where zone alignment is checked
> before having access to the underlying block device such as in this
> function: btrfs_load_block_group_zone_info().
> 
> Use the generic btrfs zone helpers to calculate zone index, check zone
> alignment, round up and round down operations.
> 
> The zone_size_shift field is not needed anymore as generic helpers are
> used for calculation.
> 
> Reviewed-by: Luis Chamberlain <mcgrof@kernel.org>
> Signed-off-by: Pankaj Raghav <p.raghav@samsung.com>
> ---
>  fs/btrfs/volumes.c | 24 +++++++++-------
>  fs/btrfs/zoned.c   | 72 ++++++++++++++++++++++------------------------
>  fs/btrfs/zoned.h   | 43 +++++++++++++++++++++++----
>  3 files changed, 85 insertions(+), 54 deletions(-)
> 
><snip>
> diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
> index 694ab6d1e..b94ce4d1f 100644
> --- a/fs/btrfs/zoned.h
> +++ b/fs/btrfs/zoned.h
> @@ -9,6 +9,7 @@
>  #include "disk-io.h"
>  #include "block-group.h"
>  #include "btrfs_inode.h"
> +#include "misc.h"
>  
>  #define BTRFS_DEFAULT_RECLAIM_THRESH           			(75)
>  
> @@ -18,7 +19,6 @@ struct btrfs_zoned_device_info {
>  	 * zoned block device.
>  	 */
>  	u64 zone_size;
> -	u8  zone_size_shift;
>  	u32 nr_zones;
>  	unsigned int max_active_zones;
>  	atomic_t active_zones_left;
> @@ -30,6 +30,36 @@ struct btrfs_zoned_device_info {
>  	u32 sb_zone_location[BTRFS_SUPER_MIRROR_MAX];
>  };
>  
> +static inline bool btrfs_zoned_is_aligned(u64 pos, u64 zone_size)
> +{
> +	u64 remainder = 0;
> +
> +	if (is_power_of_two_u64(zone_size))
> +		return IS_ALIGNED(pos, zone_size);
> +
> +	div64_u64_rem(pos, zone_size, &remainder);
> +	return remainder == 0;
> +}
> +
> +static inline u64 btrfs_zoned_roundup(u64 pos, u64 zone_size)
> +{
> +	if (is_power_of_two_u64(zone_size))
> +		return ALIGN(pos, zone_size);
> +
> +	return div64_u64(pos + zone_size - 1, zone_size) * zone_size;
> +}
> +
> +static inline u64 btrfs_zoned_rounddown(u64 pos, u64 zone_size)
> +{
> +	u64 remainder = 0;
> +	if (is_power_of_two_u64(zone_size))
> +		return round_down(pos, zone_size);
> +
> +	div64_u64_rem(pos, zone_size, &remainder);
> +	pos -= remainder;
> +	return pos;
> +}
> +

This is just a preference, but how about naming these helpers not related
to "zoned"? While they take "zone_size" as an argument, it does not do
anything special on zoned things. For my preference, I would take
btrfs_device or btrfs_zoned_device_info for "btrfs_zoned_*" function.

Actually, I was a bit confused seeing the part
below. btrfs_zoned_is_aligned() takes "sector_t" values while the arguments
are often byte granularity address. Yeah, actually sector_t == u64 and the
code does not relies on the unit ... so, it is OK as long as the values are
in the same unit.

    @@ -409,9 +409,8 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
            }
    
            nr_sectors = bdev_nr_sectors(bdev);
    -       zone_info->zone_size_shift = ilog2(zone_info->zone_size);
    -       zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors);
    -       if (!IS_ALIGNED(nr_sectors, zone_sectors))
    +       zone_info->nr_zones = bdev_zone_no(bdev, nr_sectors);
    +       if (!btrfs_zoned_is_aligned(nr_sectors, zone_sectors))
                    zone_info->nr_zones++;
    
            max_active_zones = bdev_max_active_zones(bdev);

BTW, aren't these helpers used in other subsystems? Maybe adding these in
include/linux/math64.h or so?

^ permalink raw reply

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
From: Eric Biggers @ 2022-05-19  3:27 UTC (permalink / raw)
  To: Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal
In-Reply-To: <YoWqlqIzBcYGkcnu@kbusch-mbp.dhcp.thefacebook.com>

On Wed, May 18, 2022 at 08:25:26PM -0600, Keith Busch wrote:
> On Wed, May 18, 2022 at 07:08:11PM -0700, Eric Biggers wrote:
> > On Wed, May 18, 2022 at 07:59:36PM -0600, Keith Busch wrote:
> > > I'm aware that spanning pages can cause bad splits on the bi_max_vecs
> > > condition, but I believe it's well handled here. Unless I'm terribly confused,
> > > which is certainly possible, I think you may have missed this part of the
> > > patch:
> > > 
> > > @@ -1223,6 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> > >  	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
> > > 
> > >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > > +	if (size > 0)
> > > +		size = ALIGN_DOWN(size, queue_logical_block_size(q));
> > >  	if (unlikely(size <= 0))
> > >  		return size ? size : -EFAULT;
> > > 
> > 
> > That makes the total length of each "batch" of pages be a multiple of the
> > logical block size, but individual logical blocks within that batch can still be
> > divided into multiple bvecs in the loop just below it:
> 
> I understand that, but the existing code conservatively assumes all pages are
> physically discontiguous and wouldn't have requested more pages if it didn't
> have enough bvecs for each of them:
> 
> 	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
> 
> So with the segment alignment guarantee, and ensured available bvec space, the
> created bio will always be a logical block size multiple.
> 
> If we need to split it later due to some other constraint, we'll only split on
> a logical block size, even if its in the middle of a bvec.
> 

So the bio ends up with a total length that is a multiple of the logical block
size, but the lengths of the individual bvecs in the bio are *not* necessarily
multiples of the logical block size.  That's the problem.

Note, there's also lots of code that assumes that bio_vec::bv_len is a multiple
of 512.  That was implied by it being a multiple of the logical block size.  But
the DMA alignment can be much lower, like 8 bytes (see nvme_set_queue_limits()).

- Eric

^ permalink raw reply

* Re: [dm-devel] [PATCH v4 00/13] support non power of 2 zoned devices
From: Damien Le Moal @ 2022-05-19  3:19 UTC (permalink / raw)
  To: Luis Chamberlain
  Cc: Theodore Ts'o, Christoph Hellwig, Pankaj Raghav, axboe,
	pankydev8, gost.dev, jiangbo.365, linux-nvme, linux-kernel,
	linux-block, linux-fsdevel, dm-devel, dsterba, linux-btrfs
In-Reply-To: <20220519031237.sw45lvzrydrm7fpb@garbanzo>

On 5/19/22 12:12, Luis Chamberlain wrote:
> On Thu, May 19, 2022 at 12:08:26PM +0900, Damien Le Moal wrote:
>> On 5/18/22 00:34, Theodore Ts'o wrote:
>>> On Tue, May 17, 2022 at 10:10:48AM +0200, Christoph Hellwig wrote:
>>>> I'm a little surprised about all this activity.
>>>>
>>>> I though the conclusion at LSF/MM was that for Linux itself there
>>>> is very little benefit in supporting this scheme.  It will massively
>>>> fragment the supported based of devices and applications, while only
>>>> having the benefit of supporting some Samsung legacy devices.
>>>
>>> FWIW,
>>>
>>> That wasn't my impression from that LSF/MM session, but once the
>>> videos become available, folks can decide for themselves.
>>
>> There was no real discussion about zone size constraint on the zone
>> storage BoF. Many discussions happened in the hallway track though.
> 
> Right so no direct clear blockers mentioned at all during the BoF.

Nor any clear OK.

> 
>   Luis


-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply

* Re: [dm-devel] [PATCH v4 00/13] support non power of 2 zoned devices
From: Luis Chamberlain @ 2022-05-19  3:12 UTC (permalink / raw)
  To: Damien Le Moal
  Cc: Theodore Ts'o, Christoph Hellwig, Pankaj Raghav, axboe,
	pankydev8, gost.dev, jiangbo.365, linux-nvme, linux-kernel,
	linux-block, linux-fsdevel, dm-devel, dsterba, linux-btrfs
In-Reply-To: <7f9cb19b-621b-75ea-7273-2d2769237851@opensource.wdc.com>

On Thu, May 19, 2022 at 12:08:26PM +0900, Damien Le Moal wrote:
> On 5/18/22 00:34, Theodore Ts'o wrote:
> > On Tue, May 17, 2022 at 10:10:48AM +0200, Christoph Hellwig wrote:
> >> I'm a little surprised about all this activity.
> >>
> >> I though the conclusion at LSF/MM was that for Linux itself there
> >> is very little benefit in supporting this scheme.  It will massively
> >> fragment the supported based of devices and applications, while only
> >> having the benefit of supporting some Samsung legacy devices.
> > 
> > FWIW,
> > 
> > That wasn't my impression from that LSF/MM session, but once the
> > videos become available, folks can decide for themselves.
> 
> There was no real discussion about zone size constraint on the zone
> storage BoF. Many discussions happened in the hallway track though.

Right so no direct clear blockers mentioned at all during the BoF.

  Luis

^ permalink raw reply

* Re: [dm-devel] [PATCH v4 00/13] support non power of 2 zoned devices
From: Damien Le Moal @ 2022-05-19  3:08 UTC (permalink / raw)
  To: Theodore Ts'o, Christoph Hellwig
  Cc: Pankaj Raghav, axboe, pankydev8, gost.dev, jiangbo.365,
	linux-nvme, linux-kernel, linux-block, linux-fsdevel, dm-devel,
	dsterba, linux-btrfs
In-Reply-To: <YoPAnj9ufkt5nh1G@mit.edu>

On 5/18/22 00:34, Theodore Ts'o wrote:
> On Tue, May 17, 2022 at 10:10:48AM +0200, Christoph Hellwig wrote:
>> I'm a little surprised about all this activity.
>>
>> I though the conclusion at LSF/MM was that for Linux itself there
>> is very little benefit in supporting this scheme.  It will massively
>> fragment the supported based of devices and applications, while only
>> having the benefit of supporting some Samsung legacy devices.
> 
> FWIW,
> 
> That wasn't my impression from that LSF/MM session, but once the
> videos become available, folks can decide for themselves.

There was no real discussion about zone size constraint on the zone
storage BoF. Many discussions happened in the hallway track though.

-- 
Damien Le Moal
Western Digital Research

^ permalink raw reply

* [RFC: kdevops] Standardizing on failure rate nomenclature for expunges
From: Luis Chamberlain @ 2022-05-19  3:07 UTC (permalink / raw)
  To: linux-fsdevel, linux-block
  Cc: amir73il, pankydev8, tytso, josef, jmeneghi, Jan Kara,
	Davidlohr Bueso, Dan Williams, Jake Edge, Klaus Jensen

I've been promoting the idea that running fstests once is nice,
but things get interesting if you try to run fstests multiple
times until a failure is found. It turns out at least kdevops has
found tests which fail with a failure rate of typically 1/2 to
1/30 average failure rate. That is 1/2 means a failure can happen
50% of the time, whereas 1/30 means it takes 30 runs to find the
failure.

I have tried my best to annotate failure rates when I know what
they might be on the test expunge list, as an example:

workflows/fstests/expunges/5.17.0-rc7/xfs/unassigned/xfs_reflink.txt:generic/530 # failure rate about 1/15 https://gist.github.com/mcgrof/4129074db592c170e6bf748aa11d783d

The term "failure rate 1/15" is 16 characters long, so I'd like
to propose to standardize a way to represent this. How about

generic/530 # F:1/15

Then we could extend the definition. F being current estimate, and this
can be just how long it took to find the first failure. A more valuable
figure would be failure rate avarage, so running the test multiple
times, say 10, to see what the failure rate is and then averaging the
failure out. So this could be a more accurate representation. For this
how about:

generic/530 # FA:1/15

This would mean on average there failure rate has been found to be about
1/15, and this was determined based on 10 runs.

We should also go extend check for fstests/blktests to run a test
until a failure is found and report back the number of successes.

Thoughts?

Note: yes failure rates lower than 1/100 do exist but they are rare
creatures. I love them though as my experience shows so far that they
uncover hidden bones in the closet, and they they make take months and
a lot of eyeballs to resolve.

  Luis

^ permalink raw reply

* Re: [PATCH -next v2 2/2] blk-throttle: fix io hung due to configuration updates
From: yukuai (C) @ 2022-05-19  3:01 UTC (permalink / raw)
  To: kernel test robot, tj, axboe, ming.lei
  Cc: kbuild-all, cgroups, linux-block, linux-kernel, yi.zhang
In-Reply-To: <84fe296e-6e56-3ca9-73a8-357beb675c6e@huawei.com>

在 2022/05/19 10:11, yukuai (C) 写道:
> 
> 
> 在 2022/05/18 23:52, kernel test robot 写道:
>> Hi Yu,
>>
>> Thank you for the patch! Yet something to improve:
>>
>> [auto build test ERROR on next-20220517]
>>
>> url:    
>> https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/bugfix-for-blk-throttle/20220518-151713 
>>
>> base:    47c1c54d1bcd0a69a56b49473bc20f17b70e5242
>> config: m68k-allyesconfig 
>> (https://download.01.org/0day-ci/archive/20220518/202205182347.tMOOqyfL-lkp@intel.com/config) 
>>
>> compiler: m68k-linux-gcc (GCC) 11.3.0
>> reproduce (this is a W=1 build):
>>          wget 
>> https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross 
>> -O ~/bin/make.cross
>>          chmod +x ~/bin/make.cross
>>          # 
>> https://github.com/intel-lab-lkp/linux/commit/f8345dbaf4ed491742aab29834aff66b4930c087 
>>
>>          git remote add linux-review 
>> https://github.com/intel-lab-lkp/linux
>>          git fetch --no-tags linux-review 
>> Yu-Kuai/bugfix-for-blk-throttle/20220518-151713
>>          git checkout f8345dbaf4ed491742aab29834aff66b4930c087
>>          # save the config file
>>          mkdir build_dir && cp config build_dir/.config
>>          COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 
>> make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash
>>
>> If you fix the issue, kindly add following tag as appropriate
>> Reported-by: kernel test robot <lkp@intel.com>
>>
>> All errors (new ones prefixed by >>):
>>
>>     m68k-linux-ld: block/blk-throttle.o: in function `tg_conf_updated':
>>>> blk-throttle.c:(.text+0x25bc): undefined reference to `__udivdi3'
>>>> m68k-linux-ld: blk-throttle.c:(.text+0x2626): undefined reference to 
>>>> `__udivdi3'
> Hi,
> 
> I'm confused here, the only place that I can relate to this:
> 
>      return dispatched * new_limit / old_limit;
> 
I understand it now. I'm doing (u64 / u64), I should use div64_u64
> However, I don't understand yet why this is problematic...
>>     `.exit.text' referenced in section `.data' of 
>> sound/soc/codecs/tlv320adc3xxx.o: defined in discarded section 
>> `.exit.text' of sound/soc/codecs/tlv320adc3xxx.o
>>

^ permalink raw reply

* Re: [PATCH V2 0/1] ubd: add io_uring based userspace block driver
From: Ming Lei @ 2022-05-19  2:42 UTC (permalink / raw)
  To: Stefan Hajnoczi
  Cc: Jens Axboe, linux-block, linux-kernel, Harris James R, io-uring,
	Gabriel Krisman Bertazi, ZiyangZhang, Xiaoguang Wang, ming.lei
In-Reply-To: <YoUVb8CeWRIErJBY@stefanha-x1.localdomain>

On Wed, May 18, 2022 at 04:49:03PM +0100, Stefan Hajnoczi wrote:
> On Wed, May 18, 2022 at 08:53:54PM +0800, Ming Lei wrote:
> > On Wed, May 18, 2022 at 11:45:32AM +0100, Stefan Hajnoczi wrote:
> > > On Wed, May 18, 2022 at 03:09:46PM +0800, Ming Lei wrote:
> > > > On Tue, May 17, 2022 at 03:06:34PM +0100, Stefan Hajnoczi wrote:
> > > > > Here are some more thoughts on the ubd-control device:
> > > > > 
> > > > > The current patch provides a ubd-control device for processes with
> > > > > suitable permissions (i.e. root) to create, start, stop, and fetch
> > > > > information about devices.
> > > > > 
> > > > > There is no isolation between devices created by one process and those
> > > > 
> > > > I understand linux hasn't device namespace yet, so can you share the
> > > > rational behind the idea of device isolation, is it because ubd device
> > > > is served by ubd daemon which belongs to one pid NS? Or the user creating
> > > > /dev/ubdbN belongs to one user NS?
> > > 
> > > With the current model a process with access to ubd-control has control
> > > over all ubd devices. This is not desirable for most container use cases
> > > because ubd-control usage within a container means that container could
> > > stop any ubd device on the system.
> > > 
> > > Even for non-container use cases it's problematic that two applications
> > > that use ubd can interfere with each other. If an application passes the
> > > wrong device ID they can stop the other application's device, for
> > > example.
> > > 
> > > I think it's worth supporting a model where there are multiple ubd
> > > daemons that are not cooperating/aware of each other. They should be
> > > isolated from each other.
> > 
> > Maybe I didn't mention it clearly, I meant the following model in last email:
> > 
> > 1) every user can send UBD_CMD_ADD_DEV to /dev/ubd-control
> > 
> > 2) the created /dev/ubdcN & /dev/udcbN are owned by the user who creates
> > it
> 
> How does this work? Does userspace (udev) somehow get the uid/gid from
> the uevent so it can set the device node permissions?

We can let 'ubd list' export the owner info, then udev may override the default
owner with exported info.

Or it can be done inside devtmpfs_create_node() by passing ubd's uid/gid
at default.

For /dev/ubdcN, I think it is safe, since the driver is only
communicating with the userspace daemon, and both belong to same owner.
Also ubd driver is simple enough to get full audited.

For /dev/ubdbN, even though FS isn't allowed to mount, there is still
lots of kernel code path involved, and some code path may not be run
with unprivileged user before, that needs careful audit.

So the biggest problem is if it is safe to export block disk to unprivileged
user, and that is the one which can't be bypassed for any approach.




Thanks, 
Ming


^ permalink raw reply

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
From: Keith Busch @ 2022-05-19  2:25 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal
In-Reply-To: <YoWmi0mvoIk3CfQN@sol.localdomain>

On Wed, May 18, 2022 at 07:08:11PM -0700, Eric Biggers wrote:
> On Wed, May 18, 2022 at 07:59:36PM -0600, Keith Busch wrote:
> > I'm aware that spanning pages can cause bad splits on the bi_max_vecs
> > condition, but I believe it's well handled here. Unless I'm terribly confused,
> > which is certainly possible, I think you may have missed this part of the
> > patch:
> > 
> > @@ -1223,6 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
> >  	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
> > 
> >  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> > +	if (size > 0)
> > +		size = ALIGN_DOWN(size, queue_logical_block_size(q));
> >  	if (unlikely(size <= 0))
> >  		return size ? size : -EFAULT;
> > 
> 
> That makes the total length of each "batch" of pages be a multiple of the
> logical block size, but individual logical blocks within that batch can still be
> divided into multiple bvecs in the loop just below it:

I understand that, but the existing code conservatively assumes all pages are
physically discontiguous and wouldn't have requested more pages if it didn't
have enough bvecs for each of them:

	unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;

So with the segment alignment guarantee, and ensured available bvec space, the
created bio will always be a logical block size multiple.

If we need to split it later due to some other constraint, we'll only split on
a logical block size, even if its in the middle of a bvec.

> 	for (left = size, i = 0; left > 0; left -= len, i++) {
> 		struct page *page = pages[i];
> 
> 		len = min_t(size_t, PAGE_SIZE - offset, left);
> 
> 		if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
> 			if (same_page)
> 				put_page(page);
> 		} else {
> 			if (WARN_ON_ONCE(bio_full(bio, len))) {
> 				bio_put_pages(pages + i, left, offset);
> 				return -EINVAL;
> 			}
> 			__bio_add_page(bio, page, len, offset);
> 		}
> 		offset = 0;
> 	}
> 
> - Eric

^ permalink raw reply

* Re: [PATCH -next v2 2/2] blk-throttle: fix io hung due to configuration updates
From: yukuai (C) @ 2022-05-19  2:11 UTC (permalink / raw)
  To: kernel test robot, tj, axboe, ming.lei
  Cc: kbuild-all, cgroups, linux-block, linux-kernel, yi.zhang
In-Reply-To: <202205182347.tMOOqyfL-lkp@intel.com>



在 2022/05/18 23:52, kernel test robot 写道:
> Hi Yu,
> 
> Thank you for the patch! Yet something to improve:
> 
> [auto build test ERROR on next-20220517]
> 
> url:    https://github.com/intel-lab-lkp/linux/commits/Yu-Kuai/bugfix-for-blk-throttle/20220518-151713
> base:    47c1c54d1bcd0a69a56b49473bc20f17b70e5242
> config: m68k-allyesconfig (https://download.01.org/0day-ci/archive/20220518/202205182347.tMOOqyfL-lkp@intel.com/config)
> compiler: m68k-linux-gcc (GCC) 11.3.0
> reproduce (this is a W=1 build):
>          wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>          chmod +x ~/bin/make.cross
>          # https://github.com/intel-lab-lkp/linux/commit/f8345dbaf4ed491742aab29834aff66b4930c087
>          git remote add linux-review https://github.com/intel-lab-lkp/linux
>          git fetch --no-tags linux-review Yu-Kuai/bugfix-for-blk-throttle/20220518-151713
>          git checkout f8345dbaf4ed491742aab29834aff66b4930c087
>          # save the config file
>          mkdir build_dir && cp config build_dir/.config
>          COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.3.0 make.cross W=1 O=build_dir ARCH=m68k SHELL=/bin/bash
> 
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot <lkp@intel.com>
> 
> All errors (new ones prefixed by >>):
> 
>     m68k-linux-ld: block/blk-throttle.o: in function `tg_conf_updated':
>>> blk-throttle.c:(.text+0x25bc): undefined reference to `__udivdi3'
>>> m68k-linux-ld: blk-throttle.c:(.text+0x2626): undefined reference to `__udivdi3'
Hi,

I'm confused here, the only place that I can relate to this:

	return dispatched * new_limit / old_limit;

However, I don't understand yet why this is problematic...
>     `.exit.text' referenced in section `.data' of sound/soc/codecs/tlv320adc3xxx.o: defined in discarded section `.exit.text' of sound/soc/codecs/tlv320adc3xxx.o
> 

^ permalink raw reply

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
From: Eric Biggers @ 2022-05-19  2:08 UTC (permalink / raw)
  To: Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal
In-Reply-To: <YoWkiCdduzyQxHR+@kbusch-mbp.dhcp.thefacebook.com>

On Wed, May 18, 2022 at 07:59:36PM -0600, Keith Busch wrote:
> On Wed, May 18, 2022 at 06:53:11PM -0700, Eric Biggers wrote:
> > On Wed, May 18, 2022 at 07:00:39PM -0600, Keith Busch wrote:
> > > On Wed, May 18, 2022 at 05:14:49PM -0700, Eric Biggers wrote:
> > > > On Wed, May 18, 2022 at 10:11:31AM -0700, Keith Busch wrote:
> > > > > diff --git a/block/fops.c b/block/fops.c
> > > > > index b9b83030e0df..d8537c29602f 100644
> > > > > --- a/block/fops.c
> > > > > +++ b/block/fops.c
> > > > > @@ -54,8 +54,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
> > > > >  	struct bio bio;
> > > > >  	ssize_t ret;
> > > > >  
> > > > > -	if ((pos | iov_iter_alignment(iter)) &
> > > > > -	    (bdev_logical_block_size(bdev) - 1))
> > > > > +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> > > > > +		return -EINVAL;
> > > > > +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
> > > > >  		return -EINVAL;
> > > > 
> > > > The block layer makes a lot of assumptions that bios can be split at any bvec
> > > > boundary.  With this patch, bios whose length isn't a multiple of the logical
> > > > block size can be generated by splitting, which isn't valid.
> > > 
> > > How? This patch ensures every segment is block size aligned.
> > 
> > No, it doesn't.  It ensures that the *total* length of each bio is logical block
> > size aligned.  It doesn't ensure that for the individual bvecs.  By decreasing
> > the required memory alignment to below the logical block size, you're allowing
> > logical blocks to span a page boundary.  Whenever the two pages involved aren't
> > physically contiguous, the data of the block will be split across two bvecs.
> 
> I'm aware that spanning pages can cause bad splits on the bi_max_vecs
> condition, but I believe it's well handled here. Unless I'm terribly confused,
> which is certainly possible, I think you may have missed this part of the
> patch:
> 
> @@ -1223,6 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
>  	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
> 
>  	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
> +	if (size > 0)
> +		size = ALIGN_DOWN(size, queue_logical_block_size(q));
>  	if (unlikely(size <= 0))
>  		return size ? size : -EFAULT;
> 

That makes the total length of each "batch" of pages be a multiple of the
logical block size, but individual logical blocks within that batch can still be
divided into multiple bvecs in the loop just below it:

	for (left = size, i = 0; left > 0; left -= len, i++) {
		struct page *page = pages[i];

		len = min_t(size_t, PAGE_SIZE - offset, left);

		if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
			if (same_page)
				put_page(page);
		} else {
			if (WARN_ON_ONCE(bio_full(bio, len))) {
				bio_put_pages(pages + i, left, offset);
				return -EINVAL;
			}
			__bio_add_page(bio, page, len, offset);
		}
		offset = 0;
	}

- Eric

^ permalink raw reply

* Re: [PATCHv2 0/3] direct io alignment relax
From: Eric Biggers @ 2022-05-19  2:02 UTC (permalink / raw)
  To: Chaitanya Kulkarni
  Cc: Keith Busch, Keith Busch, linux-fsdevel@vger.kernel.org,
	linux-block@vger.kernel.org, axboe@kernel.dk, Kernel Team,
	hch@lst.de, bvanassche@acm.org, damien.lemoal@opensource.wdc.com
In-Reply-To: <f42c74b9-67fa-50fc-d97e-2a7f153f10e4@nvidia.com>

On Thu, May 19, 2022 at 01:02:42AM +0000, Chaitanya Kulkarni wrote:
> On 5/18/22 17:51, Keith Busch wrote:
> > On Wed, May 18, 2022 at 11:26:20PM +0000, Eric Biggers wrote:
> >> On Wed, May 18, 2022 at 10:11:28AM -0700, Keith Busch wrote:
> >>> From: Keith Busch <kbusch@kernel.org>
> >>>
> >>> Including the fs list this time.
> >>>
> >>> I am still working on a better interface to report the dio alignment to
> >>> an application. The most recent suggestion of using statx is proving to
> >>> be less straight forward than I thought, but I don't want to hold this
> >>> series up for that.
> >>>
> >>
> >> Note that I already implemented the statx support and sent it out for review:
> >> https://lore.kernel.org/linux-fsdevel/20220211061158.227688-1-ebiggers@kernel.org/T/#u
> >> However, the patch series only received one comment.  I can send it out again if
> >> people have become interested in it again...
> > 
> > Thanks, I didn't see that the first time around, but I'll be sure to look at
> > your new version. It sounds like you encountered the same problem I did
> > regarding block device handles: the devtmpfs inodes for the raw block device
> > handles are not the bdev inodes. I do think it's useful the alignment
> > attributes are accessible through the block device files, though.
> 
> Irrespective of above problem, as per my review comment [1] on the
> initial version of Eric's series I really want to see the generic
> interface that can accommodate exposing optimal values for different
> operations REQ_OP_DISCARD/REQ_OP_WRITE_ZEROES/REQ_OP_VERIFY etc.
> and not only for read/write.
> 
> -ck
> 
> https://lore.kernel.org/linux-fsdevel/20220211061158.227688-1-ebiggers@kernel.org/T/#r3ffe9183c372fb97a9753e286f9cf6400e8ec272

Userspace doesn't know what REQ_OP_* are; they are internal implementation
details of the block layer.  What UAPIs, specifically, do you have in mind for
wanting to know the alignment requirements of?

If you're thinking about about BLKDISCARD and BLKZEROOUT, those are block device
ioctls, so statx() doesn't seem like a great fit for them.  There is already a
place to expose block device properties to userspace (/sys/block/$dev/).  Direct
I/O is different because direct I/O is not just a feature of block devices but
also of regular files, and it is affected by filesystem-level constraints.

- Eric

^ permalink raw reply

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
From: Keith Busch @ 2022-05-19  1:59 UTC (permalink / raw)
  To: Eric Biggers
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal
In-Reply-To: <YoWjBxmKDQC1mCIz@sol.localdomain>

On Wed, May 18, 2022 at 06:53:11PM -0700, Eric Biggers wrote:
> On Wed, May 18, 2022 at 07:00:39PM -0600, Keith Busch wrote:
> > On Wed, May 18, 2022 at 05:14:49PM -0700, Eric Biggers wrote:
> > > On Wed, May 18, 2022 at 10:11:31AM -0700, Keith Busch wrote:
> > > > diff --git a/block/fops.c b/block/fops.c
> > > > index b9b83030e0df..d8537c29602f 100644
> > > > --- a/block/fops.c
> > > > +++ b/block/fops.c
> > > > @@ -54,8 +54,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
> > > >  	struct bio bio;
> > > >  	ssize_t ret;
> > > >  
> > > > -	if ((pos | iov_iter_alignment(iter)) &
> > > > -	    (bdev_logical_block_size(bdev) - 1))
> > > > +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> > > > +		return -EINVAL;
> > > > +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
> > > >  		return -EINVAL;
> > > 
> > > The block layer makes a lot of assumptions that bios can be split at any bvec
> > > boundary.  With this patch, bios whose length isn't a multiple of the logical
> > > block size can be generated by splitting, which isn't valid.
> > 
> > How? This patch ensures every segment is block size aligned.
> 
> No, it doesn't.  It ensures that the *total* length of each bio is logical block
> size aligned.  It doesn't ensure that for the individual bvecs.  By decreasing
> the required memory alignment to below the logical block size, you're allowing
> logical blocks to span a page boundary.  Whenever the two pages involved aren't
> physically contiguous, the data of the block will be split across two bvecs.

I'm aware that spanning pages can cause bad splits on the bi_max_vecs
condition, but I believe it's well handled here. Unless I'm terribly confused,
which is certainly possible, I think you may have missed this part of the
patch:

@@ -1223,6 +1224,8 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
 	pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);

 	size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+	if (size > 0)
+		size = ALIGN_DOWN(size, queue_logical_block_size(q));
 	if (unlikely(size <= 0))
 		return size ? size : -EFAULT;


^ permalink raw reply

* Re: [PATCHv2 3/3] block: relax direct io memory alignment
From: Eric Biggers @ 2022-05-19  1:53 UTC (permalink / raw)
  To: Keith Busch
  Cc: Keith Busch, linux-fsdevel, linux-block, axboe, Kernel Team, hch,
	bvanassche, damien.lemoal
In-Reply-To: <YoWWtwsiKGqoTbVU@kbusch-mbp.dhcp.thefacebook.com>

On Wed, May 18, 2022 at 07:00:39PM -0600, Keith Busch wrote:
> On Wed, May 18, 2022 at 05:14:49PM -0700, Eric Biggers wrote:
> > On Wed, May 18, 2022 at 10:11:31AM -0700, Keith Busch wrote:
> > > diff --git a/block/fops.c b/block/fops.c
> > > index b9b83030e0df..d8537c29602f 100644
> > > --- a/block/fops.c
> > > +++ b/block/fops.c
> > > @@ -54,8 +54,9 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
> > >  	struct bio bio;
> > >  	ssize_t ret;
> > >  
> > > -	if ((pos | iov_iter_alignment(iter)) &
> > > -	    (bdev_logical_block_size(bdev) - 1))
> > > +	if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
> > > +		return -EINVAL;
> > > +	if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
> > >  		return -EINVAL;
> > 
> > The block layer makes a lot of assumptions that bios can be split at any bvec
> > boundary.  With this patch, bios whose length isn't a multiple of the logical
> > block size can be generated by splitting, which isn't valid.
> 
> How? This patch ensures every segment is block size aligned.

No, it doesn't.  It ensures that the *total* length of each bio is logical block
size aligned.  It doesn't ensure that for the individual bvecs.  By decreasing
the required memory alignment to below the logical block size, you're allowing
logical blocks to span a page boundary.  Whenever the two pages involved aren't
physically contiguous, the data of the block will be split across two bvecs.

> > Also some devices aren't compatible with logical blocks spanning bdevs at all.
> > dm-crypt errors out in this case, for example.
> 
> I'm sorry, but I am not understanding this.

I meant to write bvecs, not bdevs.

- Eric

^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox