[PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling

public inbox for linux-btrfs@vger.kernel.org
 help / color / mirror / Atom feed

* [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
@ 2023-11-16  5:19 Qu Wenruo
  2023-11-20 17:00 ` David Sterba
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Qu Wenruo @ 2023-11-16  5:19 UTC (permalink / raw)
  To: linux-btrfs

Currently btrfs extent buffer helpers are doing all the cross-page
handling, as there is no guarantee that all those eb pages are
contiguous.

However on systems with enough memory, there is a very high chance the
page cache for btree_inode are allocated with physically contiguous
pages.

In that case, we can skip all the complex cross-page handling, thus
speeding up the code.

This patch adds a new member, extent_buffer::addr, which is only set to
non-NULL if all the extent buffer pages are physically contiguous.

Signed-off-by: Qu Wenruo <wqu@suse.com>
---
Reason for RFC:

This change would increase the code size for all extent buffer helpers,
and since there one more branch introduced, it may even slow down the
system if most ebs do not have physically contiguous pages.

But I still believe this is worthy trying, as my previous attempt to
use virtually contiguous pages are rejected due to possible slow down in
vm_map() call.

I don't have convincing benchmark yet, but so far no obvious performance
drop observed either.
---
 fs/btrfs/disk-io.c   |  9 +++++++-
 fs/btrfs/extent_io.c | 55 ++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_io.h |  7 ++++++
 3 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5ac6789ca55f..7fc78171a262 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -80,8 +80,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
 	char *kaddr;
 	int i;
 
+	memset(result, 0, BTRFS_CSUM_SIZE);
 	shash->tfm = fs_info->csum_shash;
 	crypto_shash_init(shash);
+
+	if (buf->addr) {
+		crypto_shash_digest(shash, buf->addr + offset_in_page(buf->start) + BTRFS_CSUM_SIZE,
+				    buf->len - BTRFS_CSUM_SIZE, result);
+		return;
+	}
+
 	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
 	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
 			    first_page_part - BTRFS_CSUM_SIZE);
@@ -90,7 +98,6 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
 		kaddr = page_address(buf->pages[i]);
 		crypto_shash_update(shash, kaddr, PAGE_SIZE);
 	}
-	memset(result, 0, BTRFS_CSUM_SIZE);
 	crypto_shash_final(shash, result);
 }
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 03cef28d9e37..004b0ba6b1c7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3476,6 +3476,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	struct address_space *mapping = fs_info->btree_inode->i_mapping;
 	struct btrfs_subpage *prealloc = NULL;
 	u64 lockdep_owner = owner_root;
+	bool page_contig = true;
 	int uptodate = 1;
 	int ret;
 
@@ -3562,6 +3563,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 
 		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
 		eb->pages[i] = p;
+
+		/*
+		 * Check if the current page is physically contiguous with previous eb
+		 * page.
+		 */
+		if (i && eb->pages[i - 1] + 1 != p)
+			page_contig = false;
+
 		if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len))
 			uptodate = 0;
 
@@ -3575,6 +3584,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 	}
 	if (uptodate)
 		set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+	/* All pages are physically contiguous, can skip cross page handling. */
+	if (page_contig)
+		eb->addr = page_address(eb->pages[0]) + offset_in_page(eb->start);
 again:
 	ret = radix_tree_preload(GFP_NOFS);
 	if (ret) {
@@ -4009,6 +4021,11 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
 		return;
 	}
 
+	if (eb->addr) {
+		memcpy(dstv, eb->addr + start, len);
+		return;
+	}
+
 	offset = get_eb_offset_in_page(eb, start);
 
 	while (len > 0) {
@@ -4040,6 +4057,12 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
+	if (eb->addr) {
+		if (copy_to_user_nofault(dstv, eb->addr + start, len))
+			ret = EFAULT;
+		return ret;
+	}
+
 	offset = get_eb_offset_in_page(eb, start);
 
 	while (len > 0) {
@@ -4075,6 +4098,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
 	if (check_eb_range(eb, start, len))
 		return -EINVAL;
 
+	if (eb->addr)
+		return memcmp(ptrv, eb->addr + start, len);
+
 	offset = get_eb_offset_in_page(eb, start);
 
 	while (len > 0) {
@@ -4144,6 +4170,14 @@ static void __write_extent_buffer(const struct extent_buffer *eb,
 	if (check_eb_range(eb, start, len))
 		return;
 
+	if (eb->addr) {
+		if (use_memmove)
+			memmove(eb->addr + start, srcv, len);
+		else
+			memcpy(eb->addr + start, srcv, len);
+		return;
+	}
+
 	offset = get_eb_offset_in_page(eb, start);
 
 	while (len > 0) {
@@ -4176,6 +4210,11 @@ static void memset_extent_buffer(const struct extent_buffer *eb, int c,
 {
 	unsigned long cur = start;
 
+	if (eb->addr) {
+		memset(eb->addr + start, c, len);
+		return;
+	}
+
 	while (cur < start + len) {
 		unsigned long index = get_eb_page_index(cur);
 		unsigned int offset = get_eb_offset_in_page(eb, cur);
@@ -4403,6 +4442,17 @@ void memcpy_extent_buffer(const struct extent_buffer *dst,
 	    check_eb_range(dst, src_offset, len))
 		return;
 
+	if (dst->addr) {
+		const bool use_memmove = areas_overlap(src_offset,
+						       dst_offset, len);
+
+		if (use_memmove)
+			memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+		else
+			memcpy(dst->addr + dst_offset, dst->addr + src_offset, len);
+		return;
+	}
+
 	while (cur_off < len) {
 		unsigned long cur_src = cur_off + src_offset;
 		unsigned long pg_index = get_eb_page_index(cur_src);
@@ -4435,6 +4485,11 @@ void memmove_extent_buffer(const struct extent_buffer *dst,
 		return;
 	}
 
+	if (dst->addr) {
+		memmove(dst->addr + dst_offset, dst->addr + src_offset, len);
+		return;
+	}
+
 	while (len > 0) {
 		unsigned long src_i;
 		size_t cur;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 2171057a4477..a88374ad248f 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -77,6 +77,13 @@ struct extent_buffer {
 	unsigned long len;
 	unsigned long bflags;
 	struct btrfs_fs_info *fs_info;
+
+	/*
+	 * The address where the eb can be accessed without any cross-page handling.
+	 * This can be NULL if not possible.
+	 */
+	void *addr;
+
 	spinlock_t refs_lock;
 	atomic_t refs;
 	int read_mirror;
-- 
2.42.1


^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-16  5:19 [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling Qu Wenruo
@ 2023-11-20 17:00 ` David Sterba
  2023-11-20 20:25   ` Qu Wenruo
  2023-11-22 13:34 ` David Sterba
  2023-11-22 13:46 ` David Sterba
  2 siblings, 1 reply; 15+ messages in thread
From: David Sterba @ 2023-11-20 17:00 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Thu, Nov 16, 2023 at 03:49:06PM +1030, Qu Wenruo wrote:
> Currently btrfs extent buffer helpers are doing all the cross-page
> handling, as there is no guarantee that all those eb pages are
> contiguous.
> 
> However on systems with enough memory, there is a very high chance the
> page cache for btree_inode are allocated with physically contiguous
> pages.
> 
> In that case, we can skip all the complex cross-page handling, thus
> speeding up the code.
> 
> This patch adds a new member, extent_buffer::addr, which is only set to
> non-NULL if all the extent buffer pages are physically contiguous.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>
> ---
> Reason for RFC:
> 
> This change would increase the code size for all extent buffer helpers,
> and since there one more branch introduced, it may even slow down the
> system if most ebs do not have physically contiguous pages.
> 
> But I still believe this is worthy trying, as my previous attempt to
> use virtually contiguous pages are rejected due to possible slow down in
> vm_map() call.
> 
> I don't have convincing benchmark yet, but so far no obvious performance
> drop observed either.
> ---
>  fs/btrfs/disk-io.c   |  9 +++++++-
>  fs/btrfs/extent_io.c | 55 ++++++++++++++++++++++++++++++++++++++++++++
>  fs/btrfs/extent_io.h |  7 ++++++
>  3 files changed, 70 insertions(+), 1 deletion(-)
> 
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 5ac6789ca55f..7fc78171a262 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -80,8 +80,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
>  	char *kaddr;
>  	int i;
>  
> +	memset(result, 0, BTRFS_CSUM_SIZE);
>  	shash->tfm = fs_info->csum_shash;
>  	crypto_shash_init(shash);
> +
> +	if (buf->addr) {
> +		crypto_shash_digest(shash, buf->addr + offset_in_page(buf->start) + BTRFS_CSUM_SIZE,
> +				    buf->len - BTRFS_CSUM_SIZE, result);
> +		return;
> +	}
> +
>  	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
>  	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
>  			    first_page_part - BTRFS_CSUM_SIZE);
> @@ -90,7 +98,6 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
>  		kaddr = page_address(buf->pages[i]);
>  		crypto_shash_update(shash, kaddr, PAGE_SIZE);
>  	}
> -	memset(result, 0, BTRFS_CSUM_SIZE);

This is not related to the contig pages but the result buffer for
checksum should be always cleared before storing the digest.

>  	crypto_shash_final(shash, result);
>  }
>  
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 03cef28d9e37..004b0ba6b1c7 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -3476,6 +3476,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  	struct address_space *mapping = fs_info->btree_inode->i_mapping;
>  	struct btrfs_subpage *prealloc = NULL;
>  	u64 lockdep_owner = owner_root;
> +	bool page_contig = true;
>  	int uptodate = 1;
>  	int ret;
>  
> @@ -3562,6 +3563,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>  
>  		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
>  		eb->pages[i] = p;
> +
> +		/*
> +		 * Check if the current page is physically contiguous with previous eb
> +		 * page.
> +		 */
> +		if (i && eb->pages[i - 1] + 1 != p)
> +			page_contig = false;

This hasn't been fixed from last time, this has almost zero chance to
succeed once the system is up for some time. Page addresses returned
from allocator are random. What I was suggesting is to use alloc_page()
with the given order (16K pages are 2).

This works for all eb sizes we need, the prolematic one could be for 64K
because this is order 4 and PAGE_ALLOC_COSTLY_ORDER is 3, so this would
cost more on the MM side. But who uses 64K node size on x8_64.

> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -77,6 +77,13 @@ struct extent_buffer {
>  	unsigned long len;
>  	unsigned long bflags;
>  	struct btrfs_fs_info *fs_info;
> +
> +	/*
> +	 * The address where the eb can be accessed without any cross-page handling.
> +	 * This can be NULL if not possible.
> +	 */
> +	void *addr;

So this is a remnant of the vm_map, we would not need to store the
address in case all the pages are contiguous, it would be the address of
pages[0]. That it's contiguous could be tracked as a bit in the flags.

> +
>  	spinlock_t refs_lock;
>  	atomic_t refs;
>  	int read_mirror;
> -- 
> 2.42.1
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-20 17:00 ` David Sterba
@ 2023-11-20 20:25   ` Qu Wenruo
  2023-11-21 15:35     ` David Sterba
  0 siblings, 1 reply; 15+ messages in thread
From: Qu Wenruo @ 2023-11-20 20:25 UTC (permalink / raw)
  To: dsterba, Qu Wenruo; +Cc: linux-btrfs



On 2023/11/21 03:30, David Sterba wrote:
> On Thu, Nov 16, 2023 at 03:49:06PM +1030, Qu Wenruo wrote:
>> Currently btrfs extent buffer helpers are doing all the cross-page
>> handling, as there is no guarantee that all those eb pages are
>> contiguous.
>>
>> However on systems with enough memory, there is a very high chance the
>> page cache for btree_inode are allocated with physically contiguous
>> pages.
>>
>> In that case, we can skip all the complex cross-page handling, thus
>> speeding up the code.
>>
>> This patch adds a new member, extent_buffer::addr, which is only set to
>> non-NULL if all the extent buffer pages are physically contiguous.
>>
>> Signed-off-by: Qu Wenruo <wqu@suse.com>
>> ---
>> Reason for RFC:
>>
>> This change would increase the code size for all extent buffer helpers,
>> and since there one more branch introduced, it may even slow down the
>> system if most ebs do not have physically contiguous pages.
>>
>> But I still believe this is worthy trying, as my previous attempt to
>> use virtually contiguous pages are rejected due to possible slow down in
>> vm_map() call.
>>
>> I don't have convincing benchmark yet, but so far no obvious performance
>> drop observed either.
>> ---
>>   fs/btrfs/disk-io.c   |  9 +++++++-
>>   fs/btrfs/extent_io.c | 55 ++++++++++++++++++++++++++++++++++++++++++++
>>   fs/btrfs/extent_io.h |  7 ++++++
>>   3 files changed, 70 insertions(+), 1 deletion(-)
>>
>> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
>> index 5ac6789ca55f..7fc78171a262 100644
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -80,8 +80,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
>>   	char *kaddr;
>>   	int i;
>>
>> +	memset(result, 0, BTRFS_CSUM_SIZE);
>>   	shash->tfm = fs_info->csum_shash;
>>   	crypto_shash_init(shash);
>> +
>> +	if (buf->addr) {
>> +		crypto_shash_digest(shash, buf->addr + offset_in_page(buf->start) + BTRFS_CSUM_SIZE,
>> +				    buf->len - BTRFS_CSUM_SIZE, result);
>> +		return;
>> +	}
>> +
>>   	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
>>   	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
>>   			    first_page_part - BTRFS_CSUM_SIZE);
>> @@ -90,7 +98,6 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
>>   		kaddr = page_address(buf->pages[i]);
>>   		crypto_shash_update(shash, kaddr, PAGE_SIZE);
>>   	}
>> -	memset(result, 0, BTRFS_CSUM_SIZE);
>
> This is not related to the contig pages but the result buffer for
> checksum should be always cleared before storing the digest.

This just get moved before the branch, as that clearing is shared for
both cross-page and contig cases.

>
>>   	crypto_shash_final(shash, result);
>>   }
>>
>> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
>> index 03cef28d9e37..004b0ba6b1c7 100644
>> --- a/fs/btrfs/extent_io.c
>> +++ b/fs/btrfs/extent_io.c
>> @@ -3476,6 +3476,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>   	struct address_space *mapping = fs_info->btree_inode->i_mapping;
>>   	struct btrfs_subpage *prealloc = NULL;
>>   	u64 lockdep_owner = owner_root;
>> +	bool page_contig = true;
>>   	int uptodate = 1;
>>   	int ret;
>>
>> @@ -3562,6 +3563,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>
>>   		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
>>   		eb->pages[i] = p;
>> +
>> +		/*
>> +		 * Check if the current page is physically contiguous with previous eb
>> +		 * page.
>> +		 */
>> +		if (i && eb->pages[i - 1] + 1 != p)
>> +			page_contig = false;
>
> This hasn't been fixed from last time, this has almost zero chance to
> succeed once the system is up for some time.

I have the same counter argument as the last time.

If so, transparent huge page should not work, thus I strongly doubt
about above statement.

> Page addresses returned
> from allocator are random. What I was suggesting is to use alloc_page()
> with the given order (16K pages are 2).

Nope, this patch is not intended to do that at all.

This is just the preparation for the incoming changes.
In fact alloc_page() with order needs more than those changes, it would
only come after all the preparation, including:

- Change how we allocate  pages for eb
   It has to go allocation in one-go, then attaching those pages to
   filemap.

- Extra changes to how concurrent eb allocation

- Folio flags related changes
   Remember a lot of folio flags are not applied to all its pages.

>
> This works for all eb sizes we need, the prolematic one could be for 64K
> because this is order 4 and PAGE_ALLOC_COSTLY_ORDER is 3, so this would
> cost more on the MM side. But who uses 64K node size on x8_64.

As long as you still want per-page allocation as fallback, this patch
itself is still required.

All the higher order allocation is only going to be an optimization or
fast path.

Furthermore, I found this suggestion is conflicting with your previous
statement on contig pages.
If you say the system can no longer provides contig pages after some
uptime, then all above higher order page allocation should all fail.

>
>> --- a/fs/btrfs/extent_io.h
>> +++ b/fs/btrfs/extent_io.h
>> @@ -77,6 +77,13 @@ struct extent_buffer {
>>   	unsigned long len;
>>   	unsigned long bflags;
>>   	struct btrfs_fs_info *fs_info;
>> +
>> +	/*
>> +	 * The address where the eb can be accessed without any cross-page handling.
>> +	 * This can be NULL if not possible.
>> +	 */
>> +	void *addr;
>
> So this is a remnant of the vm_map, we would not need to store the
> address in case all the pages are contiguous, it would be the address of
> pages[0]. That it's contiguous could be tracked as a bit in the flags.

It's the indicator of whether the pages are contig.

Or you have to check all the pages' address then determine if we go fast
path everytime, which is too slow afaik.

Thanks,
Qu

>
>> +
>>   	spinlock_t refs_lock;
>>   	atomic_t refs;
>>   	int read_mirror;
>> --
>> 2.42.1
>>
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-20 20:25   ` Qu Wenruo
@ 2023-11-21 15:35     ` David Sterba
  2023-11-21 20:37       ` Qu Wenruo
  0 siblings, 1 reply; 15+ messages in thread
From: David Sterba @ 2023-11-21 15:35 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Qu Wenruo, linux-btrfs

On Tue, Nov 21, 2023 at 06:55:35AM +1030, Qu Wenruo wrote:
> >> @@ -3562,6 +3563,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
> >>
> >>   		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
> >>   		eb->pages[i] = p;
> >> +
> >> +		/*
> >> +		 * Check if the current page is physically contiguous with previous eb
> >> +		 * page.
> >> +		 */
> >> +		if (i && eb->pages[i - 1] + 1 != p)
> >> +			page_contig = false;
> >
> > This hasn't been fixed from last time, this has almost zero chance to
> > succeed once the system is up for some time.
> 
> I have the same counter argument as the last time.
> 
> If so, transparent huge page should not work, thus I strongly doubt
> about above statement.
> 
> > Page addresses returned
> > from allocator are random. What I was suggesting is to use alloc_page()
> > with the given order (16K pages are 2).
> 
> Nope, this patch is not intended to do that at all.
> 
> This is just the preparation for the incoming changes.
> In fact alloc_page() with order needs more than those changes, it would
> only come after all the preparation, including:
> 
> - Change how we allocate  pages for eb
>    It has to go allocation in one-go, then attaching those pages to
>    filemap.
> 
> - Extra changes to how concurrent eb allocation
> 
> - Folio flags related changes
>    Remember a lot of folio flags are not applied to all its pages.
> 
> >
> > This works for all eb sizes we need, the prolematic one could be for 64K
> > because this is order 4 and PAGE_ALLOC_COSTLY_ORDER is 3, so this would
> > cost more on the MM side. But who uses 64K node size on x8_64.
> 
> As long as you still want per-page allocation as fallback, this patch
> itself is still required.
> 
> All the higher order allocation is only going to be an optimization or
> fast path.
> 
> Furthermore, I found this suggestion is conflicting with your previous
> statement on contig pages.
> If you say the system can no longer provides contig pages after some
> uptime, then all above higher order page allocation should all fail.

No, what I say is that alloc_pages would be the fast path and
optimization if there's enough memory, otherwise allocation page-by-page
would happen as a fallback in case of fragmentation.

The idea is to try hard when allocating the extent buffers, with
fallbacks and potentially slower code but with the same guarantees as
now at least.

But as it is now, alloc_pages can't be used as replacement due to how
the pages are obtained, find_or_create_page(). Currently I don't see a
way how to convince folios to allocate the full nodesize range (with a
given order) and then get the individual pages.

Roughly like that in alloc_extent_buffer():

+       fgp_flags = fgf_set_order(fs_info->nodesize);
+       fgp_flags |= FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
+       eb_folio = __filemap_get_folio(mapping, index, fgp_flags, GFP_NOFS | __GFP_NOFAIL);

	for (i = 0; i < num_pages; i++, index++) {
-               p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
+               p = folio_file_page(eb_folio, index);

fgf_set_order() does not guarantee the order, it's just a hint so I
guess we'll have to do full conversion to folios.

As an intermediate step we can take your patch with the contig
heuristic.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-21 15:35     ` David Sterba
@ 2023-11-21 20:37       ` Qu Wenruo
  2023-11-21 21:14         ` David Sterba
  0 siblings, 1 reply; 15+ messages in thread
From: Qu Wenruo @ 2023-11-21 20:37 UTC (permalink / raw)
  To: dsterba; +Cc: Qu Wenruo, linux-btrfs



On 2023/11/22 02:05, David Sterba wrote:
> On Tue, Nov 21, 2023 at 06:55:35AM +1030, Qu Wenruo wrote:
>>>> @@ -3562,6 +3563,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>>>
>>>>    		WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
>>>>    		eb->pages[i] = p;
>>>> +
>>>> +		/*
>>>> +		 * Check if the current page is physically contiguous with previous eb
>>>> +		 * page.
>>>> +		 */
>>>> +		if (i && eb->pages[i - 1] + 1 != p)
>>>> +			page_contig = false;
>>>
>>> This hasn't been fixed from last time, this has almost zero chance to
>>> succeed once the system is up for some time.
>>
>> I have the same counter argument as the last time.
>>
>> If so, transparent huge page should not work, thus I strongly doubt
>> about above statement.
>>
>>> Page addresses returned
>>> from allocator are random. What I was suggesting is to use alloc_page()
>>> with the given order (16K pages are 2).
>>
>> Nope, this patch is not intended to do that at all.
>>
>> This is just the preparation for the incoming changes.
>> In fact alloc_page() with order needs more than those changes, it would
>> only come after all the preparation, including:
>>
>> - Change how we allocate  pages for eb
>>     It has to go allocation in one-go, then attaching those pages to
>>     filemap.
>>
>> - Extra changes to how concurrent eb allocation
>>
>> - Folio flags related changes
>>     Remember a lot of folio flags are not applied to all its pages.
>>
>>>
>>> This works for all eb sizes we need, the prolematic one could be for 64K
>>> because this is order 4 and PAGE_ALLOC_COSTLY_ORDER is 3, so this would
>>> cost more on the MM side. But who uses 64K node size on x8_64.
>>
>> As long as you still want per-page allocation as fallback, this patch
>> itself is still required.
>>
>> All the higher order allocation is only going to be an optimization or
>> fast path.
>>
>> Furthermore, I found this suggestion is conflicting with your previous
>> statement on contig pages.
>> If you say the system can no longer provides contig pages after some
>> uptime, then all above higher order page allocation should all fail.
>
> No, what I say is that alloc_pages would be the fast path and
> optimization if there's enough memory, otherwise allocation page-by-page
> would happen as a fallback in case of fragmentation.

That's also my understanding.

But the counter argument is still there, if you really think after some
uptime there would be no contig pages, then the fast path will never
trigger, and all fall back to page-by-page routine, thus defeating any
changes to introduce any patch like this one.

>
> The idea is to try hard when allocating the extent buffers, with
> fallbacks and potentially slower code but with the same guarantees as
> now at least.
>
> But as it is now, alloc_pages can't be used as replacement due to how
> the pages are obtained, find_or_create_page(). Currently I don't see a
> way how to convince folios to allocate the full nodesize range (with a
> given order) and then get the individual pages.

I'm doing a preparation to separate the memory allocation for metadata
page allocation.

The current patch is to do the allocation first, then attach the new
pages to the address space of btree inode.

By this, we can easily add new method of allocation, like trying higher
order folio first, if failed then page by page allocation.

But unfortunately there is a big problem here, even if we can forget the
argument on whether we can get contig pages after some uptime:

- My local tests can still sometimes lockup due to some phantom locked
   pages

Thanks,
Qu
>
> Roughly like that in alloc_extent_buffer():
>
> +       fgp_flags = fgf_set_order(fs_info->nodesize);
> +       fgp_flags |= FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
> +       eb_folio = __filemap_get_folio(mapping, index, fgp_flags, GFP_NOFS | __GFP_NOFAIL);
>
> 	for (i = 0; i < num_pages; i++, index++) {
> -               p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL);
> +               p = folio_file_page(eb_folio, index);
>
> fgf_set_order() does not guarantee the order, it's just a hint so I
> guess we'll have to do full conversion to folios.
>
> As an intermediate step we can take your patch with the contig
> heuristic.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-21 20:37       ` Qu Wenruo
@ 2023-11-21 21:14         ` David Sterba
  2023-11-21 21:30           ` Qu Wenruo
  0 siblings, 1 reply; 15+ messages in thread
From: David Sterba @ 2023-11-21 21:14 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Qu Wenruo, linux-btrfs

On Wed, Nov 22, 2023 at 07:07:10AM +1030, Qu Wenruo wrote:
> On 2023/11/22 02:05, David Sterba wrote:
> > On Tue, Nov 21, 2023 at 06:55:35AM +1030, Qu Wenruo wrote:
> >>>> @@ -3562,6 +3563,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
> > No, what I say is that alloc_pages would be the fast path and
> > optimization if there's enough memory, otherwise allocation page-by-page
> > would happen as a fallback in case of fragmentation.
> 
> That's also my understanding.
> 
> But the counter argument is still there, if you really think after some
> uptime there would be no contig pages, then the fast path will never
> trigger, and all fall back to page-by-page routine, thus defeating any
> changes to introduce any patch like this one.

Such state is not permanent and memory management tries to coalesce the
freed pages under the costly order back to bigger chunks. So in a system
under heavy load the fragmentation can become bad, we would be ready for
that. It would have to be very bad luck for a long time not to be able
to get any higher order allocation at all. The process stacks depend on
contig allocations 16K, slabs are backed by 2x4K pages, it wouldn't be
just us depending on that.

> > The idea is to try hard when allocating the extent buffers, with
> > fallbacks and potentially slower code but with the same guarantees as
> > now at least.
> >
> > But as it is now, alloc_pages can't be used as replacement due to how
> > the pages are obtained, find_or_create_page(). Currently I don't see a
> > way how to convince folios to allocate the full nodesize range (with a
> > given order) and then get the individual pages.
> 
> I'm doing a preparation to separate the memory allocation for metadata
> page allocation.
> 
> The current patch is to do the allocation first, then attach the new
> pages to the address space of btree inode.
> 
> By this, we can easily add new method of allocation, like trying higher
> order folio first, if failed then page by page allocation.

Right, that would work and probably be the easiest way how to switch to
folios.

> But unfortunately there is a big problem here, even if we can forget the
> argument on whether we can get contig pages after some uptime:
> 
> - My local tests can still sometimes lockup due to some phantom locked
>    pages

I don't know how your patches do it, it could be that the page is not
attached the same way as find_or_create_page/pagecache_get_page/__filemap_get_folio
would do.

So do you want this patch (adding the contig detection and eb::addr)
applied?

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-21 21:14         ` David Sterba
@ 2023-11-21 21:30           ` Qu Wenruo
  2023-11-22 13:23             ` David Sterba
  0 siblings, 1 reply; 15+ messages in thread
From: Qu Wenruo @ 2023-11-21 21:30 UTC (permalink / raw)
  To: dsterba; +Cc: Qu Wenruo, linux-btrfs



On 2023/11/22 07:44, David Sterba wrote:
> On Wed, Nov 22, 2023 at 07:07:10AM +1030, Qu Wenruo wrote:
>> On 2023/11/22 02:05, David Sterba wrote:
>>> On Tue, Nov 21, 2023 at 06:55:35AM +1030, Qu Wenruo wrote:
>>>>>> @@ -3562,6 +3563,14 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
>>> No, what I say is that alloc_pages would be the fast path and
>>> optimization if there's enough memory, otherwise allocation page-by-page
>>> would happen as a fallback in case of fragmentation.
>>
>> That's also my understanding.
>>
>> But the counter argument is still there, if you really think after some
>> uptime there would be no contig pages, then the fast path will never
>> trigger, and all fall back to page-by-page routine, thus defeating any
>> changes to introduce any patch like this one.
>
> Such state is not permanent and memory management tries to coalesce the
> freed pages under the costly order back to bigger chunks. So in a system
> under heavy load the fragmentation can become bad, we would be ready for
> that. It would have to be very bad luck for a long time not to be able
> to get any higher order allocation at all. The process stacks depend on
> contig allocations 16K, slabs are backed by 2x4K pages, it wouldn't be
> just us depending on that.

Great we can finally be on the same page.

>
>>> The idea is to try hard when allocating the extent buffers, with
>>> fallbacks and potentially slower code but with the same guarantees as
>>> now at least.
>>>
>>> But as it is now, alloc_pages can't be used as replacement due to how
>>> the pages are obtained, find_or_create_page(). Currently I don't see a
>>> way how to convince folios to allocate the full nodesize range (with a
>>> given order) and then get the individual pages.
>>
>> I'm doing a preparation to separate the memory allocation for metadata
>> page allocation.
>>
>> The current patch is to do the allocation first, then attach the new
>> pages to the address space of btree inode.
>>
>> By this, we can easily add new method of allocation, like trying higher
>> order folio first, if failed then page by page allocation.
>
> Right, that would work and probably be the easiest way how to switch to
> folios.
>
>> But unfortunately there is a big problem here, even if we can forget the
>> argument on whether we can get contig pages after some uptime:
>>
>> - My local tests can still sometimes lockup due to some phantom locked
>>     pages
>
> I don't know how your patches do it, it could be that the page is not
> attached the same way as find_or_create_page/pagecache_get_page/__filemap_get_folio
> would do.

I'm doing the same filemap_add_folio() call, which is the final call of
all the above functions.
The tricky part is when there is already a page.
We need to lock the existing page, and do the existing eb detection etc.

My latest version seems to trigger very weird page count overflow.

Still debugging the various lock up/crashes though.

>
> So do you want this patch (adding the contig detection and eb::addr)
> applied?

I still want this patch to get merged, as it's really the first step for
the incoming higher order folio conversion.

Thanks,
Qu

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-21 21:30           ` Qu Wenruo
@ 2023-11-22 13:23             ` David Sterba
  0 siblings, 0 replies; 15+ messages in thread
From: David Sterba @ 2023-11-22 13:23 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Qu Wenruo, linux-btrfs

On Wed, Nov 22, 2023 at 08:00:53AM +1030, Qu Wenruo wrote:
> On 2023/11/22 07:44, David Sterba wrote:
> > On Wed, Nov 22, 2023 at 07:07:10AM +1030, Qu Wenruo wrote:
> >
> > So do you want this patch (adding the contig detection and eb::addr)
> > applied?
> 
> I still want this patch to get merged, as it's really the first step for
> the incoming higher order folio conversion.

Yeah, I understand the conversion will be incremental so no problem
adding the steps there.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-16  5:19 [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling Qu Wenruo
  2023-11-20 17:00 ` David Sterba
@ 2023-11-22 13:34 ` David Sterba
  2023-11-22 13:46 ` David Sterba
  2 siblings, 0 replies; 15+ messages in thread
From: David Sterba @ 2023-11-22 13:34 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Thu, Nov 16, 2023 at 03:49:06PM +1030, Qu Wenruo wrote:
> Currently btrfs extent buffer helpers are doing all the cross-page
> handling, as there is no guarantee that all those eb pages are
> contiguous.
> 
> However on systems with enough memory, there is a very high chance the
> page cache for btree_inode are allocated with physically contiguous
> pages.
> 
> In that case, we can skip all the complex cross-page handling, thus
> speeding up the code.
> 
> This patch adds a new member, extent_buffer::addr, which is only set to
> non-NULL if all the extent buffer pages are physically contiguous.
> 
> Signed-off-by: Qu Wenruo <wqu@suse.com>

Added to misc-next, thanks.

> ---
> Reason for RFC:
> 
> This change would increase the code size for all extent buffer helpers,
> and since there one more branch introduced, it may even slow down the
> system if most ebs do not have physically contiguous pages.

> But I still believe this is worthy trying, as my previous attempt to
> use virtually contiguous pages are rejected due to possible slow down in
> vm_map() call.
> 
> I don't have convincing benchmark yet, but so far no obvious performance
> drop observed either.

I've looked at the assembly code, it's "just" one test/jump in
frequently used functions but the slow path handling cross-page changes
is slow anyway.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-16  5:19 [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling Qu Wenruo
  2023-11-20 17:00 ` David Sterba
  2023-11-22 13:34 ` David Sterba
@ 2023-11-22 13:46 ` David Sterba
  2023-11-22 20:01   ` Qu Wenruo
  2 siblings, 1 reply; 15+ messages in thread
From: David Sterba @ 2023-11-22 13:46 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: linux-btrfs

On Thu, Nov 16, 2023 at 03:49:06PM +1030, Qu Wenruo wrote:
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -80,8 +80,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
>  	char *kaddr;
>  	int i;
>  
> +	memset(result, 0, BTRFS_CSUM_SIZE);
>  	shash->tfm = fs_info->csum_shash;
>  	crypto_shash_init(shash);
> +
> +	if (buf->addr) {
> +		crypto_shash_digest(shash, buf->addr + offset_in_page(buf->start) + BTRFS_CSUM_SIZE,
> +				    buf->len - BTRFS_CSUM_SIZE, result);
> +		return;
> +	}

This duplicates the address and size
> +
>  	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
>  	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
>  			    first_page_part - BTRFS_CSUM_SIZE);
> @@ -90,7 +98,6 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
>  		kaddr = page_address(buf->pages[i]);
>  		crypto_shash_update(shash, kaddr, PAGE_SIZE);
>  	}
> -	memset(result, 0, BTRFS_CSUM_SIZE);
>  	crypto_shash_final(shash, result);

I'd like to have only one code doing the crypto_shash_ calls, so I'm
suggesting this as the final code (the diff is not clear);

 74 static void csum_tree_block(struct extent_buffer *buf, u8 *result)
 75 {
 76         struct btrfs_fs_info *fs_info = buf->fs_info;
 77         int num_pages;
 78         u32 first_page_part;
 79         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 80         char *kaddr;
 81         int i;
 82
 83         shash->tfm = fs_info->csum_shash;
 84         crypto_shash_init(shash);
 85
 86         if (buf->addr) {
 87                 /* Pages are contiguous, handle it as one big page. */
 88                 kaddr = buf->addr;
 89                 first_page_part = fs_info->nodesize;
 90                 num_pages = 1;
 91         } else {
 92                 kaddr = page_address(buf->pages[0]);
 93                 first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
 94                 num_pages = num_extent_pages(buf);
 95         }
 96         kaddr += offset_in_page(buf->start) + BTRFS_CSUM_SIZE;
 97         first_page_part -= BTRFS_CSUM_SIZE;
 98
 99         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
100                             first_page_part - BTRFS_CSUM_SIZE);
101
102         for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
103                 kaddr = page_address(buf->pages[i]);
104                 crypto_shash_update(shash, kaddr, PAGE_SIZE);
105         }
106         memset(result, 0, BTRFS_CSUM_SIZE);
107         crypto_shash_final(shash, result);
108 }

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-22 13:46 ` David Sterba
@ 2023-11-22 20:01   ` Qu Wenruo
  2023-11-22 22:05     ` David Sterba
                       ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Qu Wenruo @ 2023-11-22 20:01 UTC (permalink / raw)
  To: dsterba, Qu Wenruo; +Cc: linux-btrfs



On 2023/11/23 00:16, David Sterba wrote:
> On Thu, Nov 16, 2023 at 03:49:06PM +1030, Qu Wenruo wrote:
>> --- a/fs/btrfs/disk-io.c
>> +++ b/fs/btrfs/disk-io.c
>> @@ -80,8 +80,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
>>   	char *kaddr;
>>   	int i;
>>
>> +	memset(result, 0, BTRFS_CSUM_SIZE);
>>   	shash->tfm = fs_info->csum_shash;
>>   	crypto_shash_init(shash);
>> +
>> +	if (buf->addr) {
>> +		crypto_shash_digest(shash, buf->addr + offset_in_page(buf->start) + BTRFS_CSUM_SIZE,
>> +				    buf->len - BTRFS_CSUM_SIZE, result);
>> +		return;
>> +	}
>
> This duplicates the address and size
>> +
>>   	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
>>   	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
>>   			    first_page_part - BTRFS_CSUM_SIZE);
>> @@ -90,7 +98,6 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
>>   		kaddr = page_address(buf->pages[i]);
>>   		crypto_shash_update(shash, kaddr, PAGE_SIZE);
>>   	}
>> -	memset(result, 0, BTRFS_CSUM_SIZE);
>>   	crypto_shash_final(shash, result);
>
> I'd like to have only one code doing the crypto_shash_ calls, so I'm
> suggesting this as the final code (the diff is not clear);

This looks good to me, mind to update it inside your branch?

Thanks,
Qu
>
>   74 static void csum_tree_block(struct extent_buffer *buf, u8 *result)
>   75 {
>   76         struct btrfs_fs_info *fs_info = buf->fs_info;
>   77         int num_pages;
>   78         u32 first_page_part;
>   79         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
>   80         char *kaddr;
>   81         int i;
>   82
>   83         shash->tfm = fs_info->csum_shash;
>   84         crypto_shash_init(shash);
>   85
>   86         if (buf->addr) {
>   87                 /* Pages are contiguous, handle it as one big page. */
>   88                 kaddr = buf->addr;
>   89                 first_page_part = fs_info->nodesize;
>   90                 num_pages = 1;
>   91         } else {
>   92                 kaddr = page_address(buf->pages[0]);
>   93                 first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
>   94                 num_pages = num_extent_pages(buf);
>   95         }
>   96         kaddr += offset_in_page(buf->start) + BTRFS_CSUM_SIZE;
>   97         first_page_part -= BTRFS_CSUM_SIZE;
>   98
>   99         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
> 100                             first_page_part - BTRFS_CSUM_SIZE);
> 101
> 102         for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
> 103                 kaddr = page_address(buf->pages[i]);
> 104                 crypto_shash_update(shash, kaddr, PAGE_SIZE);
> 105         }
> 106         memset(result, 0, BTRFS_CSUM_SIZE);
> 107         crypto_shash_final(shash, result);
> 108 }
>

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-22 20:01   ` Qu Wenruo
@ 2023-11-22 22:05     ` David Sterba
  2023-11-23 18:50     ` David Sterba
  2023-11-23 20:51     ` Qu Wenruo
  2 siblings, 0 replies; 15+ messages in thread
From: David Sterba @ 2023-11-22 22:05 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Qu Wenruo, linux-btrfs

On Thu, Nov 23, 2023 at 06:31:41AM +1030, Qu Wenruo wrote:
> 
> 
> On 2023/11/23 00:16, David Sterba wrote:
> > On Thu, Nov 16, 2023 at 03:49:06PM +1030, Qu Wenruo wrote:
> >> --- a/fs/btrfs/disk-io.c
> >> +++ b/fs/btrfs/disk-io.c
> >> @@ -80,8 +80,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
> >>   	char *kaddr;
> >>   	int i;
> >>
> >> +	memset(result, 0, BTRFS_CSUM_SIZE);
> >>   	shash->tfm = fs_info->csum_shash;
> >>   	crypto_shash_init(shash);
> >> +
> >> +	if (buf->addr) {
> >> +		crypto_shash_digest(shash, buf->addr + offset_in_page(buf->start) + BTRFS_CSUM_SIZE,
> >> +				    buf->len - BTRFS_CSUM_SIZE, result);
> >> +		return;
> >> +	}
> >
> > This duplicates the address and size
> >> +
> >>   	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
> >>   	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
> >>   			    first_page_part - BTRFS_CSUM_SIZE);
> >> @@ -90,7 +98,6 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
> >>   		kaddr = page_address(buf->pages[i]);
> >>   		crypto_shash_update(shash, kaddr, PAGE_SIZE);
> >>   	}
> >> -	memset(result, 0, BTRFS_CSUM_SIZE);
> >>   	crypto_shash_final(shash, result);
> >
> > I'd like to have only one code doing the crypto_shash_ calls, so I'm
> > suggesting this as the final code (the diff is not clear);
> 
> This looks good to me, mind to update it inside your branch?

Thanks, yes I'll update it. As it's not a completely trivial change I'd
like to get the confirmation first.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-22 20:01   ` Qu Wenruo
  2023-11-22 22:05     ` David Sterba
@ 2023-11-23 18:50     ` David Sterba
  2023-11-23 20:51     ` Qu Wenruo
  2 siblings, 0 replies; 15+ messages in thread
From: David Sterba @ 2023-11-23 18:50 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Qu Wenruo, linux-btrfs

On Thu, Nov 23, 2023 at 06:31:41AM +1030, Qu Wenruo wrote:
> 
> 
> On 2023/11/23 00:16, David Sterba wrote:
> > On Thu, Nov 16, 2023 at 03:49:06PM +1030, Qu Wenruo wrote:
> >> --- a/fs/btrfs/disk-io.c
> >> +++ b/fs/btrfs/disk-io.c
> >> @@ -80,8 +80,16 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
> >>   	char *kaddr;
> >>   	int i;
> >>
> >> +	memset(result, 0, BTRFS_CSUM_SIZE);
> >>   	shash->tfm = fs_info->csum_shash;
> >>   	crypto_shash_init(shash);
> >> +
> >> +	if (buf->addr) {
> >> +		crypto_shash_digest(shash, buf->addr + offset_in_page(buf->start) + BTRFS_CSUM_SIZE,
> >> +				    buf->len - BTRFS_CSUM_SIZE, result);
> >> +		return;
> >> +	}
> >
> > This duplicates the address and size
> >> +
> >>   	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
> >>   	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
> >>   			    first_page_part - BTRFS_CSUM_SIZE);
> >> @@ -90,7 +98,6 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result)
> >>   		kaddr = page_address(buf->pages[i]);
> >>   		crypto_shash_update(shash, kaddr, PAGE_SIZE);
> >>   	}
> >> -	memset(result, 0, BTRFS_CSUM_SIZE);
> >>   	crypto_shash_final(shash, result);
> >
> > I'd like to have only one code doing the crypto_shash_ calls, so I'm
> > suggesting this as the final code (the diff is not clear);
> 
> This looks good to me, mind to update it inside your branch?

There's something wrong with my updates, it fails in the first test so
I've verified your works and the used it in misc-next for now.

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-22 20:01   ` Qu Wenruo
  2023-11-22 22:05     ` David Sterba
  2023-11-23 18:50     ` David Sterba
@ 2023-11-23 20:51     ` Qu Wenruo
  2023-11-24 16:03       ` David Sterba
  2 siblings, 1 reply; 15+ messages in thread
From: Qu Wenruo @ 2023-11-23 20:51 UTC (permalink / raw)
  To: dsterba, Qu Wenruo; +Cc: linux-btrfs



On 2023/11/23 06:31, Qu Wenruo wrote:
> 
> 
> On 2023/11/23 00:16, David Sterba wrote:
>> On Thu, Nov 16, 2023 at 03:49:06PM +1030, Qu Wenruo wrote:
>>> --- a/fs/btrfs/disk-io.c
>>> +++ b/fs/btrfs/disk-io.c
>>> @@ -80,8 +80,16 @@ static void csum_tree_block(struct extent_buffer 
>>> *buf, u8 *result)
>>>       char *kaddr;
>>>       int i;
>>>
>>> +    memset(result, 0, BTRFS_CSUM_SIZE);
>>>       shash->tfm = fs_info->csum_shash;
>>>       crypto_shash_init(shash);
>>> +
>>> +    if (buf->addr) {
>>> +        crypto_shash_digest(shash, buf->addr + 
>>> offset_in_page(buf->start) + BTRFS_CSUM_SIZE,
>>> +                    buf->len - BTRFS_CSUM_SIZE, result);
>>> +        return;
>>> +    }
>>
>> This duplicates the address and size
>>> +
>>>       kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
>>>       crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
>>>                   first_page_part - BTRFS_CSUM_SIZE);
>>> @@ -90,7 +98,6 @@ static void csum_tree_block(struct extent_buffer 
>>> *buf, u8 *result)
>>>           kaddr = page_address(buf->pages[i]);
>>>           crypto_shash_update(shash, kaddr, PAGE_SIZE);
>>>       }
>>> -    memset(result, 0, BTRFS_CSUM_SIZE);
>>>       crypto_shash_final(shash, result);
>>
>> I'd like to have only one code doing the crypto_shash_ calls, so I'm
>> suggesting this as the final code (the diff is not clear);
> 
> This looks good to me, mind to update it inside your branch?
> 
> Thanks,
> Qu
>>
>>   74 static void csum_tree_block(struct extent_buffer *buf, u8 *result)
>>   75 {
>>   76         struct btrfs_fs_info *fs_info = buf->fs_info;
>>   77         int num_pages;
>>   78         u32 first_page_part;
>>   79         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
>>   80         char *kaddr;
>>   81         int i;
>>   82
>>   83         shash->tfm = fs_info->csum_shash;
>>   84         crypto_shash_init(shash);
>>   85
>>   86         if (buf->addr) {
>>   87                 /* Pages are contiguous, handle it as one big 
>> page. */
>>   88                 kaddr = buf->addr;
>>   89                 first_page_part = fs_info->nodesize;
>>   90                 num_pages = 1;
>>   91         } else {
>>   92                 kaddr = page_address(buf->pages[0]);
>>   93                 first_page_part = min_t(u32, PAGE_SIZE, 
>> fs_info->nodesize);
>>   94                 num_pages = num_extent_pages(buf);
>>   95         }
>>   96         kaddr += offset_in_page(buf->start) + BTRFS_CSUM_SIZE;
>>   97         first_page_part -= BTRFS_CSUM_SIZE;

This is decreasing the @first_page_part.

>>   98
>>   99         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
>> 100                             first_page_part - BTRFS_CSUM_SIZE);

Meanwhile we're reducing the size again, and I guess this is the problem 
causing the test failure.

Although my initial version is indeed doing its own size calculation, 
the extra calculation is much simpler and does not affect the existing 
path (thus a little safer).

I'm fine with either way.

Thanks,
Qu

>> 101
>> 102         for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 
>> 1; i++) {
>> 103                 kaddr = page_address(buf->pages[i]);
>> 104                 crypto_shash_update(shash, kaddr, PAGE_SIZE);
>> 105         }
>> 106         memset(result, 0, BTRFS_CSUM_SIZE);
>> 107         crypto_shash_final(shash, result);
>> 108 }
>>
> 

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling
  2023-11-23 20:51     ` Qu Wenruo
@ 2023-11-24 16:03       ` David Sterba
  0 siblings, 0 replies; 15+ messages in thread
From: David Sterba @ 2023-11-24 16:03 UTC (permalink / raw)
  To: Qu Wenruo; +Cc: dsterba, Qu Wenruo, linux-btrfs

On Fri, Nov 24, 2023 at 07:21:04AM +1030, Qu Wenruo wrote:
> 
> 
> On 2023/11/23 06:31, Qu Wenruo wrote:
> > 
> > 
> > On 2023/11/23 00:16, David Sterba wrote:
> >> On Thu, Nov 16, 2023 at 03:49:06PM +1030, Qu Wenruo wrote:
> >>> --- a/fs/btrfs/disk-io.c
> >>> +++ b/fs/btrfs/disk-io.c
> >>> @@ -80,8 +80,16 @@ static void csum_tree_block(struct extent_buffer 
> >>> *buf, u8 *result)
> >>>       char *kaddr;
> >>>       int i;
> >>>
> >>> +    memset(result, 0, BTRFS_CSUM_SIZE);
> >>>       shash->tfm = fs_info->csum_shash;
> >>>       crypto_shash_init(shash);
> >>> +
> >>> +    if (buf->addr) {
> >>> +        crypto_shash_digest(shash, buf->addr + 
> >>> offset_in_page(buf->start) + BTRFS_CSUM_SIZE,
> >>> +                    buf->len - BTRFS_CSUM_SIZE, result);
> >>> +        return;
> >>> +    }
> >>
> >> This duplicates the address and size
> >>> +
> >>>       kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
> >>>       crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
> >>>                   first_page_part - BTRFS_CSUM_SIZE);
> >>> @@ -90,7 +98,6 @@ static void csum_tree_block(struct extent_buffer 
> >>> *buf, u8 *result)
> >>>           kaddr = page_address(buf->pages[i]);
> >>>           crypto_shash_update(shash, kaddr, PAGE_SIZE);
> >>>       }
> >>> -    memset(result, 0, BTRFS_CSUM_SIZE);
> >>>       crypto_shash_final(shash, result);
> >>
> >> I'd like to have only one code doing the crypto_shash_ calls, so I'm
> >> suggesting this as the final code (the diff is not clear);
> > 
> > This looks good to me, mind to update it inside your branch?
> > 
> > Thanks,
> > Qu
> >>
> >>   74 static void csum_tree_block(struct extent_buffer *buf, u8 *result)
> >>   75 {
> >>   76         struct btrfs_fs_info *fs_info = buf->fs_info;
> >>   77         int num_pages;
> >>   78         u32 first_page_part;
> >>   79         SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
> >>   80         char *kaddr;
> >>   81         int i;
> >>   82
> >>   83         shash->tfm = fs_info->csum_shash;
> >>   84         crypto_shash_init(shash);
> >>   85
> >>   86         if (buf->addr) {
> >>   87                 /* Pages are contiguous, handle it as one big 
> >> page. */
> >>   88                 kaddr = buf->addr;
> >>   89                 first_page_part = fs_info->nodesize;
> >>   90                 num_pages = 1;
> >>   91         } else {
> >>   92                 kaddr = page_address(buf->pages[0]);
> >>   93                 first_page_part = min_t(u32, PAGE_SIZE, 
> >> fs_info->nodesize);
> >>   94                 num_pages = num_extent_pages(buf);
> >>   95         }
> >>   96         kaddr += offset_in_page(buf->start) + BTRFS_CSUM_SIZE;
> >>   97         first_page_part -= BTRFS_CSUM_SIZE;
> 
> This is decreasing the @first_page_part.
> 
> >>   98
> >>   99         crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
> >> 100                             first_page_part - BTRFS_CSUM_SIZE);
> 
> Meanwhile we're reducing the size again, and I guess this is the problem 
> causing the test failure.

Yes that was it.

> Although my initial version is indeed doing its own size calculation, 
> the extra calculation is much simpler and does not affect the existing 
> path (thus a little safer).
> 
> I'm fine with either way.

I have some WIP that modifies the checksumming and there are already
like 4 ways how to pass the data, that's the reason I'd like to keep the
cases to minimum, here it was easy.

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2023-11-24 16:10 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2023-11-16  5:19 [PATCH RFC] btrfs: allow extent buffer helpers to skip cross-page handling Qu Wenruo
2023-11-20 17:00 ` David Sterba
2023-11-20 20:25   ` Qu Wenruo
2023-11-21 15:35     ` David Sterba
2023-11-21 20:37       ` Qu Wenruo
2023-11-21 21:14         ` David Sterba
2023-11-21 21:30           ` Qu Wenruo
2023-11-22 13:23             ` David Sterba
2023-11-22 13:34 ` David Sterba
2023-11-22 13:46 ` David Sterba
2023-11-22 20:01   ` Qu Wenruo
2023-11-22 22:05     ` David Sterba
2023-11-23 18:50     ` David Sterba
2023-11-23 20:51     ` Qu Wenruo
2023-11-24 16:03       ` David Sterba

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox