linux-fsdevel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] filemap: Convert generic_perform_write() to support large folios
@ 2024-01-11 19:25 Matthew Wilcox (Oracle)
  2024-01-11 19:57 ` Matthew Wilcox
  2024-01-19  2:36 ` kernel test robot
  0 siblings, 2 replies; 4+ messages in thread
From: Matthew Wilcox (Oracle) @ 2024-01-11 19:25 UTC (permalink / raw)
  To: David Howells, linux-fsdevel; +Cc: Matthew Wilcox (Oracle)

Modelled after the loop in iomap_write_iter(), copy larger chunks from
userspace if the filesystem has created large folios.

Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/filemap.c | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/mm/filemap.c b/mm/filemap.c
index 750e779c23db..964d5d7b9721 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -3893,6 +3893,7 @@ EXPORT_SYMBOL(generic_file_direct_write);
 ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
 {
 	struct file *file = iocb->ki_filp;
+	size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
 	loff_t pos = iocb->ki_pos;
 	struct address_space *mapping = file->f_mapping;
 	const struct address_space_operations *a_ops = mapping->a_ops;
@@ -3901,16 +3902,18 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
 
 	do {
 		struct page *page;
-		unsigned long offset;	/* Offset into pagecache page */
-		unsigned long bytes;	/* Bytes to write to page */
+		struct folio *folio;
+		size_t offset;		/* Offset into folio */
+		size_t bytes;		/* Bytes to write to folio */
 		size_t copied;		/* Bytes copied from user */
 		void *fsdata = NULL;
 
-		offset = (pos & (PAGE_SIZE - 1));
-		bytes = min_t(unsigned long, PAGE_SIZE - offset,
-						iov_iter_count(i));
+		bytes = iov_iter_count(i);
+retry:
+		offset = pos & (chunk - 1);
+		bytes = min(chunk - offset, bytes);
+		balance_dirty_pages_ratelimited(mapping);
 
-again:
 		/*
 		 * Bring in the user page that we will copy from _first_.
 		 * Otherwise there's a nasty deadlock on copying from the
@@ -3932,11 +3935,16 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
 		if (unlikely(status < 0))
 			break;
 
+		folio = page_folio(page);
+		offset = offset_in_folio(folio, pos);
+		if (bytes > folio_size(folio) - offset)
+			bytes = folio_size(folio) - offset;
+
 		if (mapping_writably_mapped(mapping))
-			flush_dcache_page(page);
+			flush_dcache_folio(folio);
 
-		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
-		flush_dcache_page(page);
+		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
+		flush_dcache_folio(folio);
 
 		status = a_ops->write_end(file, mapping, pos, bytes, copied,
 						page, fsdata);
@@ -3954,14 +3962,16 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
 			 * halfway through, might be a race with munmap,
 			 * might be severe memory pressure.
 			 */
-			if (copied)
+			if (chunk > PAGE_SIZE)
+				chunk /= 2;
+			if (copied) {
 				bytes = copied;
-			goto again;
+				goto retry;
+			}
+		} else {
+			pos += status;
+			written += status;
 		}
-		pos += status;
-		written += status;
-
-		balance_dirty_pages_ratelimited(mapping);
 	} while (iov_iter_count(i));
 
 	if (!written)
-- 
2.43.0


^ permalink raw reply related	[flat|nested] 4+ messages in thread

* Re: [PATCH] filemap: Convert generic_perform_write() to support large folios
  2024-01-11 19:25 [PATCH] filemap: Convert generic_perform_write() to support large folios Matthew Wilcox (Oracle)
@ 2024-01-11 19:57 ` Matthew Wilcox
  2024-01-11 23:03   ` Matthew Wilcox
  2024-01-19  2:36 ` kernel test robot
  1 sibling, 1 reply; 4+ messages in thread
From: Matthew Wilcox @ 2024-01-11 19:57 UTC (permalink / raw)
  To: David Howells, linux-fsdevel

On Thu, Jan 11, 2024 at 07:25:13PM +0000, Matthew Wilcox (Oracle) wrote:
> Modelled after the loop in iomap_write_iter(), copy larger chunks from
> userspace if the filesystem has created large folios.

Heh, I hadn't tested this on ext4, and it blows up spectacularly.

We hit the first of these two BUG_ONs:

+++ b/fs/buffer.c
@@ -2078,8 +2078,8 @@ int __block_write_begin_int(struct folio *folio, loff_t pos, unsigned len,
        struct buffer_head *bh, *head, *wait[2], **wait_bh=wait;

        BUG_ON(!folio_test_locked(folio));
-       BUG_ON(to > folio_size(folio));
-       BUG_ON(from > to);
+       if (from < to || to > folio_size(folio))
+               to = folio_size(folio);

        head = folio_create_buffers(folio, inode, 0);
        blocksize = head->b_size;

I'm just doing an xfstests run now, but I think this will fix the problem
(it's up to 400 seconds after crashing at 6 seconds the first time).
Essentially if we pass in a length larger than the folio we find, we
just clamp the amount of work we do to the size of the folio.

> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  mm/filemap.c | 40 +++++++++++++++++++++++++---------------
>  1 file changed, 25 insertions(+), 15 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 750e779c23db..964d5d7b9721 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -3893,6 +3893,7 @@ EXPORT_SYMBOL(generic_file_direct_write);
>  ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
>  {
>  	struct file *file = iocb->ki_filp;
> +	size_t chunk = PAGE_SIZE << MAX_PAGECACHE_ORDER;
>  	loff_t pos = iocb->ki_pos;
>  	struct address_space *mapping = file->f_mapping;
>  	const struct address_space_operations *a_ops = mapping->a_ops;
> @@ -3901,16 +3902,18 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
>  
>  	do {
>  		struct page *page;
> -		unsigned long offset;	/* Offset into pagecache page */
> -		unsigned long bytes;	/* Bytes to write to page */
> +		struct folio *folio;
> +		size_t offset;		/* Offset into folio */
> +		size_t bytes;		/* Bytes to write to folio */
>  		size_t copied;		/* Bytes copied from user */
>  		void *fsdata = NULL;
>  
> -		offset = (pos & (PAGE_SIZE - 1));
> -		bytes = min_t(unsigned long, PAGE_SIZE - offset,
> -						iov_iter_count(i));
> +		bytes = iov_iter_count(i);
> +retry:
> +		offset = pos & (chunk - 1);
> +		bytes = min(chunk - offset, bytes);
> +		balance_dirty_pages_ratelimited(mapping);
>  
> -again:
>  		/*
>  		 * Bring in the user page that we will copy from _first_.
>  		 * Otherwise there's a nasty deadlock on copying from the
> @@ -3932,11 +3935,16 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
>  		if (unlikely(status < 0))
>  			break;
>  
> +		folio = page_folio(page);
> +		offset = offset_in_folio(folio, pos);
> +		if (bytes > folio_size(folio) - offset)
> +			bytes = folio_size(folio) - offset;
> +
>  		if (mapping_writably_mapped(mapping))
> -			flush_dcache_page(page);
> +			flush_dcache_folio(folio);
>  
> -		copied = copy_page_from_iter_atomic(page, offset, bytes, i);
> -		flush_dcache_page(page);
> +		copied = copy_folio_from_iter_atomic(folio, offset, bytes, i);
> +		flush_dcache_folio(folio);
>  
>  		status = a_ops->write_end(file, mapping, pos, bytes, copied,
>  						page, fsdata);
> @@ -3954,14 +3962,16 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i)
>  			 * halfway through, might be a race with munmap,
>  			 * might be severe memory pressure.
>  			 */
> -			if (copied)
> +			if (chunk > PAGE_SIZE)
> +				chunk /= 2;
> +			if (copied) {
>  				bytes = copied;
> -			goto again;
> +				goto retry;
> +			}
> +		} else {
> +			pos += status;
> +			written += status;
>  		}
> -		pos += status;
> -		written += status;
> -
> -		balance_dirty_pages_ratelimited(mapping);
>  	} while (iov_iter_count(i));
>  
>  	if (!written)
> -- 
> 2.43.0
> 

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] filemap: Convert generic_perform_write() to support large folios
  2024-01-11 19:57 ` Matthew Wilcox
@ 2024-01-11 23:03   ` Matthew Wilcox
  0 siblings, 0 replies; 4+ messages in thread
From: Matthew Wilcox @ 2024-01-11 23:03 UTC (permalink / raw)
  To: David Howells, linux-fsdevel

On Thu, Jan 11, 2024 at 07:57:08PM +0000, Matthew Wilcox wrote:
> I'm just doing an xfstests run now, but I think this will fix the problem
> (it's up to 400 seconds after crashing at 6 seconds the first time).
> Essentially if we pass in a length larger than the folio we find, we
> just clamp the amount of work we do to the size of the folio.

No, that's not enough.  It dies in submit_bh_wbc() at the
BUG_ON(buffer_delay(bh));  I have no idea why.

Since Dave tells me he's not using generic_perform_write(), and the only
possible user would be the bs>PS work, I'm going to abandon this for now.

^ permalink raw reply	[flat|nested] 4+ messages in thread

* Re: [PATCH] filemap: Convert generic_perform_write() to support large folios
  2024-01-11 19:25 [PATCH] filemap: Convert generic_perform_write() to support large folios Matthew Wilcox (Oracle)
  2024-01-11 19:57 ` Matthew Wilcox
@ 2024-01-19  2:36 ` kernel test robot
  1 sibling, 0 replies; 4+ messages in thread
From: kernel test robot @ 2024-01-19  2:36 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle)
  Cc: oe-lkp, lkp, linux-fsdevel, David Howells,
	Matthew Wilcox (Oracle), oliver.sang



Hello,

kernel test robot noticed "kernel_BUG_at_fs/buffer.c" on:

commit: 5e90db780eee7cd4635d0b784603424f66d101e1 ("[PATCH] filemap: Convert generic_perform_write() to support large folios")
url: https://github.com/intel-lab-lkp/linux/commits/Matthew-Wilcox-Oracle/filemap-Convert-generic_perform_write-to-support-large-folios/20240112-032734
base: https://git.kernel.org/cgit/linux/kernel/git/akpm/mm.git mm-everything
patch link: https://lore.kernel.org/all/20240111192513.3505769-1-willy@infradead.org/
patch subject: [PATCH] filemap: Convert generic_perform_write() to support large folios

in testcase: xfstests
version: xfstests-x86_64-17324dbc-1_20240115
with following parameters:

	bp1_memmap: 4G!8G
	bp2_memmap: 4G!10G
	bp3_memmap: 4G!16G
	bp4_memmap: 4G!22G
	nr_pmem: 4
	fs: ext2
	test: generic-dax



compiler: gcc-12
test machine: 16 threads 1 sockets Intel(R) Xeon(R) CPU D-1541 @ 2.10GHz (Broadwell-DE) with 48G memory

(please refer to attached dmesg/kmsg for entire log/backtrace)



If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <oliver.sang@intel.com>
| Closes: https://lore.kernel.org/oe-lkp/202401191032.c8606703-oliver.sang@intel.com


[  264.977376][ T4263] ------------[ cut here ]------------
[  264.984538][ T4263] kernel BUG at fs/buffer.c:2081!
[  264.991338][ T4263] invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI
[  264.999394][ T4263] CPU: 1 PID: 4263 Comm: cp Tainted: G S                 6.7.0-01803-g5e90db780eee #1
[  265.010554][ T4263] Hardware name: Supermicro SYS-5018D-FN4T/X10SDV-8C-TLN4F, BIOS 1.1 03/02/2016
[ 265.021275][ T4263] RIP: 0010:__block_write_begin_int (fs/buffer.c:2081 (discriminator 1)) 
[ 265.029168][ T4263] Code: 65 48 2b 14 25 28 00 00 00 0f 85 3c 01 00 00 48 81 c4 c0 00 00 00 5b 5d 41 5c 41 5d 41 5e 41 5f c3 0f 0b e9 4d fb ff ff 0f 0b <0f> 0b 0f 0b 48 8b 54 24 08 48 8b 74 24 10 48 89 ef 89 04 24 e8 fd
All code
========
   0:	65 48 2b 14 25 28 00 	sub    %gs:0x28,%rdx
   7:	00 00 
   9:	0f 85 3c 01 00 00    	jne    0x14b
   f:	48 81 c4 c0 00 00 00 	add    $0xc0,%rsp
  16:	5b                   	pop    %rbx
  17:	5d                   	pop    %rbp
  18:	41 5c                	pop    %r12
  1a:	41 5d                	pop    %r13
  1c:	41 5e                	pop    %r14
  1e:	41 5f                	pop    %r15
  20:	c3                   	retq   
  21:	0f 0b                	ud2    
  23:	e9 4d fb ff ff       	jmpq   0xfffffffffffffb75
  28:	0f 0b                	ud2    
  2a:*	0f 0b                	ud2    		<-- trapping instruction
  2c:	0f 0b                	ud2    
  2e:	48 8b 54 24 08       	mov    0x8(%rsp),%rdx
  33:	48 8b 74 24 10       	mov    0x10(%rsp),%rsi
  38:	48 89 ef             	mov    %rbp,%rdi
  3b:	89 04 24             	mov    %eax,(%rsp)
  3e:	e8                   	.byte 0xe8
  3f:	fd                   	std    

Code starting with the faulting instruction
===========================================
   0:	0f 0b                	ud2    
   2:	0f 0b                	ud2    
   4:	48 8b 54 24 08       	mov    0x8(%rsp),%rdx
   9:	48 8b 74 24 10       	mov    0x10(%rsp),%rsi
   e:	48 89 ef             	mov    %rbp,%rdi
  11:	89 04 24             	mov    %eax,(%rsp)
  14:	e8                   	.byte 0xe8
  15:	fd                   	std    
[  265.052266][ T4263] RSP: 0018:ffffc9000c9dfaa0 EFLAGS: 00010287
[  265.059752][ T4263] RAX: 0000000000020000 RBX: 0000000000000000 RCX: ffffffff81b86bf2
[  265.069035][ T4263] RDX: 1ffffd4000bbc0b0 RSI: 0000000000000008 RDI: ffffea0005de0580
[  265.078345][ T4263] RBP: ffffea0005de0580 R08: 0000000000000000 R09: fffff94000bbc0b0
[  265.087668][ T4263] R10: ffffea0005de0587 R11: 0000000000000001 R12: 0000000000000000
[  265.096944][ T4263] R13: 0000000000001000 R14: ffffea0005de0588 R15: ffffffffc149b940
[  265.106163][ T4263] FS:  00007f4d3456c800(0000) GS:ffff888ba9c80000(0000) knlGS:0000000000000000
[  265.116366][ T4263] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[  265.124278][ T4263] CR2: 00007f4d3456a000 CR3: 0000000138c2c002 CR4: 00000000003706f0
[  265.133569][ T4263] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[  265.142841][ T4263] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[  265.152019][ T4263] Call Trace:
[  265.156536][ T4263]  <TASK>
[ 265.160748][ T4263] ? die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434 arch/x86/kernel/dumpstack.c:447) 
[ 265.165703][ T4263] ? do_trap (arch/x86/kernel/traps.c:112 arch/x86/kernel/traps.c:153) 
[ 265.171181][ T4263] ? __block_write_begin_int (fs/buffer.c:2081 (discriminator 1)) 
[ 265.178019][ T4263] ? do_error_trap (arch/x86/include/asm/traps.h:59 arch/x86/kernel/traps.c:174) 
[ 265.183817][ T4263] ? __block_write_begin_int (fs/buffer.c:2081 (discriminator 1)) 
[ 265.190674][ T4263] ? handle_invalid_op (arch/x86/kernel/traps.c:212) 
[ 265.196850][ T4263] ? __block_write_begin_int (fs/buffer.c:2081 (discriminator 1)) 
[ 265.203673][ T4263] ? exc_invalid_op (arch/x86/kernel/traps.c:265) 
[ 265.209523][ T4263] ? asm_exc_invalid_op (arch/x86/include/asm/idtentry.h:568) 
[ 265.215747][ T4263] ? ext2_iomap_begin (fs/ext2/inode.c:785) ext2
[ 265.222523][ T4263] ? __block_write_begin_int (arch/x86/include/asm/bitops.h:206 arch/x86/include/asm/bitops.h:238 include/asm-generic/bitops/instrumented-non-atomic.h:142 include/linux/page-flags.h:785 include/linux/page-flags.h:806 include/linux/mm.h:1090 include/linux/mm.h:2141 fs/buffer.c:2081) 
[ 265.229300][ T4263] ? __block_write_begin_int (fs/buffer.c:2081 (discriminator 1)) 
[ 265.236035][ T4263] ? mode_strip_sgid (fs/inode.c:1889) 
[ 265.242047][ T4263] ? ext2_iomap_begin (fs/ext2/inode.c:785) ext2
[ 265.248757][ T4263] ? invalidate_bh_lrus_cpu (fs/buffer.c:2070) 
[ 265.255144][ T4263] ? ext2_iomap_begin (fs/ext2/inode.c:785) ext2
[ 265.261773][ T4263] block_write_begin (fs/buffer.c:2152 fs/buffer.c:2211) 
[ 265.267589][ T4263] ext2_write_begin (fs/ext2/inode.c:924) ext2
[ 265.273935][ T4263] generic_perform_write (mm/filemap.c:3930) 
[ 265.280191][ T4263] ? preempt_notifier_dec (kernel/sched/core.c:10130) 
[ 265.286318][ T4263] ? do_sync_mmap_readahead (mm/filemap.c:3891) 
[ 265.292783][ T4263] ? file_update_time (fs/inode.c:2170) 
[ 265.298599][ T4263] generic_file_write_iter (include/linux/fs.h:807 mm/filemap.c:4059) 
[ 265.304874][ T4263] vfs_write (include/linux/fs.h:2085 fs/read_write.c:497 fs/read_write.c:590) 
[ 265.310003][ T4263] ? kernel_write (fs/read_write.c:571) 
[ 265.315543][ T4263] ksys_write (fs/read_write.c:643) 
[ 265.320595][ T4263] ? __ia32_sys_read (fs/read_write.c:633) 
[ 265.326141][ T4263] ? do_user_addr_fault (include/linux/mm.h:689 arch/x86/mm/fault.c:1366) 
[ 265.332146][ T4263] do_syscall_64 (arch/x86/entry/common.c:52 arch/x86/entry/common.c:83) 
[ 265.337434][ T4263] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) 
[  265.344113][ T4263] RIP: 0033:0x7f4d3471c473
[ 265.349297][ T4263] Code: 8b 15 21 2a 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b7 0f 1f 00 64 8b 04 25 18 00 00 00 85 c0 75 14 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 55 c3 0f 1f 40 00 48 83 ec 28 48 89 54 24 18
All code
========
   0:	8b 15 21 2a 0e 00    	mov    0xe2a21(%rip),%edx        # 0xe2a27
   6:	f7 d8                	neg    %eax
   8:	64 89 02             	mov    %eax,%fs:(%rdx)
   b:	48 c7 c0 ff ff ff ff 	mov    $0xffffffffffffffff,%rax
  12:	eb b7                	jmp    0xffffffffffffffcb
  14:	0f 1f 00             	nopl   (%rax)
  17:	64 8b 04 25 18 00 00 	mov    %fs:0x18,%eax
  1e:	00 
  1f:	85 c0                	test   %eax,%eax
  21:	75 14                	jne    0x37
  23:	b8 01 00 00 00       	mov    $0x1,%eax
  28:	0f 05                	syscall 
  2a:*	48 3d 00 f0 ff ff    	cmp    $0xfffffffffffff000,%rax		<-- trapping instruction
  30:	77 55                	ja     0x87
  32:	c3                   	retq   
  33:	0f 1f 40 00          	nopl   0x0(%rax)
  37:	48 83 ec 28          	sub    $0x28,%rsp
  3b:	48 89 54 24 18       	mov    %rdx,0x18(%rsp)

Code starting with the faulting instruction
===========================================
   0:	48 3d 00 f0 ff ff    	cmp    $0xfffffffffffff000,%rax
   6:	77 55                	ja     0x5d
   8:	c3                   	retq   
   9:	0f 1f 40 00          	nopl   0x0(%rax)
   d:	48 83 ec 28          	sub    $0x28,%rsp
  11:	48 89 54 24 18       	mov    %rdx,0x18(%rsp)


The kernel config and materials to reproduce are available at:
https://download.01.org/0day-ci/archive/20240119/202401191032.c8606703-oliver.sang@intel.com



-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


^ permalink raw reply	[flat|nested] 4+ messages in thread

end of thread, other threads:[~2024-01-19  2:36 UTC | newest]

Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2024-01-11 19:25 [PATCH] filemap: Convert generic_perform_write() to support large folios Matthew Wilcox (Oracle)
2024-01-11 19:57 ` Matthew Wilcox
2024-01-11 23:03   ` Matthew Wilcox
2024-01-19  2:36 ` kernel test robot

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).