public inbox for linux-kernel@vger.kernel.org
 help / color / mirror / Atom feed
From: David Howells <dhowells@redhat.com>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: dhowells@redhat.com, kernel test robot <oliver.sang@intel.com>,
	oe-lkp@lists.linux.dev, lkp@intel.com,
	linux-kernel@vger.kernel.org,
	Christian Brauner <brauner@kernel.org>,
	Alexander Viro <viro@zeniv.linux.org.uk>,
	Jens Axboe <axboe@kernel.dk>, Christoph Hellwig <hch@lst.de>,
	Christian Brauner <christian@brauner.io>,
	Matthew Wilcox <willy@infradead.org>,
	David Laight <David.Laight@aculab.com>,
	ying.huang@intel.com, feng.tang@intel.com, fengwei.yin@intel.com
Subject: Re: [linus:master] [iov_iter] c9eec08bac: vm-scalability.throughput -16.9% regression
Date: Wed, 15 Nov 2023 18:35:34 +0000	[thread overview]
Message-ID: <4007890.1700073334@warthog.procyon.org.uk> (raw)
In-Reply-To: <CAHk-=wjCUckvZUQf7gqp2ziJUWxVpikM_6srFdbcNdBJTxExRg@mail.gmail.com>

Linus Torvalds <torvalds@linux-foundation.org> wrote:

> which makes *zero* sense. It first checks that the length is at
> least 8 bytes, then it moves *one* word by hand, then it aligns the
> code to 8 bytes remaining, and does the remaining (possibly
> overlapping at the beginning) words as one "rep movsq",

That's not what I see.  See attached for a dump of _copy_from_iter from my
kernel.  It's just using REP MOVSB.

For reference, the compiler is gcc-13.2.1-3.fc39.x86_64 with
binutils-2.40-13.fc39.x86_64.

David
---

(gdb) disas _copy_from_iter
Dump of assembler code for function _copy_from_iter:
    <+0>:	push   %r15
    <+2>:	push   %r14
    <+4>:	push   %r13
    <+6>:	push   %r12
    <+8>:	push   %rbp
    <+9>:	push   %rbx
    <+10>:	sub    $0x40,%rsp
    <+14>:	mov    %gs:0x28,%rax
    <+23>:	mov    %rax,0x38(%rsp)
    <+28>:	xor    %eax,%eax
    <+30>:	cmpb   $0x0,0x3(%rdx)
    <+34>:	je     0xffffffff81770334 <_copy_from_iter+50>
    <+36>:	cmpb   $0x0,0x1(%rdx)
    <+40>:	mov    %rdi,%r12
    <+43>:	mov    %rdx,%rbx
    <+46>:	je     0xffffffff81770364 <_copy_from_iter+98>
    <+48>:	jmp    0xffffffff8177033d <_copy_from_iter+59>
    <+50>:	ud2
    <+52>:	xor    %ebp,%ebp
    <+54>:	jmp    0xffffffff8177067e <_copy_from_iter+892>

    <+59>:	mov    0x38(%rsp),%rax
    <+64>:	sub    %gs:0x28,%rax
    <+73>:	jne    0xffffffff8177068e <_copy_from_iter+908>
    <+79>:	add    $0x40,%rsp
    <+83>:	pop    %rbx
    <+84>:	pop    %rbp
    <+85>:	pop    %r12
    <+87>:	pop    %r13
    <+89>:	pop    %r14
    <+91>:	pop    %r15
    <+93>:	jmp    0xffffffff8176ee3f <__copy_from_iter_mc>

    <+98>:	mov    0x18(%rdx),%rax
    <+102>:	cmp    %rax,%rsi
    <+105>:	cmova  %rax,%rsi
    <+109>:	test   %rsi,%rsi
    <+112>:	mov    %rsi,%rbp
    <+115>:	je     0xffffffff8177067e <_copy_from_iter+892>

    <+121>:	mov    (%rdx),%dl
    <+123>:	test   %dl,%dl		# ITER_UBUF
    <+125>:	jne    0xffffffff817703cc <_copy_from_iter+202>
    <+127>:	mov    0x8(%rbx),%rsi
    <+131>:	mov    %rbp,%rcx
    <+134>:	add    0x10(%rbx),%rsi
    <+138>:	mov    %rsi,%rdx
    <+141>:	mov    %rbp,%rsi
    <+144>:	mov    %rdx,%rdi
    <+147>:	call   0xffffffff8176ec9e <__access_ok>
    <+152>:	test   %al,%al
    <+154>:	je     0xffffffff817703af <_copy_from_iter+173>
    <+156>:	nop
    <+157>:	nop
    <+158>:	nop
    <+159>:	mov    %r12,%rdi
    <+162>:	mov    %rdx,%rsi
    <+165>:	rep movsb %ds:(%rsi),%es:(%rdi)
    <+167>:	nop
    <+168>:	nop
    <+169>:	nop
    <+170>:	nop
    <+171>:	nop
    <+172>:	nop
    <+173>:	mov    %rbp,%rax
    <+176>:	sub    %rcx,%rax
    <+179>:	add    0x18(%rbx),%rcx
    <+183>:	add    %rax,0x8(%rbx)
    <+187>:	sub    %rbp,%rcx
    <+190>:	mov    %rax,%rbp
    <+193>:	mov    %rcx,0x18(%rbx)
    <+197>:	jmp    0xffffffff8177067e <_copy_from_iter+892>

    <+202>:	cmp    $0x1,%dl		# ITER_IOVEC
    <+205>:	jne    0xffffffff8177044c <_copy_from_iter+330>
    <+207>:	mov    0x10(%rbx),%r9
    <+211>:	mov    %rsi,%r8
    <+214>:	xor    %ebp,%ebp
    <+216>:	mov    0x8(%rbx),%r10
    <+220>:	mov    0x8(%r9),%rdx
    <+224>:	sub    %r10,%rdx
    <+227>:	cmp    %r8,%rdx
    <+230>:	cmova  %r8,%rdx
    <+234>:	test   %rdx,%rdx
    <+237>:	je     0xffffffff81770433 <_copy_from_iter+305>
    <+239>:	mov    (%r9),%r11
    <+242>:	mov    %rdx,%rsi
    <+245>:	mov    %rdx,%rcx
    <+248>:	add    %r10,%r11
    <+251>:	mov    %r11,%rdi
    <+254>:	call   0xffffffff8176ec9e <__access_ok>
    <+259>:	test   %al,%al
    <+261>:	je     0xffffffff8177041b <_copy_from_iter+281>
    <+263>:	nop
    <+264>:	nop
    <+265>:	nop
    <+266>:	lea    (%r12,%rbp,1),%rdi
    <+270>:	mov    %r11,%rsi
    <+273>:	rep movsb %ds:(%rsi),%es:(%rdi)
    <+275>:	nop
    <+276>:	nop
    <+277>:	nop
    <+278>:	nop
    <+279>:	nop
    <+280>:	nop
    <+281>:	mov    %rdx,%rax
    <+284>:	sub    %rdx,%r8
    <+287>:	sub    %rcx,%rax
    <+290>:	add    %rcx,%r8
    <+293>:	add    %rax,%rbp
    <+296>:	add    %rax,%r10
    <+299>:	cmp    0x8(%r9),%r10
    <+303>:	jb     0xffffffff81770444 <_copy_from_iter+322>
    <+305>:	add    $0x10,%r9
    <+309>:	xor    %r10d,%r10d
    <+312>:	test   %r8,%r8
    <+315>:	jne    0xffffffff817703de <_copy_from_iter+220>
    <+317>:	jmp    0xffffffff81770544 <_copy_from_iter+578>
    <+322>:	mov    %r10,%r8
    <+325>:	jmp    0xffffffff81770544 <_copy_from_iter+578>

    <+330>:	cmp    $0x2,%dl		# ITER_BVEC
    <+333>:	jne    0xffffffff817704ee <_copy_from_iter+492>
    <+339>:	mov    0x10(%rbx),%r8
    <+343>:	mov    %rsi,%r11
    <+346>:	xor    %ebp,%ebp
    <+348>:	mov    $0x1000,%r10d
    <+354>:	mov    0x8(%rbx),%r9
    <+358>:	mov    0xc(%r8),%ecx
    <+362>:	add    %r9,%rcx
    <+365>:	mov    %rcx,%rdi
    <+368>:	and    $0xfff,%ecx
    <+374>:	shr    $0xc,%rdi
    <+378>:	shl    $0x6,%rdi
    <+382>:	add    (%r8),%rdi
    <+385>:	call   0xffffffff8176e72e <kmap_local_page>
    <+390>:	mov    %r10,%rdx
    <+393>:	mov    %rax,%rsi
    <+396>:	mov    0x8(%r8),%eax
    <+400>:	sub    %r9,%rax
    <+403>:	cmp    %r11,%rax
    <+406>:	cmova  %r11,%rax
    <+410>:	sub    %rcx,%rdx
    <+413>:	cmp    %rdx,%rax
    <+416>:	cmova  %rdx,%rax
    <+420>:	add    %rcx,%rsi
    <+423>:	lea    (%r12,%rbp,1),%rdx
    <+427>:	mov    %eax,%ecx
    <+429>:	sub    %rax,%r11
    <+432>:	mov    %rdx,%rdi
    <+435>:	add    %rax,%rbp
    <+438>:	add    %rax,%r9
    <+441>:	rep movsb %ds:(%rsi),%es:(%rdi)
    <+443>:	mov    0x8(%r8),%eax
    <+447>:	cmp    %rax,%r9
    <+450>:	jb     0xffffffff817704cd <_copy_from_iter+459>
    <+452>:	add    $0x10,%r8
    <+456>:	xor    %r9d,%r9d
    <+459>:	test   %r11,%r11
    <+462>:	jne    0xffffffff81770468 <_copy_from_iter+358>
    <+464>:	mov    %r8,%rax
    <+467>:	sub    0x10(%rbx),%rax
    <+471>:	mov    %r9,0x8(%rbx)
    <+475>:	mov    %r8,0x10(%rbx)
    <+479>:	sar    $0x4,%rax
    <+483>:	sub    %rax,0x20(%rbx)
    <+487>:	jmp    0xffffffff81770671 <_copy_from_iter+879>

    <+492>:	cmp    $0x3,%dl		# ITER_KVEC
    <+495>:	jne    0xffffffff81770560 <_copy_from_iter+606>
    <+497>:	mov    0x10(%rbx),%r9
    <+501>:	mov    %rsi,%r8
    <+504>:	xor    %ebp,%ebp
    <+506>:	mov    0x8(%rbx),%rdx
    <+510>:	mov    0x8(%r9),%rax
    <+514>:	sub    %rdx,%rax
    <+517>:	cmp    %r8,%rax
    <+520>:	cmova  %r8,%rax
    <+524>:	test   %rax,%rax
    <+527>:	je     0xffffffff81770534 <_copy_from_iter+562>
    <+529>:	mov    (%r9),%rsi
    <+532>:	lea    (%r12,%rbp,1),%r10
    <+536>:	mov    %rax,%rcx
    <+539>:	add    %rax,%rbp
    <+542>:	mov    %r10,%rdi
    <+545>:	sub    %rax,%r8
    <+548>:	add    %rdx,%rsi
    <+551>:	add    %rax,%rdx
    <+554>:	rep movsb %ds:(%rsi),%es:(%rdi)
    <+556>:	cmp    0x8(%r9),%rdx
    <+560>:	jb     0xffffffff81770541 <_copy_from_iter+575>
    <+562>:	add    $0x10,%r9
    <+566>:	xor    %edx,%edx
    <+568>:	test   %r8,%r8
    <+571>:	jne    0xffffffff81770500 <_copy_from_iter+510>
    <+573>:	jmp    0xffffffff81770544 <_copy_from_iter+578>
    <+575>:	mov    %rdx,%r8
    <+578>:	mov    %r9,%rax
    <+581>:	sub    0x10(%rbx),%rax
    <+585>:	mov    %r8,0x8(%rbx)
    <+589>:	mov    %r9,0x10(%rbx)
    <+593>:	sar    $0x4,%rax
    <+597>:	sub    %rax,0x20(%rbx)
    <+601>:	jmp    0xffffffff81770671 <_copy_from_iter+879>

    <+606>:	cmp    $0x4,%dl		# ITER_XARRAY
    <+609>:	jne    0xffffffff81770677 <_copy_from_iter+885>
    <+615>:	movq   $0x3,0x18(%rsp)
    <+624>:	mov    0x10(%rbx),%rax
    <+628>:	xor    %edx,%edx
    <+630>:	mov    0x8(%rbx),%r14
    <+634>:	mov    %rdx,0x20(%rsp)
    <+639>:	add    0x20(%rbx),%r14
    <+643>:	mov    %rdx,0x28(%rsp)
    <+648>:	mov    %rdx,0x30(%rsp)
    <+653>:	mov    %rax,(%rsp)
    <+657>:	mov    %r14,%rax
    <+660>:	shr    $0xc,%rax
    <+664>:	mov    %rax,0x8(%rsp)
    <+669>:	xor    %eax,%eax
    <+671>:	mov    %eax,0x10(%rsp)
    <+675>:	or     $0xffffffffffffffff,%rsi
    <+679>:	mov    %rsp,%rdi
    <+682>:	mov    %rbp,%r13
    <+685>:	call   0xffffffff81d617d1 <xas_find>
    <+690>:	xor    %ebp,%ebp
    <+692>:	mov    $0x1000,%r15d
    <+698>:	mov    %rax,%r8
    <+701>:	test   %r8,%r8
    <+704>:	je     0xffffffff8177066d <_copy_from_iter+875>
    <+710>:	mov    %r8,%rsi
    <+713>:	mov    %rsp,%rdi
    <+716>:	call   0xffffffff8176e7a8 <xas_retry>
    <+721>:	test   %al,%al
    <+723>:	jne    0xffffffff8177065d <_copy_from_iter+859>
    <+729>:	test   $0x1,%r8d
    <+736>:	jne    0xffffffff81770614 <_copy_from_iter+786>
    <+738>:	mov    %r8,%rdi
    <+741>:	call   0xffffffff8176e6a4 <folio_test_hugetlb>
    <+746>:	test   %al,%al
    <+748>:	jne    0xffffffff81770618 <_copy_from_iter+790>
    <+750>:	call   0xffffffff8176e714 <folio_size>
    <+755>:	lea    (%r14,%rbp,1),%rdx
    <+759>:	lea    -0x1(%rax),%r10
    <+763>:	call   0xffffffff8176e714 <folio_size>
    <+768>:	and    %rdx,%r10
    <+771>:	sub    %r10,%rax
    <+774>:	cmp    %r13,%rax
    <+777>:	cmova  %r13,%rax
    <+781>:	mov    %rax,%r9
    <+784>:	jmp    0xffffffff81770658 <_copy_from_iter+854>
    <+786>:	ud2
    <+788>:	jmp    0xffffffff8177066d <_copy_from_iter+875>
    <+790>:	ud2
    <+792>:	jmp    0xffffffff8177066d <_copy_from_iter+875>
    <+794>:	lea    (%r12,%rbp,1),%r11
    <+798>:	mov    %r10,%rsi
    <+801>:	mov    %r8,%rdi
    <+804>:	call   0xffffffff8176e753 <kmap_local_folio>
    <+809>:	mov    %r15,%rdx
    <+812>:	mov    %r11,%rdi
    <+815>:	mov    %rax,%rsi
    <+818>:	mov    %r10,%rax
    <+821>:	and    $0xfff,%eax
    <+826>:	sub    %rax,%rdx
    <+829>:	cmp    %r9,%rdx
    <+832>:	cmova  %r9,%rdx
    <+836>:	add    %rdx,%rbp
    <+839>:	sub    %rdx,%r13
    <+842>:	mov    %edx,%ecx
    <+844>:	rep movsb %ds:(%rsi),%es:(%rdi)
    <+846>:	je     0xffffffff8177066d <_copy_from_iter+875>
    <+848>:	sub    %rdx,%r9
    <+851>:	add    %rdx,%r10
    <+854>:	test   %r9,%r9
    <+857>:	jne    0xffffffff8177061c <_copy_from_iter+794>
    <+859>:	mov    %rsp,%rdi
    <+862>:	call   0xffffffff8176f307 <xas_next_entry>
    <+867>:	mov    %rax,%r8
    <+870>:	jmp    0xffffffff817705bf <_copy_from_iter+701>
    <+875>:	add    %rbp,0x8(%rbx)
    <+879>:	sub    %rbp,0x18(%rbx)
    <+883>:	jmp    0xffffffff8177067e <_copy_from_iter+892>

    # ITER_DISCARD / default
    <+885>:	sub    %rsi,%rax
    <+888>:	mov    %rax,0x18(%rbx)
    <+892>:	mov    0x38(%rsp),%rax
    <+897>:	sub    %gs:0x28,%rax
    <+906>:	je     0xffffffff81770693 <_copy_from_iter+913>
    <+908>:	call   0xffffffff81d67d8d <__stack_chk_fail>
    <+913>:	add    $0x40,%rsp
    <+917>:	mov    %rbp,%rax
    <+920>:	pop    %rbx
    <+921>:	pop    %rbp
    <+922>:	pop    %r12
    <+924>:	pop    %r13
    <+926>:	pop    %r14
    <+928>:	pop    %r15
    <+930>:	jmp    0xffffffff81d745a0 <__x86_return_thunk>


  reply	other threads:[~2023-11-15 18:35 UTC|newest]

Thread overview: 49+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-11-07  1:40 [linus:master] [iov_iter] c9eec08bac: vm-scalability.throughput -16.9% regression kernel test robot
2023-11-15 12:48 ` David Howells
2023-11-15 13:18 ` David Howells
2023-11-15 15:20 ` David Howells
2023-11-15 16:53   ` Linus Torvalds
2023-11-15 17:38     ` Linus Torvalds
2023-11-15 18:35       ` David Howells [this message]
2023-11-15 18:45         ` Linus Torvalds
2023-11-15 19:09           ` Linus Torvalds
2023-11-15 20:54           ` David Howells
2023-11-15 18:38       ` Linus Torvalds
2023-11-15 19:09         ` Borislav Petkov
2023-11-15 19:15           ` Linus Torvalds
2023-11-15 20:07             ` Linus Torvalds
2023-11-16 10:07               ` David Laight
2023-11-16 10:14                 ` David Howells
2023-11-16 11:38                   ` David Laight
2023-11-15 19:26           ` Linus Torvalds
2023-11-16 15:44             ` Borislav Petkov
2023-11-16 16:44               ` David Howells
2023-11-17 11:35                 ` Borislav Petkov
2023-11-17 14:12                   ` David Howells
2023-11-17 16:09                     ` Borislav Petkov
2023-11-17 16:32                       ` Linus Torvalds
2023-11-17 16:44                         ` Linus Torvalds
2023-11-17 19:12                           ` Borislav Petkov
2023-11-17 21:57                             ` Linus Torvalds
2023-11-20 13:32                               ` David Howells
2023-11-20 16:06                                 ` Linus Torvalds
2023-11-20 16:09                                 ` David Laight
2023-11-16 16:48               ` Linus Torvalds
2023-11-16 16:58                 ` David Laight
2023-11-17 11:44                 ` Borislav Petkov
2023-11-17 12:09                   ` Jakub Jelinek
2023-11-17 12:18                     ` Borislav Petkov
2023-11-17 13:09                   ` David Laight
2023-11-17 13:36                     ` Linus Torvalds
2023-11-17 15:20                       ` David Laight
2023-11-15 21:43         ` David Howells
2023-11-15 21:50           ` Linus Torvalds
2023-11-15 21:59             ` Borislav Petkov
2023-11-15 22:59             ` David Howells
2023-11-16  3:26               ` Linus Torvalds
2023-11-16 16:55                 ` David Laight
2023-11-16 17:24                   ` Linus Torvalds
2023-11-16 22:53                     ` David Laight
2023-11-16 21:09                 ` David Howells
2023-11-16 22:36                   ` Linus Torvalds
2023-11-20 11:52             ` Borislav Petkov

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=4007890.1700073334@warthog.procyon.org.uk \
    --to=dhowells@redhat.com \
    --cc=David.Laight@aculab.com \
    --cc=axboe@kernel.dk \
    --cc=brauner@kernel.org \
    --cc=christian@brauner.io \
    --cc=feng.tang@intel.com \
    --cc=fengwei.yin@intel.com \
    --cc=hch@lst.de \
    --cc=linux-kernel@vger.kernel.org \
    --cc=lkp@intel.com \
    --cc=oe-lkp@lists.linux.dev \
    --cc=oliver.sang@intel.com \
    --cc=torvalds@linux-foundation.org \
    --cc=viro@zeniv.linux.org.uk \
    --cc=willy@infradead.org \
    --cc=ying.huang@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox.