* [PATCH next 1/3] iov: Remove access_ok() from import_iovec()
2025-03-21 22:45 [PATCH next 0/3] iov: Optimise user copies David Laight
@ 2025-03-21 22:45 ` David Laight
2025-03-21 22:45 ` [PATCH next 2/3] iov: Use masked user accesses David Laight
` (3 subsequent siblings)
4 siblings, 0 replies; 9+ messages in thread
From: David Laight @ 2025-03-21 22:45 UTC (permalink / raw)
To: linux-kernel
Cc: David Laight, Linus Torvalds, Jens Axboe, David Howells,
Matthew Wilcox, Andrew Morton, Alexander Viro
There is no point checking the validity of the user address when reading
the iovec[] from userspace.
It is checked again before the user copy itself.
Added in 09fc68dc66f75 (iov_iter: saner checks on copyin/copyout)
Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
lib/iov_iter.c | 9 ---------
1 file changed, 9 deletions(-)
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 65f550cb5081..623ec43e049a 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1471,13 +1471,6 @@ ssize_t __import_iovec(int type, const struct iovec __user *uvec,
for (seg = 0; seg < nr_segs; seg++) {
ssize_t len = (ssize_t)iov[seg].iov_len;
- if (!access_ok(iov[seg].iov_base, len)) {
- if (iov != *iovp)
- kfree(iov);
- *iovp = NULL;
- return -EFAULT;
- }
-
if (len > MAX_RW_COUNT - total_len) {
len = MAX_RW_COUNT - total_len;
iov[seg].iov_len = len;
@@ -1528,8 +1521,6 @@ int import_ubuf(int rw, void __user *buf, size_t len, struct iov_iter *i)
{
if (len > MAX_RW_COUNT)
len = MAX_RW_COUNT;
- if (unlikely(!access_ok(buf, len)))
- return -EFAULT;
iov_iter_ubuf(i, rw, buf, len);
return 0;
--
2.39.5
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH next 2/3] iov: Use masked user accesses
2025-03-21 22:45 [PATCH next 0/3] iov: Optimise user copies David Laight
2025-03-21 22:45 ` [PATCH next 1/3] iov: Remove access_ok() from import_iovec() David Laight
@ 2025-03-21 22:45 ` David Laight
2025-03-21 22:45 ` [PATCH next 3/3] iov: Optimise __import_iovec_ubuf() David Laight
` (2 subsequent siblings)
4 siblings, 0 replies; 9+ messages in thread
From: David Laight @ 2025-03-21 22:45 UTC (permalink / raw)
To: linux-kernel
Cc: David Laight, Linus Torvalds, Jens Axboe, David Howells,
Matthew Wilcox, Andrew Morton, Alexander Viro
Check can_do_masked_user_access() and use mask_user_address() or
masked_user_access_begin() to optimise user access checks.
Read iov->buf before iov->len to ensure accessing the 'masked'
address fails when kernel addresses are converted to ~0ul.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
lib/iov_iter.c | 44 ++++++++++++++++++++++++++++----------------
1 file changed, 28 insertions(+), 16 deletions(-)
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 623ec43e049a..796ef647bff2 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -19,12 +19,15 @@ size_t copy_to_user_iter(void __user *iter_to, size_t progress,
{
if (should_fail_usercopy())
return len;
- if (access_ok(iter_to, len)) {
- from += progress;
- instrument_copy_to_user(iter_to, from, len);
- len = raw_copy_to_user(iter_to, from, len);
- }
- return len;
+
+ if (can_do_masked_user_access())
+ iter_to = mask_user_address(iter_to);
+ else if (!access_ok(iter_to, len))
+ return len;
+
+ from += progress;
+ instrument_copy_to_user(iter_to, from, len);
+ return raw_copy_to_user(iter_to, from, len);
}
static __always_inline
@@ -49,12 +52,17 @@ size_t copy_from_user_iter(void __user *iter_from, size_t progress,
if (should_fail_usercopy())
return len;
- if (access_ok(iter_from, len)) {
- to += progress;
- instrument_copy_from_user_before(to, iter_from, len);
- res = raw_copy_from_user(to, iter_from, len);
- instrument_copy_from_user_after(to, iter_from, len, res);
- }
+
+ if (can_do_masked_user_access())
+ iter_from = mask_user_address(iter_from);
+ else if (!access_ok(iter_from, len))
+ return len;
+
+ to += progress;
+ instrument_copy_from_user_before(to, iter_from, len);
+ res = raw_copy_from_user(to, iter_from, len);
+ instrument_copy_from_user_after(to, iter_from, len, res);
+
return res;
}
@@ -1326,15 +1334,17 @@ static __noclone int copy_compat_iovec_from_user(struct iovec *iov,
int ret = -EFAULT;
u32 i;
- if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
+ if (can_do_masked_user_access())
+ uiov = masked_user_access_begin(uiov);
+ else if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
return -EFAULT;
for (i = 0; i < nr_segs; i++) {
compat_uptr_t buf;
compat_ssize_t len;
- unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
unsafe_get_user(buf, &uiov[i].iov_base, uaccess_end);
+ unsafe_get_user(len, &uiov[i].iov_len, uaccess_end);
/* check for compat_size_t not fitting in compat_ssize_t .. */
if (len < 0) {
@@ -1356,15 +1366,17 @@ static __noclone int copy_iovec_from_user(struct iovec *iov,
{
int ret = -EFAULT;
- if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
+ if (can_do_masked_user_access())
+ uiov = masked_user_access_begin(uiov);
+ else if (!user_access_begin(uiov, nr_segs * sizeof(*uiov)))
return -EFAULT;
do {
void __user *buf;
ssize_t len;
- unsafe_get_user(len, &uiov->iov_len, uaccess_end);
unsafe_get_user(buf, &uiov->iov_base, uaccess_end);
+ unsafe_get_user(len, &uiov->iov_len, uaccess_end);
/* check for size_t not fitting in ssize_t .. */
if (unlikely(len < 0)) {
--
2.39.5
^ permalink raw reply related [flat|nested] 9+ messages in thread
* [PATCH next 3/3] iov: Optimise __import_iovec_ubuf()
2025-03-21 22:45 [PATCH next 0/3] iov: Optimise user copies David Laight
2025-03-21 22:45 ` [PATCH next 1/3] iov: Remove access_ok() from import_iovec() David Laight
2025-03-21 22:45 ` [PATCH next 2/3] iov: Use masked user accesses David Laight
@ 2025-03-21 22:45 ` David Laight
2025-03-21 23:35 ` [PATCH next 0/3] iov: Optimise user copies Linus Torvalds
2025-03-22 14:36 ` Jens Axboe
4 siblings, 0 replies; 9+ messages in thread
From: David Laight @ 2025-03-21 22:45 UTC (permalink / raw)
To: linux-kernel
Cc: David Laight, Linus Torvalds, Jens Axboe, David Howells,
Matthew Wilcox, Andrew Morton, Alexander Viro
__import_iovec_ubuf() is used to read a single entry iovec[] into
the simpler 'ubuf' format.
Inline simplified bodies of copy_iovec_from_user() and
copy_compat_iovec_from_user() to avoid the overhead of the loop.
Signed-off-by: David Laight <david.laight.linux@gmail.com>
---
I've left in the assignments to iov->iov_base and iov->iov_len
but they may not be needed.
lib/iov_iter.c | 44 +++++++++++++++++++++++++++++++++++---------
1 file changed, 35 insertions(+), 9 deletions(-)
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 796ef647bff2..5eff3a307b71 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1437,22 +1437,48 @@ static ssize_t __import_iovec_ubuf(int type, const struct iovec __user *uvec,
struct iovec **iovp, struct iov_iter *i,
bool compat)
{
+ const struct compat_iovec __user *compat_uvec;
struct iovec *iov = *iovp;
- ssize_t ret;
+ void __user *buf;
+ ssize_t len;
+ int ret;
*iovp = NULL;
- if (compat)
- ret = copy_compat_iovec_from_user(iov, uvec, 1);
- else
- ret = copy_iovec_from_user(iov, uvec, 1);
- if (unlikely(ret))
- return ret;
+ if (can_do_masked_user_access())
+ uvec = masked_user_access_begin(uvec);
+ else if (!user_access_begin(uvec, compat ? sizeof (*compat_uvec) : sizeof (*uvec)))
+ return -EFAULT;
+
+ if (unlikely(compat)) {
+ compat_uvec = (const void __user *)uvec;
+ compat_uptr_t compat_buf;
+ compat_ssize_t compat_len;
+
+ unsafe_get_user(compat_buf, &compat_uvec->iov_base, uaccess_end_efault);
+ unsafe_get_user(compat_len, &compat_uvec->iov_len, uaccess_end_efault);
+ buf = compat_ptr(compat_buf);
+ len = compat_len;
+ } else {
+ unsafe_get_user(buf, &uvec->iov_base, uaccess_end_efault);
+ unsafe_get_user(len, &uvec->iov_len, uaccess_end_efault);
+ }
+ user_access_end();
- ret = import_ubuf(type, iov->iov_base, iov->iov_len, i);
+ /* check for size_t not fitting in ssize_t .. */
+ if (unlikely(len < 0))
+ return -EINVAL;
+
+ iov->iov_base = buf;
+ iov->iov_len = len;
+ ret = import_ubuf(type, buf, len, i);
if (unlikely(ret))
return ret;
- return i->count;
+ return len;
+
+uaccess_end_efault:
+ user_access_end();
+ return -EFAULT;
}
ssize_t __import_iovec(int type, const struct iovec __user *uvec,
--
2.39.5
^ permalink raw reply related [flat|nested] 9+ messages in thread
* Re: [PATCH next 0/3] iov: Optimise user copies
2025-03-21 22:45 [PATCH next 0/3] iov: Optimise user copies David Laight
` (2 preceding siblings ...)
2025-03-21 22:45 ` [PATCH next 3/3] iov: Optimise __import_iovec_ubuf() David Laight
@ 2025-03-21 23:35 ` Linus Torvalds
2025-03-22 10:08 ` David Laight
` (2 more replies)
2025-03-22 14:36 ` Jens Axboe
4 siblings, 3 replies; 9+ messages in thread
From: Linus Torvalds @ 2025-03-21 23:35 UTC (permalink / raw)
To: David Laight
Cc: linux-kernel, Jens Axboe, David Howells, Matthew Wilcox,
Andrew Morton, Alexander Viro
On Fri, 21 Mar 2025 at 15:46, David Laight <david.laight.linux@gmail.com> wrote:
>
> The speculation barrier in access_ok() is expensive.
>
> The first patch removes the initial checks when reading the iovec[].
> The checks are repeated before the actual copy.
>
> The second patch uses 'user address masking' if supported.
>
> The third removes a lot of code for single entry iovec[].
Ack, except I'd really like to see numbers for things that claim to
remove expensive stuff.
But yeah, the patches look sane.
Linus
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH next 0/3] iov: Optimise user copies
2025-03-21 23:35 ` [PATCH next 0/3] iov: Optimise user copies Linus Torvalds
@ 2025-03-22 10:08 ` David Laight
2025-03-22 22:37 ` David Laight
2025-03-29 11:31 ` David Laight
2 siblings, 0 replies; 9+ messages in thread
From: David Laight @ 2025-03-22 10:08 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Jens Axboe, David Howells, Matthew Wilcox,
Andrew Morton, Alexander Viro
On Fri, 21 Mar 2025 16:35:52 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Fri, 21 Mar 2025 at 15:46, David Laight <david.laight.linux@gmail.com> wrote:
> >
> > The speculation barrier in access_ok() is expensive.
> >
> > The first patch removes the initial checks when reading the iovec[].
> > The checks are repeated before the actual copy.
> >
> > The second patch uses 'user address masking' if supported.
> >
> > The third removes a lot of code for single entry iovec[].
>
> Ack, except I'd really like to see numbers for things that claim to
> remove expensive stuff.
Testing readv() of /dev/zero or writev() of /dev/null probably show
most gain.
I did do some allmodconfig builds and got no change, but I might have the
lfence compiled out of access_ok().
In any case kernel builds are pretty much user space limited.
David
>
> But yeah, the patches look sane.
>
> Linus
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH next 0/3] iov: Optimise user copies
2025-03-21 23:35 ` [PATCH next 0/3] iov: Optimise user copies Linus Torvalds
2025-03-22 10:08 ` David Laight
@ 2025-03-22 22:37 ` David Laight
2025-03-29 11:31 ` David Laight
2 siblings, 0 replies; 9+ messages in thread
From: David Laight @ 2025-03-22 22:37 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Jens Axboe, David Howells, Matthew Wilcox,
Andrew Morton, Alexander Viro
On Fri, 21 Mar 2025 16:35:52 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Fri, 21 Mar 2025 at 15:46, David Laight <david.laight.linux@gmail.com> wrote:
> >
> > The speculation barrier in access_ok() is expensive.
> >
> > The first patch removes the initial checks when reading the iovec[].
> > The checks are repeated before the actual copy.
> >
> > The second patch uses 'user address masking' if supported.
> >
> > The third removes a lot of code for single entry iovec[].
>
> Ack, except I'd really like to see numbers for things that claim to
> remove expensive stuff.
Except that some of the 'expensive stuff' is missing!
copy_from_user_iter() does:
if (access_ok())
raw_copy_from_user();
So it is missing the barrier_nospec().
The error handling is also different from _inline_copy_from_user().
(It doesn't zero-fill after a partial read.)
The observant will also notice that it is missing the massive
performance hit (and code bloat) of check_copy_size() (usercopy hardening).
Talking of performance I've dug out my clock cycle measuring code
(still full of different ipcsum functions).
I'm sure I got 12 bytes/clock on my i7-7 for the loop in the current kernel,
but it is only giving 10 today (possibly I don't have the latest version).
OTOH my new zen5 runs the adxo/adxc loop at 16 bytes/clock (i7-7 manages 12).
I'm going to try to find time for some memcpy() experiments.
David
>
> But yeah, the patches look sane.
>
> Linus
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH next 0/3] iov: Optimise user copies
2025-03-21 23:35 ` [PATCH next 0/3] iov: Optimise user copies Linus Torvalds
2025-03-22 10:08 ` David Laight
2025-03-22 22:37 ` David Laight
@ 2025-03-29 11:31 ` David Laight
2 siblings, 0 replies; 9+ messages in thread
From: David Laight @ 2025-03-29 11:31 UTC (permalink / raw)
To: Linus Torvalds
Cc: linux-kernel, Jens Axboe, David Howells, Matthew Wilcox,
Andrew Morton, Alexander Viro
On Fri, 21 Mar 2025 16:35:52 -0700
Linus Torvalds <torvalds@linux-foundation.org> wrote:
> On Fri, 21 Mar 2025 at 15:46, David Laight <david.laight.linux@gmail.com> wrote:
> >
> > The speculation barrier in access_ok() is expensive.
> >
> > The first patch removes the initial checks when reading the iovec[].
> > The checks are repeated before the actual copy.
> >
> > The second patch uses 'user address masking' if supported.
> >
> > The third removes a lot of code for single entry iovec[].
>
> Ack, except I'd really like to see numbers for things that claim to
> remove expensive stuff.
I've finally managed to take some measurements that make sense.
(measurements on a zen5).
Values are from:
mfence; start = rdpmc; mfence;
// A bit of setup code and an indirect function call
syscall(SYS_gettid/pwritev/preadv, ...)
mfence; end = rdpmc; mfence;
cycles = end - start;
The prints are done after all the tests.
The preadv/pwritev are for 8 buffers of 1 byte at offset 0 to /dev/zero.
gettid() is used as a base, a completely empty test is ~180 clocks.
(I've deleted the other 30 results for gettid - they are very consistent.)
I think they show an improvement, but it is similar to the change in gettid.
original
bac 3790 396 397 397 397 397 397 397 397 397 gettid
8 8792 1088 739 678 674 674 669 669 669 669 pwrite
8 717 676 669 669 669 669 669 669 669 669 pwrite
8 692 669 669 669 669 669 671 669 669 669 pwrite
8 692 669 669 669 669 669 669 669 669 669 pwrite
8 16744 1221 805 827 824 847 823 819 846 838 pread
8 865 857 827 828 808 820 845 828 821 824 pread
8 823 804 802 813 811 845 836 839 815 813 pread
8 862 813 853 846 847 828 821 820 806 846 pread
address masking
bdc 3218 592 391 391 391 391 391 391 391 391 gettid
8 7701 1091 1913 676 672 665 665 665 665 665 pwrite
8 722 669 665 665 665 665 665 665 665 665 pwrite
8 690 665 665 665 665 665 665 665 665 665 pwrite
8 690 666 665 665 665 665 665 665 665 665 pwrite
8 13532 1114 802 812 798 810 824 806 825 829 pread
8 808 805 806 838 799 829 839 831 843 796 pread
8 812 825 835 802 803 806 809 829 827 806 pread
8 807 801 842 804 806 828 811 824 810 808 pread
I ran an extra test with a barrier_nospec() following the access_ok() in the
'copy from user' path.
I'm not sure if its absence is an oversight or a valid decision on the assumption
that the data being read is just 'data' and not used for any control decisions.
Oddly this makes pread more expensive even though the change is in the write path.
I suspect the cache alignment of all the code changed.
fenced
bc5 4681 394 395 395 395 395 395 395 395 395 gettid
8 8988 1005 1114 772 688 684 684 684 684 684 pwrite
8 742 688 684 684 684 684 684 684 684 684 pwrite
8 709 681 684 684 684 684 684 684 684 685 pwrite
8 690 684 684 684 684 684 684 681 684 684 pwrite
8 13259 1025 813 834 825 833 833 833 830 815 pread
8 821 819 837 827 837 837 837 837 837 837 pread
8 819 834 819 833 837 834 809 833 837 837 pread
8 816 819 833 837 837 837 837 837 834 809 pread
David
~
>
> But yeah, the patches look sane.
>
> Linus
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH next 0/3] iov: Optimise user copies
2025-03-21 22:45 [PATCH next 0/3] iov: Optimise user copies David Laight
` (3 preceding siblings ...)
2025-03-21 23:35 ` [PATCH next 0/3] iov: Optimise user copies Linus Torvalds
@ 2025-03-22 14:36 ` Jens Axboe
4 siblings, 0 replies; 9+ messages in thread
From: Jens Axboe @ 2025-03-22 14:36 UTC (permalink / raw)
To: David Laight, linux-kernel
Cc: Linus Torvalds, David Howells, Matthew Wilcox, Andrew Morton,
Alexander Viro
On 3/21/25 4:45 PM, David Laight wrote:
> The speculation barrier in access_ok() is expensive.
>
> The first patch removes the initial checks when reading the iovec[].
> The checks are repeated before the actual copy.
>
> The second patch uses 'user address masking' if supported.
>
> The third removes a lot of code for single entry iovec[].
Looks good to me:
Reviewed-by: Jens Axboe <axboe@kernel.dk>
--
Jens Axboe
^ permalink raw reply [flat|nested] 9+ messages in thread