* Re: Optimizing small reads [not found] ` <CAHk-=wg-eq7s8UMogFCS8OJQt9hwajwKP6kzW88avbx+4JXhcA@mail.gmail.com> @ 2025-10-06 11:44 ` Kiryl Shutsemau 2025-10-06 15:50 ` Linus Torvalds 0 siblings, 1 reply; 24+ messages in thread From: Kiryl Shutsemau @ 2025-10-06 11:44 UTC (permalink / raw) To: Linus Torvalds; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Fri, Oct 03, 2025 at 10:49:36AM -0700, Linus Torvalds wrote: > I'd love it if somebody took a look. I'm definitely not going to spend > any more time on this during the merge window... Below is my take on this. Lightly tested. Some notes: - Do we want a bounded retry on read_seqcount_retry()? Maybe up to 3 iterations? - HIGHMEM support is trivial with memcpy_from_file_folio(); - I opted for a late partial read check. It would be nice to allow reads across a PAGE_SIZE boundary as long as they stay in the same folio; - Move i_size check after uptodate check. It seems to be required according to the comment in filemap_read(). But I cannot say I understand the i_size implications here. - Size of area is 256 bytes. I wonder if we want to get the fast read to work on full page chunks. Can we dedicate a page per CPU for this? I expect it to cover substantially more cases. Any comments are welcome. diff --git a/fs/inode.c b/fs/inode.c index ec9339024ac3..52163d28d630 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -482,6 +482,8 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); + seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt, + &mapping->i_pages.xa_lock); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->i_private_list); spin_lock_init(&mapping->i_private_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9e9d7c757efe..a900214f0f3a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -522,6 +522,7 @@ struct address_space { struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; void * i_private_data; + seqcount_spinlock_t i_pages_delete_seqcnt; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but diff --git a/mm/filemap.c b/mm/filemap.c index 751838ef05e5..fc26c6826392 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -138,8 +138,10 @@ static void page_cache_delete(struct address_space *mapping, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + write_seqcount_begin(&mapping->i_pages_delete_seqcnt); xas_store(&xas, shadow); xas_init_marks(&xas); + write_seqcount_end(&mapping->i_pages_delete_seqcnt); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies upon it */ @@ -2659,6 +2661,57 @@ static void filemap_end_dropbehind_read(struct folio *folio) } } +static inline unsigned long filemap_fast_read(struct address_space *mapping, + loff_t pos, char *buffer, + size_t size) +{ + XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT); + struct folio *folio; + loff_t file_size; + unsigned int seq; + + lockdep_assert_in_rcu_read_lock(); + + seq = read_seqcount_begin(&mapping->i_pages_delete_seqcnt); + + xas_reset(&xas); + folio = xas_load(&xas); + if (xas_retry(&xas, folio)) + return 0; + + if (!folio || xa_is_value(folio)) + return 0; + + if (!folio_test_uptodate(folio)) + return 0; + + /* No fast-case if readahead is supposed to started */ + if (folio_test_readahead(folio)) + return 0; + /* .. 
or mark it accessed */ + if (!folio_test_referenced(folio)) + return 0; + + /* i_size check must be after folio_test_uptodate() */ + file_size = i_size_read(mapping->host); + if (unlikely(pos >= file_size)) + return 0; + if (size > file_size - pos) + size = file_size - pos; + + /* Do the data copy */ + if (memcpy_from_file_folio(buffer, folio, pos, size) != size) { + /* No partial reads */ + return 0; + } + + /* Give up and go to slow path if raced with page_cache_delete() */ + if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq)) + return 0; + + return size; +} + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. @@ -2679,7 +2732,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct folio_batch fbatch; + union { + struct folio_batch fbatch; + __DECLARE_FLEX_ARRAY(char, buffer); + } area __uninitialized; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; @@ -2693,7 +2749,34 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); - folio_batch_init(&fbatch); + + /* + * Try a quick lockless read into the 'area' union. Note that + * this union is intentionally marked "__uninitialized", because + * any compiler initialization would be pointless since this + * can fill it will garbage. + */ + if (iov_iter_count(iter) <= sizeof(area)) { + size_t count = iov_iter_count(iter); + + /* Let's see if we can just do the read under RCU */ + rcu_read_lock(); + count = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, count); + rcu_read_unlock(); + if (count) { + size_t copied = copy_to_iter(area.buffer, count, iter); + if (unlikely(!copied)) + return already_read ? already_read : -EFAULT; + ra->prev_pos = iocb->ki_pos += copied; + file_accessed(filp); + return copied + already_read; + } + } + + /* + * This actually properly initializes the fbatch for the slow case + */ + folio_batch_init(&area.fbatch); do { cond_resched(); @@ -2709,7 +2792,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter->count, &fbatch, false); + error = filemap_get_pages(iocb, iter->count, &area.fbatch, false); if (error < 0) break; @@ -2737,11 +2820,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, * mark it as accessed the first time. 
*/ if (!pos_same_folio(iocb->ki_pos, last_pos - 1, - fbatch.folios[0])) - folio_mark_accessed(fbatch.folios[0]); + area.fbatch.folios[0])) + folio_mark_accessed(area.fbatch.folios[0]); - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; + for (i = 0; i < folio_batch_count(&area.fbatch); i++) { + struct folio *folio = area.fbatch.folios[i]; size_t fsize = folio_size(folio); size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, @@ -2772,13 +2855,13 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; + for (i = 0; i < folio_batch_count(&area.fbatch); i++) { + struct folio *folio = area.fbatch.folios[i]; filemap_end_dropbehind_read(folio); folio_put(folio); } - folio_batch_init(&fbatch); + folio_batch_init(&area.fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); -- Kiryl Shutsemau / Kirill A. Shutemov ^ permalink raw reply related [flat|nested] 24+ messages in thread
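Stripped of the filemap details, the synchronization in the patch above is the classic seqcount pattern: the deletion side bumps a sequence counter while it already holds the xarray lock, and the lockless reader copies first and only afterwards checks whether a deletion raced with it. A minimal sketch with made-up names ("obj" and "data" stand in for the address_space and the cached page contents); this is an illustration, not the patch itself:

/* Hypothetical structure: one lock-protected object with a delete seqcount */
struct obj {
        spinlock_t lock;
        seqcount_spinlock_t del_seq;            /* written under ->lock */
        char data[64];
};

static void obj_init(struct obj *o)
{
        spin_lock_init(&o->lock);
        seqcount_spinlock_init(&o->del_seq, &o->lock);
}

/* Writer side, the page_cache_delete() analogue: runs with ->lock held */
static void obj_delete(struct obj *o)
{
        lockdep_assert_held(&o->lock);
        write_seqcount_begin(&o->del_seq);
        memset(o->data, 0, sizeof(o->data));    /* "remove" the object */
        write_seqcount_end(&o->del_seq);
}

/* Reader side: lockless, copy speculatively, then validate */
static bool obj_read_fast(struct obj *o, char *buf)
{
        unsigned int seq;

        seq = read_seqcount_begin(&o->del_seq);
        memcpy(buf, o->data, sizeof(o->data));  /* may copy soon-to-be-stale data */
        /* If a delete ran in between, discard the copy and use the slow path */
        return !read_seqcount_retry(&o->del_seq, seq);
}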
* Re: Optimizing small reads 2025-10-06 11:44 ` Optimizing small reads Kiryl Shutsemau @ 2025-10-06 15:50 ` Linus Torvalds 2025-10-06 18:04 ` Kiryl Shutsemau 0 siblings, 1 reply; 24+ messages in thread From: Linus Torvalds @ 2025-10-06 15:50 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Mon, 6 Oct 2025 at 04:45, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > Below is my take on this. Lightly tested. Thanks. That looked even simpler than what I thought it would be, although I worry a bit about the effect on page_cache_delete() now being much more expensive with that spinlock. And the spinlock actually looks unnecessary, since page_cache_delete() already relies on being serialized by holding the i_pages lock. So I think you can just change the seqcount_spinlock_t to a plain seqcount_t with no locking at all, and document that external locking. > - Do we want a bounded retry on read_seqcount_retry()? > Maybe upto 3 iterations? No, I don't think it ever triggers, and I really like how this looks. And I'd go even further, and change that first seq = read_seqcount_begin(&mapping->i_pages_delete_seqcnt); into a if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq)) return 0; so that you don't even wait for any existing case. That you could even do *outside* the RCU section, but I'm not sure that buys us anything. *If* somebody ever hits it we can revisit, but I really think the whole point of this fast-path is to just deal with the common case quickly. There are going to be other things that are much more common and much more realistic, like "this is the first read, so I need to set the accessed bit". > - HIGHMEM support is trivial with memcpy_from_file_folio(); Good call. I didn't even want to think about it, and obviously never did. > - I opted for late partial read check. It would be nice allow to read > across PAGE_SIZE boundary as long as it is in the same folio Sure. When I wrote that patch, I actually worried more about the negative overhead of it not hitting at all, so I tried very hard to minimize the cases where we look up a folio speculatively only to then decide we can't use it. But as long as that if (iov_iter_count(iter) <= sizeof(area)) { is there to protect the really basic rule, I guess it's not a huge deal. > - Move i_size check after uptodate check. It seems to be required > according to the comment in filemap_read(). But I cannot say I > understand i_size implications here. I forget too, and it might be voodoo programming. > - Size of area is 256 bytes. I wounder if we want to get the fast read > to work on full page chunks. Can we dedicate a page per CPU for this? > I expect it to cover substantially more cases. I guess a percpu page would be good, but I really liked using the buffer we already ended up having for that page array. Maybe worth playing around with. > Any comments are welcome. See above: the only thing I think you should change - at least for a first version - is to not even do the spinlock and just rely on the locks we already hold in the removal path. That page_cache_delete() already requires locks for the mapping->nrpages -= nr; logic later (and for other reasons anyway). And, obviously, this needs testing. I've never seen an issue with my non-sequence case, so I think a lot of xfstests... Linus ^ permalink raw reply [flat|nested] 24+ messages in thread
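To make the suggested change concrete: read_seqcount_begin() spins until an in-flight writer finishes, while raw_seqcount_try_begin() simply reports failure when the count is odd, so the caller can drop straight to the slow path. A sketch of the reader from the note above, reworked that way (same made-up names; the seq variable receives the snapshot that read_seqcount_retry() later checks):

static bool obj_read_fast_try(struct obj *o, char *buf)
{
        unsigned int seq;

        /* Odd sequence count means a writer is in progress: give up at once */
        if (!raw_seqcount_try_begin(&o->del_seq, seq))
                return false;

        memcpy(buf, o->data, sizeof(o->data));

        /* Still discard the copy if a delete raced with us */
        return !read_seqcount_retry(&o->del_seq, seq);
}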
* Re: Optimizing small reads 2025-10-06 15:50 ` Linus Torvalds @ 2025-10-06 18:04 ` Kiryl Shutsemau 2025-10-06 18:14 ` Linus Torvalds 2025-10-07 21:47 ` Linus Torvalds 0 siblings, 2 replies; 24+ messages in thread From: Kiryl Shutsemau @ 2025-10-06 18:04 UTC (permalink / raw) To: Linus Torvalds; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Mon, Oct 06, 2025 at 08:50:55AM -0700, Linus Torvalds wrote: > On Mon, 6 Oct 2025 at 04:45, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > > > Below is my take on this. Lightly tested. > > Thanks. > > That looked even simpler than what I thought it would be, although I > worry a bit about the effect on page_cache_delete() now being much > more expensive with that spinlock. > > And the spinlock actually looks unnecessary, since page_cache_delete() > already relies on being serialized by holding the i_pages lock. > > So I think you can just change the seqcount_spinlock_t to a plain > seqcount_t with no locking at all, and document that external locking. That is not a spinlock. It is a lockdep annotation that says we expect this spinlock to be held there for the seqcount write to be valid. It is a NOP with lockdep disabled. pidfs does the same: pidmap_lock_seq tied to pidmap_lock. As an example of the code flow: free_pid() takes pidmap_lock and calls pidfs_remove_pid(), which takes pidmap_lock_seq. It also takes care of disabling preemption if needed. Unless I totally misunderstood how it works... :P > > - Do we want a bounded retry on read_seqcount_retry()? > > Maybe upto 3 iterations? > > No., I don't think it ever triggers, and I really like how this looks. > > And I'd go even further, and change that first > > seq = read_seqcount_begin(&mapping->i_pages_delete_seqcnt); > > into a > > if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt); > return 0; > > so that you don't even wait for any existing case. Ack. > That you could even do *outside* the RCU section, but I'm not sure > that buys us anything. > > *If* somebody ever hits it we can revisit, but I really think the > whole point of this fast-path is to just deal with the common case > quickly. > > There are going to be other things that are much more common and much > more realistic, like "this is the first read, so I need to set the > accessed bit". > > > - HIGHMEM support is trivial with memcpy_from_file_folio(); > > Good call. I didn't even want to think about it, and obviously never did. > > > - I opted for late partial read check. It would be nice allow to read > > across PAGE_SIZE boundary as long as it is in the same folio > > Sure, > > When I wrote that patch, I actually worried more about the negative > overhead of it not hitting at all, so I tried very hard to minimize > the cases where we look up a folio speculatively only to then decide > we can't use it. Consider it warming up the CPU cache for the slow path :P > But as long as that > > if (iov_iter_count(iter) <= sizeof(area)) { > > is there to protect the really basic rule, I guess it's not a huge deal. > > > - Move i_size check after uptodate check. It seems to be required > > according to the comment in filemap_read(). But I cannot say I > > understand i_size implications here. > > I forget too, and it might be voodoo programming. > > > - Size of area is 256 bytes. I wounder if we want to get the fast read > > to work on full page chunks. Can we dedicate a page per CPU for this? > > I expect it to cover substantially more cases. 
> > I guess a percpu page would be good, but I really liked using the > buffer we already ended up having for that page array. > > Maybe worth playing around with. With a page-size buffer we might consider serving larger reads in the same manner, with a loop around filemap_fast_read(). -- Kiryl Shutsemau / Kirill A. Shutemov ^ permalink raw reply [flat|nested] 24+ messages in thread
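In other words, the second argument of seqcount_spinlock_init() is purely an annotation recording which lock serializes the writers: with lockdep enabled, write_seqcount_begin() verifies that the named spinlock is held, and the seqcount itself adds no locking of its own. A minimal sketch with hypothetical names:

static DEFINE_SPINLOCK(demo_lock);
static seqcount_spinlock_t demo_seq = SEQCNT_SPINLOCK_ZERO(demo_seq, &demo_lock);

static void demo_update(void)
{
        spin_lock(&demo_lock);                  /* the "external" lock */
        write_seqcount_begin(&demo_seq);        /* lockdep: demo_lock must be held */
        /* ... modify the data protected by demo_lock ... */
        write_seqcount_end(&demo_seq);
        spin_unlock(&demo_lock);
}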
* Re: Optimizing small reads 2025-10-06 18:04 ` Kiryl Shutsemau @ 2025-10-06 18:14 ` Linus Torvalds 0 siblings, 0 replies; 24+ messages in thread From: Linus Torvalds @ 2025-10-06 18:14 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Mon, 6 Oct 2025 at 11:04, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > > > So I think you can just change the seqcount_spinlock_t to a plain > > seqcount_t with no locking at all, and document that external locking. > > That is not a spinlock. It is lockdep annotation that we expect this > spinlock to be held there for seqcount write to be valid. > > It is NOP with lockdep disabled. Ahh, right you are. Complaint withdrawn. I had the "lockref" kind of thing in mind, but yes, the seqcount thing already has an external lock model. Linus ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: Optimizing small reads 2025-10-06 18:04 ` Kiryl Shutsemau 2025-10-06 18:14 ` Linus Torvalds @ 2025-10-07 21:47 ` Linus Torvalds 2025-10-07 22:35 ` Linus Torvalds 1 sibling, 1 reply; 24+ messages in thread From: Linus Torvalds @ 2025-10-07 21:47 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel [-- Attachment #1: Type: text/plain, Size: 1623 bytes --] On Mon, 6 Oct 2025 at 11:04, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > On Mon, Oct 06, 2025 at 08:50:55AM -0700, Linus Torvalds wrote: > > On Mon, 6 Oct 2025 at 04:45, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > > > > > - Size of area is 256 bytes. I wounder if we want to get the fast read > > > to work on full page chunks. Can we dedicate a page per CPU for this? > > > I expect it to cover substantially more cases. > > > > I guess a percpu page would be good, but I really liked using the > > buffer we already ended up having for that page array. > > > > Maybe worth playing around with. > > With page size buffer we might consider serving larger reads in the same > manner with loop around filemap_fast_read(). Actually, thinking much more about this, I definitely do *not* think that doing a separate page buffer is a good idea. I think it will only cause the "fast path" to pollute more of the L1 cache, and slow things down. The on-stack buffer, in contrast, will pretty much either already be in the cache, or would be brought into it regardless. So there's no extra cache footprint from using it. But the "loop around filemap_fast_read()" might be a fine idea even with "just" the 256 byte buffer. Something ENTIRELY UNTESTED like this, perhaps? This is relative to your patch. Note: this also ends up doing it all with pagefaults disabled, because that way we can do the looping without dropping and re-taking the RCU lock. I'm not sure that is sensible, but whatever. Considering that I didn't test this AT ALL, please just consider this a "wild handwaving" patch. Linus [-- Attachment #2: patch.diff --] [-- Type: text/x-patch, Size: 1876 bytes --] mm/filemap.c | 36 +++++++++++++++++++++++++++--------- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 60a7b9275741..541273388512 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2697,7 +2697,7 @@ static void filemap_end_dropbehind_read(struct folio *folio) } } -static inline unsigned long filemap_fast_read(struct address_space *mapping, +static unsigned long filemap_fast_read(struct address_space *mapping, loff_t pos, char *buffer, size_t size) { @@ -2792,20 +2792,38 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, * any compiler initialization would be pointless since this * can fill it will garbage. 
*/ - if (iov_iter_count(iter) <= sizeof(area)) { + if (iov_iter_count(iter) <= PAGE_SIZE) { size_t count = iov_iter_count(iter); + size_t fast_read = 0; /* Let's see if we can just do the read under RCU */ rcu_read_lock(); - count = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, count); + pagefault_disable(); + do { + size_t copied = min(count, sizeof(area)); + + copied = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, copied); + if (!copied) + break; + copied = copy_to_iter(area.buffer, copied, iter); + if (!copied) + break; + fast_read += copied; + iocb->ki_pos += copied; + already_read += copied; + count -= copied; + } while (count); + pagefault_enable(); rcu_read_unlock(); - if (count) { - size_t copied = copy_to_iter(area.buffer, count, iter); - if (unlikely(!copied)) - return already_read ? already_read : -EFAULT; - ra->prev_pos = iocb->ki_pos += copied; + + if (fast_read) { + ra->prev_pos += fast_read; + already_read += fast_read; file_accessed(filp); - return copied + already_read; + + /* All done? */ + if (!count) + return already_read; } } ^ permalink raw reply related [flat|nested] 24+ messages in thread
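The pagefault_disable()/rcu_read_lock() bracketing in the patch above is what keeps the copy legal: with page faults disabled, copy_to_iter() returns a short count instead of sleeping to fault the destination in, so the RCU read-side section can never block. A stripped-down sketch of just that bracketing (hypothetical helper name, error handling elided):

static size_t copy_under_rcu(struct iov_iter *iter, const char *kbuf, size_t len)
{
        size_t copied;

        rcu_read_lock();
        pagefault_disable();
        copied = copy_to_iter(kbuf, len, iter); /* short copy instead of faulting */
        pagefault_enable();
        rcu_read_unlock();

        return copied;  /* copied < len means: fall back to the slow path */
}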
* Re: Optimizing small reads 2025-10-07 21:47 ` Linus Torvalds @ 2025-10-07 22:35 ` Linus Torvalds 2025-10-07 22:54 ` Linus Torvalds 0 siblings, 1 reply; 24+ messages in thread From: Linus Torvalds @ 2025-10-07 22:35 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Tue, 7 Oct 2025 at 14:47, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > Note: this also ends up doing it all with pagefaults disabled, becasue > that way we can do the looping without dropping and re-taking the RCU > lock. I'm not sure that is sensible, but whatever. Considering that I > didn't test this AT ALL, please just consider this a "wild handwaving" > patch. It might be worth noting that the real reason for disabling page faults is that I think the whole loop should be moved into filemap_fast_read(), and then the sequence count read - with the memory barrier - would be done only once. Plus at that point the code can do the page lookup only once too. So that patch is not only untested, it's also just a "step one". But I think I'll try to boot it next. Wish me luck. Linus ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: Optimizing small reads 2025-10-07 22:35 ` Linus Torvalds @ 2025-10-07 22:54 ` Linus Torvalds 2025-10-07 23:30 ` Linus Torvalds 2025-10-08 10:28 ` Kiryl Shutsemau 0 siblings, 2 replies; 24+ messages in thread From: Linus Torvalds @ 2025-10-07 22:54 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel [-- Attachment #1: Type: text/plain, Size: 473 bytes --] On Tue, 7 Oct 2025 at 15:35, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > But I think I'll try to boot it next. Wish me luck. Bah. It boots - after you fix the stupid double increment of 'already_copied'. I didn't remove the update inside the loop when I made it update it after the loop. So here's the slightly fixed patch that actually does boot - and that I'm running right now. But I wouldn't call it exactly "tested". Caveat patchor. Linus [-- Attachment #2: patch.diff --] [-- Type: text/x-patch, Size: 1539 bytes --] mm/filemap.c | 33 +++++++++++++++++++++++++-------- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 60a7b9275741..ba11f018ca6b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2792,20 +2792,37 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, * any compiler initialization would be pointless since this * can fill it will garbage. */ - if (iov_iter_count(iter) <= sizeof(area)) { + if (iov_iter_count(iter) <= PAGE_SIZE) { size_t count = iov_iter_count(iter); + size_t fast_read = 0; /* Let's see if we can just do the read under RCU */ rcu_read_lock(); - count = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, count); + pagefault_disable(); + do { + size_t copied = min(count, sizeof(area)); + + copied = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, copied); + if (!copied) + break; + copied = copy_to_iter(area.buffer, copied, iter); + if (!copied) + break; + fast_read += copied; + iocb->ki_pos += copied; + count -= copied; + } while (count); + pagefault_enable(); rcu_read_unlock(); - if (count) { - size_t copied = copy_to_iter(area.buffer, count, iter); - if (unlikely(!copied)) - return already_read ? already_read : -EFAULT; - ra->prev_pos = iocb->ki_pos += copied; + + if (fast_read) { + ra->prev_pos += fast_read; + already_read += fast_read; file_accessed(filp); - return copied + already_read; + + /* All done? */ + if (!count) + return already_read; } } ^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: Optimizing small reads 2025-10-07 22:54 ` Linus Torvalds @ 2025-10-07 23:30 ` Linus Torvalds 2025-10-08 14:54 ` Kiryl Shutsemau 0 siblings, 1 reply; 24+ messages in thread From: Linus Torvalds @ 2025-10-07 23:30 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Tue, 7 Oct 2025 at 15:54, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > So here's the slightly fixed patch that actually does boot - and that > I'm running right now. But I wouldn't call it exactly "tested". > > Caveat patchor. From a quick look at profiles, the major issue is that clac/stac is very expensive on the machine I'm testing this on, and that makes the looping over smaller copies unnecessarily costly. And the iov_iter overhead is quite costly too. Both would be fixed if, instead of just checking the iov_iter_count(), we check just the first iov_iter entry and make sure it's a user space iterator. Then we'd be able to use the usual - and *much* cheaper - user_access_begin/end() and unsafe_copy_to_user() functions, and do the iter update at the end outside the loop. Anyway, this all feels fairly easily fixable and not some difficult fundamental issue, but it just requires being careful and getting the small details right. Not difficult, just "care needed". But even without that, and in this simplistic form, this should *scale* beautifully, because all the overheads are purely CPU-local. So it does avoid the whole atomic page reference stuff etc synchronization. Linus ^ permalink raw reply [flat|nested] 24+ messages in thread
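A sketch of what is being proposed, with hypothetical names and only the plain ITER_UBUF case handled: one user_access_begin() claims the whole destination range (a single STAC), the per-chunk copies go through the "unsafe" helper, and one user_access_end() drops access again (a single CLAC). The iov_iter bookkeeping is left to the caller, as suggested above. Illustration only, not the eventual filemap code:

static bool copy_chunks_to_user(struct iov_iter *iter, const char *src, size_t len)
{
        void __user *dst;

        if (!iter_is_ubuf(iter))                /* only the single-buffer case */
                return false;

        dst = iter->ubuf + iter->iov_offset;
        if (!user_access_begin(dst, len))       /* one STAC + range check */
                return false;

        while (len) {
                size_t chunk = min_t(size_t, len, 256); /* 256-byte on-stack area */

                unsafe_copy_to_user(dst, src, chunk, Efault);
                dst += chunk;
                src += chunk;
                len -= chunk;
        }
        user_access_end();                      /* one CLAC */
        return true;    /* caller advances the iov_iter by the copied amount */

Efault:
        user_access_end();
        return false;
}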
* Re: Optimizing small reads 2025-10-07 23:30 ` Linus Torvalds @ 2025-10-08 14:54 ` Kiryl Shutsemau 2025-10-08 16:27 ` Linus Torvalds 0 siblings, 1 reply; 24+ messages in thread From: Kiryl Shutsemau @ 2025-10-08 14:54 UTC (permalink / raw) To: Linus Torvalds; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Tue, Oct 07, 2025 at 04:30:20PM -0700, Linus Torvalds wrote: > On Tue, 7 Oct 2025 at 15:54, Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > > > So here's the slightly fixed patch that actually does boot - and that > > I'm running right now. But I wouldn't call it exactly "tested". > > > > Caveat patchor. > > From a quick look at profiles, the major issue is that clac/stac is > very expensive on the machine I'm testing this on, and that makes the > looping over smaller copies unnecessarily costly. > > And the iov_iter overhead is quite costly too. > > Both would be fixed by instead of just checking the iov_iter_count(), > we should likely check just the first iov_iter entry, and make sure > it's a user space iterator. > > Then we'd be able to use the usual - and *much* cheaper - > user_access_begin/end() and unsafe_copy_to_user() functions, and do > the iter update at the end outside the loop. > > Anyway, this all feels fairly easily fixable and not some difficult > fundamental issue, but it just requires being careful and getting the > small details right. Not difficult, just "care needed". > > But even without that, and in this simplistic form, this should > *scale* beautifully, because all the overheads are purely CPU-local. > So it does avoid the whole atomic page reference stuff etc > synchronization. I tried to look at numbers too. The best-case scenario looks great. 16 threads hammering the same 4k with 256-byte reads: Baseline: 2892MiB/s Kiryl: 7751MiB/s Linus: 7787MiB/s But when I tried something outside of the best case, it doesn't look good. 16 threads reading from a 512k file with 4k reads: Baseline: 99.4GiB/s Kiryl: 40.0GiB/s Linus: 44.0GiB/s I have not profiled it yet. Disabling SMAP (clearcpuid=smap) makes it 45.7GiB/s for my patch and 50.9GiB/s for yours. So it cannot be fully attributed to SMAP. Other candidates are iov overhead and multiple xarray lookups. -- Kiryl Shutsemau / Kirill A. Shutemov ^ permalink raw reply [flat|nested] 24+ messages in thread
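The benchmark tool is not named in the thread; for reference, the "same file" best case above boils down to an access pattern along the following lines. This is a standalone userspace sketch with made-up file name, block size and iteration count; throughput would come from timing the run externally (e.g. with time(1)):

#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NTHREADS 16
#define BLOCK    256            /* read size, matches the fast-path buffer */
#define LOOPS    (1 << 22)      /* arbitrary iteration count */

static int fd;

static void *reader(void *arg)
{
        char buf[BLOCK];

        (void)arg;
        for (long i = 0; i < LOOPS; i++) {
                /* Same offset every time: all threads hammer one page */
                if (pread(fd, buf, BLOCK, 0) != BLOCK) {
                        perror("pread");
                        exit(1);
                }
        }
        return NULL;
}

int main(void)
{
        pthread_t tid[NTHREADS];

        fd = open("testfile", O_RDONLY);        /* assumed to exist, >= 4k of data */
        if (fd < 0) {
                perror("open");
                return 1;
        }
        for (int i = 0; i < NTHREADS; i++)
                pthread_create(&tid[i], NULL, reader, NULL);
        for (int i = 0; i < NTHREADS; i++)
                pthread_join(tid[i], NULL);
        return 0;
}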
* Re: Optimizing small reads 2025-10-08 14:54 ` Kiryl Shutsemau @ 2025-10-08 16:27 ` Linus Torvalds 2025-10-08 17:03 ` Linus Torvalds 0 siblings, 1 reply; 24+ messages in thread From: Linus Torvalds @ 2025-10-08 16:27 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Wed, 8 Oct 2025 at 07:54, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > The best case scenario looks great. 16 threads hammering the same 4k > with 256 bytes read: > > Baseline: 2892MiB/s > Kiryl: 7751MiB/s > Linus: 7787MiB/s Yeah, that certainly fixes the performance problem people saw with contention on the page count. > But when I tried something outside of the best case, it doesn't look > good. 16 threads read from 512k file with 4k: > > Baseline: 99.4GiB/s > Kiryl: 40.0GiB/s > Linus: 44.0GiB/s > > I have not profiled it yet. > > Disabling SMAP (clearcpuid=smap) makes it 45.7GiB/s for mine patch and > 50.9GiB/s for yours. So it cannot be fully attributed to SMAP. It's not just smap. It's the iov iterator overheads I mentioned. Those iov iterators are *slow*. Well, compared to a small memcpy they are. They are out-of-line, but they also do a lot of tests for the different cases. So the baseline obviously still uses the iov_iter code, but it does it on full pages, so the overhead is much less noticeable. This is why I said that I think the next step is to just do the user-space case, so that the loop can not only avoid the SMAP overhead by using the user_access_begin() model, but so that it also avoids all the iov iter costs. Linus ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: Optimizing small reads 2025-10-08 16:27 ` Linus Torvalds @ 2025-10-08 17:03 ` Linus Torvalds 2025-10-09 16:22 ` Kiryl Shutsemau 0 siblings, 1 reply; 24+ messages in thread From: Linus Torvalds @ 2025-10-08 17:03 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Wed, 8 Oct 2025 at 09:27, Linus Torvalds <torvalds@linux-foundation.org> wrote: > > On Wed, 8 Oct 2025 at 07:54, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > > > Disabling SMAP (clearcpuid=smap) makes it 45.7GiB/s for mine patch and > > 50.9GiB/s for yours. So it cannot be fully attributed to SMAP. > > It's not just smap. It's the iov iterator overheads I mentioned. I also suspect that if the smap and iov overhead are fixed, the next thing in line is the folio lookup. That should be trivial to fix by just having an additional copy loop inside the "look up page". So you'd have two levels of looping: the outer loop over the "look up a folio at a time", and then the inner loop that does the smaller chunk size copies. One remaining pain point might be the sequence count retry read - just because it has that smp_rmb(). Because while the *initial* sequence count read can be moved out of any loops - so you'd start with just one fixed value that you check - we do need to check that value before copying the chunk to user space. So you'd have one smp_rmb() per that inner loop iteration. That sounds unavoidable, but should be unnoticeable. SMAP would be done in the outer loop (so once per folio). RCU and page fault protection would be at the outermost levels (so once per the whole low-latency thing). At least that's my gut feel. Linus ^ permalink raw reply [flat|nested] 24+ messages in thread
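A skeleton of the structure described above, with hypothetical names (lookup_folio_fast() in particular does not exist; it stands for the xas_load() plus the folio checks from the earlier patch) and most error handling elided: RCU and pagefault_disable() at the outermost level, user_access_begin()/user_access_end() once per folio, and one read_seqcount_retry() - the smp_rmb() - per copied chunk before the data becomes visible to user space:

/* Hypothetical: xas_load() plus the uptodate/readahead/referenced checks */
struct folio *lookup_folio_fast(struct address_space *mapping, loff_t pos);

static size_t fast_read_nested(struct address_space *mapping, loff_t pos,
                               void __user *dst, size_t len,
                               char *buf, size_t bufsize)
{
        size_t done = 0;
        unsigned int seq;

        rcu_read_lock();
        pagefault_disable();

        /* One sequence count snapshot for the whole operation */
        if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
                goto out;

        while (len) {                           /* outer loop: one folio at a time */
                struct folio *folio = lookup_folio_fast(mapping, pos);
                size_t folio_left;

                if (!folio)
                        break;
                if (!user_access_begin(dst, len))       /* SMAP open, per folio */
                        break;
                folio_left = folio_size(folio) - offset_in_folio(folio, pos);

                do {                            /* inner loop: one buffer chunk */
                        size_t chunk = min3(len, bufsize, folio_left);

                        chunk = memcpy_from_file_folio(buf, folio, pos, chunk);
                        /* smp_rmb()-backed check before exposing the data */
                        if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
                                goto stop;
                        unsafe_copy_to_user(dst, buf, chunk, stop);
                        dst += chunk;
                        pos += chunk;
                        len -= chunk;
                        folio_left -= chunk;
                        done += chunk;
                } while (len && folio_left);
                user_access_end();              /* SMAP close, per folio */
        }
out:
        pagefault_enable();
        rcu_read_unlock();
        return done;
stop:
        user_access_end();
        goto out;
}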
* Re: Optimizing small reads 2025-10-08 17:03 ` Linus Torvalds @ 2025-10-09 16:22 ` Kiryl Shutsemau 2025-10-09 17:29 ` Linus Torvalds 0 siblings, 1 reply; 24+ messages in thread From: Kiryl Shutsemau @ 2025-10-09 16:22 UTC (permalink / raw) To: Linus Torvalds; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Wed, Oct 08, 2025 at 10:03:47AM -0700, Linus Torvalds wrote: > On Wed, 8 Oct 2025 at 09:27, Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > > > On Wed, 8 Oct 2025 at 07:54, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > > > > > Disabling SMAP (clearcpuid=smap) makes it 45.7GiB/s for mine patch and > > > 50.9GiB/s for yours. So it cannot be fully attributed to SMAP. > > > > It's not just smap. It's the iov iterator overheads I mentioned. > > I also suspect that if the smap and iov overhead are fixed, the next > thing in line is the folio lookup. Below is the patch I currently have. I went for a cleaner separation of the fast and slow paths. Objtool is not happy about calling random stuff within UACCESS. I ignored it for now. I am not sure if I use user_access_begin()/_end() correctly. Let me know if I misunderstood or misimplemented your idea. This patch brings 4k reads from 512k files to ~60GiB/s. Making the buffer 4k brings it to ~95GiB/s (baseline is 100GiB/s). I tried to optimize the folio walk, but it got slower for some reason. I don't yet understand why. Maybe something silly. Will play with it more. Any other ideas? diff --git a/fs/inode.c b/fs/inode.c index ec9339024ac3..52163d28d630 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -482,6 +482,8 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); + seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt, + &mapping->i_pages.xa_lock); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->i_private_list); spin_lock_init(&mapping->i_private_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9e9d7c757efe..a900214f0f3a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -522,6 +522,7 @@ struct address_space { struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; void * i_private_data; + seqcount_spinlock_t i_pages_delete_seqcnt; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but diff --git a/mm/filemap.c b/mm/filemap.c index 751838ef05e5..732756116b6a 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -138,8 +138,10 @@ static void page_cache_delete(struct address_space *mapping, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + write_seqcount_begin(&mapping->i_pages_delete_seqcnt); xas_store(&xas, shadow); xas_init_marks(&xas); + write_seqcount_end(&mapping->i_pages_delete_seqcnt); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies upon it */ @@ -2659,41 +2661,106 @@ static void filemap_end_dropbehind_read(struct folio *folio) } } -/** - * filemap_read - Read data from the page cache. - * @iocb: The iocb to read. - * @iter: Destination for the data. - * @already_read: Number of bytes already read by the caller. - * - * Copies data from the page cache. If the data is not currently present, - * uses the readahead and read_folio address_space operations to fetch it. - * - * Return: Total number of bytes copied, including those already read by - * the caller. If an error happens before any bytes are copied, returns - * a negative error number. 
- */ -ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, - ssize_t already_read) +static bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter, + char *buffer, size_t buffer_size, + ssize_t *already_read) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct file_ra_state *ra = &iocb->ki_filp->f_ra; + loff_t last_pos = ra->prev_pos; + struct folio *folio; + loff_t file_size; + unsigned int seq; + + /* Don't bother with flush_dcache_folio() */ + if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE) + return false; + + if (!iter_is_ubuf(iter)) + return false; + + /* Give up and go to slow path if raced with page_cache_delete() */ + if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq)) + return false; + + if (!user_access_begin(iter->ubuf + iter->iov_offset, iter->count)) + return false; + + rcu_read_lock(); + pagefault_disable(); + + do { + size_t to_read, read; + XA_STATE(xas, &mapping->i_pages, iocb->ki_pos >> PAGE_SHIFT); + + xas_reset(&xas); + folio = xas_load(&xas); + if (xas_retry(&xas, folio)) + break; + + if (!folio || xa_is_value(folio)) + break; + + if (!folio_test_uptodate(folio)) + break; + + /* No fast-case if readahead is supposed to started */ + if (folio_test_readahead(folio)) + break; + /* .. or mark it accessed */ + if (!folio_test_referenced(folio)) + break; + + /* i_size check must be after folio_test_uptodate() */ + file_size = i_size_read(mapping->host); + + do { + if (unlikely(iocb->ki_pos >= file_size)) + goto out; + + to_read = min(iov_iter_count(iter), buffer_size); + if (to_read > file_size - iocb->ki_pos) + to_read = file_size - iocb->ki_pos; + + read = memcpy_from_file_folio(buffer, folio, iocb->ki_pos, to_read); + + /* Give up and go to slow path if raced with page_cache_delete() */ + if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq)) + goto out; + + unsafe_copy_to_user(iter->ubuf + iter->iov_offset, buffer, + read, out); + + iter->iov_offset += read; + iter->count -= read; + *already_read += read; + iocb->ki_pos += read; + last_pos = iocb->ki_pos; + } while (iov_iter_count(iter) && iocb->ki_pos % folio_size(folio)); + } while (iov_iter_count(iter)); +out: + pagefault_enable(); + rcu_read_unlock(); + user_access_end(); + + file_accessed(iocb->ki_filp); + ra->prev_pos = last_pos; + return !iov_iter_count(iter); +} + +static ssize_t filemap_read_slow(struct kiocb *iocb, struct iov_iter *iter, + struct folio_batch *fbatch, ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; - if (unlikely(iocb->ki_pos < 0)) - return -EINVAL; - if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) - return 0; - if (unlikely(!iov_iter_count(iter))) - return 0; - - iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); - folio_batch_init(&fbatch); + folio_batch_init(fbatch); do { cond_resched(); @@ -2709,7 +2776,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter->count, &fbatch, false); + error = filemap_get_pages(iocb, iter->count, fbatch, false); if (error < 0) break; @@ -2737,11 +2804,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, * mark it as accessed the first time. 
*/ if (!pos_same_folio(iocb->ki_pos, last_pos - 1, - fbatch.folios[0])) - folio_mark_accessed(fbatch.folios[0]); + fbatch->folios[0])) + folio_mark_accessed(fbatch->folios[0]); - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; size_t fsize = folio_size(folio); size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, @@ -2772,19 +2839,57 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; filemap_end_dropbehind_read(folio); folio_put(folio); } - folio_batch_init(&fbatch); + folio_batch_init(fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); ra->prev_pos = last_pos; return already_read ? already_read : error; } + +/** + * filemap_read - Read data from the page cache. + * @iocb: The iocb to read. + * @iter: Destination for the data. + * @already_read: Number of bytes already read by the caller. + * + * Copies data from the page cache. If the data is not currently present, + * uses the readahead and read_folio address_space operations to fetch it. + * + * Return: Total number of bytes copied, including those already read by + * the caller. If an error happens before any bytes are copied, returns + * a negative error number. + */ +ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, + ssize_t already_read) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + union { + struct folio_batch fbatch; + __DECLARE_FLEX_ARRAY(char, buffer); + //char __buffer[4096]; + } area __uninitialized; + + if (unlikely(iocb->ki_pos < 0)) + return -EINVAL; + if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) + return 0; + if (unlikely(!iov_iter_count(iter))) + return 0; + + iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); + + if (filemap_read_fast(iocb, iter, area.buffer, sizeof(area), &already_read)) + return already_read; + + return filemap_read_slow(iocb, iter, &area.fbatch, already_read); +} EXPORT_SYMBOL_GPL(filemap_read); int kiocb_write_and_wait(struct kiocb *iocb, size_t count) -- Kiryl Shutsemau / Kirill A. Shutemov ^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: Optimizing small reads 2025-10-09 16:22 ` Kiryl Shutsemau @ 2025-10-09 17:29 ` Linus Torvalds 2025-10-10 10:10 ` Kiryl Shutsemau 0 siblings, 1 reply; 24+ messages in thread From: Linus Torvalds @ 2025-10-09 17:29 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel [-- Attachment #1: Type: text/plain, Size: 1429 bytes --] On Thu, 9 Oct 2025 at 09:22, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > Objtool is not happy about calling random stuff within UACCESS. I > ignored it for now. Yeah, that needs to be done inside the other stuff - including, very much, the folio lookup. > I am not sure if I use user_access_begin()/_end() correctly. Let me know > if I misunderstood or misimplemented your idea. Close. Except I'd have gotten rid of the iov stuff by making the inner helper just get a 'void __user *' pointer and a length, and then updating the iov state outside that helper. > This patch brings 4k reads from 512k files to ~60GiB/s. Making the > buffer 4k, brings it ~95GiB/s (baseline is 100GiB/s). Note that right now, 'unsafe_copy_to_user()' is a horrible thing. It's almost entirely unoptimized, see the hacky unsafe_copy_loop implementation in <asm/uaccess.h>. Because before this code, it was only used for readdir() to copy individual filenames, I think. Anyway, I'd have organized things a bit differently. Incremental UNTESTED patch attached. objtool still complains about SMAP issues, because memcpy_from_file_folio() ends up resulting in an external call to memcpy. Not great. I don't love how complicated this all got, and even with your bigger buffer it's slower than the baseline. So honestly I'd be inclined to go back to "just deal with the trivially small reads", and scratch this extra complexity. 
Linus [-- Attachment #2: patch.diff --] [-- Type: text/x-patch, Size: 3464 bytes --] mm/filemap.c | 81 +++++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 28 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 13c5de94c884..64def0dd3b97 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2697,7 +2697,41 @@ static void filemap_end_dropbehind_read(struct folio *folio) } } -static bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter, +static size_t inner_read_loop(struct kiocb *iocb, struct folio *folio, + void __user *dst, size_t dst_size, + char *buffer, size_t buffer_size, + struct address_space *mapping, unsigned int seq) +{ + size_t read = 0; + + if (can_do_masked_user_access()) + dst = masked_user_access_begin(dst); + else if (!user_access_begin(dst, dst_size)) + return 0; + + do { + size_t to_read = min(dst_size, buffer_size); + + to_read = memcpy_from_file_folio(buffer, folio, iocb->ki_pos, to_read); + + /* Give up and go to slow path if raced with page_cache_delete() */ + if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq)) + break; + + unsafe_copy_to_user(dst, buffer, to_read, Efault); + + dst += read; + dst_size -= read; + + iocb->ki_pos += read; + } while (dst_size && iocb->ki_pos % folio_size(folio)); + +Efault: + user_access_end(); + return read; +} + +static bool noinline filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter, char *buffer, size_t buffer_size, ssize_t *already_read) { @@ -2719,14 +2753,12 @@ static bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter, if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq)) return false; - if (!user_access_begin(iter->ubuf + iter->iov_offset, iter->count)) - return false; - rcu_read_lock(); pagefault_disable(); do { size_t to_read, read; + void __user *dst; XA_STATE(xas, &mapping->i_pages, iocb->ki_pos >> PAGE_SHIFT); xas_reset(&xas); @@ -2750,34 +2782,27 @@ static bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter, /* i_size check must be after folio_test_uptodate() */ file_size = i_size_read(mapping->host); - do { - if (unlikely(iocb->ki_pos >= file_size)) - goto out; + if (unlikely(iocb->ki_pos >= file_size)) + break; + file_size -= iocb->ki_pos; + to_read = iov_iter_count(iter); + if (to_read > file_size) + to_read = file_size; - to_read = min(iov_iter_count(iter), buffer_size); - if (to_read > file_size - iocb->ki_pos) - to_read = file_size - iocb->ki_pos; - - read = memcpy_from_file_folio(buffer, folio, iocb->ki_pos, to_read); - - /* Give up and go to slow path if raced with page_cache_delete() */ - if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq)) - goto out; - - unsafe_copy_to_user(iter->ubuf + iter->iov_offset, buffer, - read, out); - - iter->iov_offset += read; - iter->count -= read; - *already_read += read; - iocb->ki_pos += read; - last_pos = iocb->ki_pos; - } while (iov_iter_count(iter) && iocb->ki_pos % folio_size(folio)); + dst = iter->ubuf + iter->iov_offset; + read = inner_read_loop(iocb, folio, + dst, to_read, buffer, buffer_size, + mapping, seq); + if (!read) + break; + iter->iov_offset += read; + iter->count -= read; + *already_read += read; + last_pos = iocb->ki_pos; } while (iov_iter_count(iter)); -out: + pagefault_enable(); rcu_read_unlock(); - user_access_end(); file_accessed(iocb->ki_filp); ra->prev_pos = last_pos; ^ permalink raw reply related [flat|nested] 24+ messages in thread
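For reference, the can_do_masked_user_access()/masked_user_access_begin() pair used in inner_read_loop() above replaces the usual address-limit check and branch with an address mask, so a bogus pointer is forced into a faulting range rather than tested up front. The idiom on its own, in a trivial hypothetical helper:

static int put_user_val(int __user *uptr, int val)
{
        if (can_do_masked_user_access())
                uptr = masked_user_access_begin(uptr);
        else if (!user_access_begin(uptr, sizeof(*uptr)))
                return -EFAULT;

        unsafe_put_user(val, uptr, Efault);
        user_access_end();
        return 0;

Efault:
        user_access_end();
        return -EFAULT;
}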
* Re: Optimizing small reads 2025-10-09 17:29 ` Linus Torvalds @ 2025-10-10 10:10 ` Kiryl Shutsemau 2025-10-10 17:51 ` Linus Torvalds 0 siblings, 1 reply; 24+ messages in thread From: Kiryl Shutsemau @ 2025-10-10 10:10 UTC (permalink / raw) To: Linus Torvalds; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Thu, Oct 09, 2025 at 10:29:12AM -0700, Linus Torvalds wrote: > On Thu, 9 Oct 2025 at 09:22, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > > > Objtool is not happy about calling random stuff within UACCESS. I > > ignored it for now. > > Yeah, that needs to be done inside the other stuff - including, very > much, the folio lookup. > > > I am not sure if I use user_access_begin()/_end() correctly. Let me know > > if I misunderstood or misimplemented your idea. > > Close. Except I'd have gotten rid of the iov stuff by making the inner > helper just get a 'void __user *' pointer and a length, and then > updating the iov state outside that helper. > > > This patch brings 4k reads from 512k files to ~60GiB/s. Making the > > buffer 4k, brings it ~95GiB/s (baseline is 100GiB/s). > > Note that right now, 'unsafe_copy_to_user()' is a horrible thing. It's > almost entirely unoptimized, see the hacky unsafe_copy_loop > implementation in <asm/uaccess.h>. Right. The patch below brings the numbers to 64GiB/s with a 256-byte buffer and 109GiB/s with a 4k buffer. A 1k buffer breaks even with the unpatched kernel at ~100GiB/s. > So honestly I'd be inclined to go back to "just deal with the > trivially small reads", and scratch this extra complexity. I will play with it a bit more, but, yes, this is my feeling too. diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 3a7755c1a441..ae09777d96d7 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -612,10 +612,12 @@ do { \ char __user *__ucu_dst = (_dst); \ const char *__ucu_src = (_src); \ size_t __ucu_len = (_len); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ + asm goto( \ + "1: rep movsb\n" \ + _ASM_EXTABLE_UA(1b, %l[label]) \ + : "+D" (__ucu_dst), "+S" (__ucu_src), \ + "+c" (__ucu_len) \ + : : "memory" : label); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT -- Kiryl Shutsemau / Kirill A. Shutemov ^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: Optimizing small reads 2025-10-10 10:10 ` Kiryl Shutsemau @ 2025-10-10 17:51 ` Linus Torvalds 2025-10-13 15:35 ` Kiryl Shutsemau 0 siblings, 1 reply; 24+ messages in thread From: Linus Torvalds @ 2025-10-10 17:51 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Fri, 10 Oct 2025 at 03:10, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > > So honestly I'd be inclined to go back to "just deal with the > > trivially small reads", and scratch this extra complexity. > > I will play with it a bit more, but, yes, this my feel too. There's a couple of reasons I think this optimization ends up being irrelevant for larger reads, with the obvious one being that once you have bigger reads, the cost of the copy will swamp the other latency issues. But perhaps most importantly, if you do reads that are page-sized (or bigger), you by definition are no longer doing the thing that started this whole thing in the first place: hammering over and over on the same page reference count in multiple threads. (Of course, you can do exactly one page read over and over again, but at some point it just has to be called outright stupid and an artificial load) IOW, I think the only reasonable way that you actually get that cacheline ping-pong case is that you have some load that really does access some *small* data in a file from multiple threads, where there are then patterns where there are lots of those small fields on the same page (eg it's some metadata that everybody ends up accessing). If I recall correctly, the case Willy had a customer doing was reading 64-byte entries in a page. And then I really can see multiple threads just reading the same page concurrently. But even if the entries are "just" one page in size, suddenly it makes no sense for a half-way competent app to re-read the whole page over and over and over again in threads. If an application really does that, then it's on the application to fix its silly behavior, not the kernel to try to optimize for that insanity. So that's why I think that original "just 256 bytes" is likely perfectly sufficient. Bigger IO simply doesn't make much sense for this particular "avoid reference counting", and while the RCU path is certainly clever and low-latency, and avoiding atomics is always a good thing, at the same time it's also a very limited thing that then can't do some basic things (like the whole "mark page accessed" etc) Anyway, I may well be wrong, but let's start out with a minimal patch. I think your first version with just the sequence number fixed would likely be perfectly fine for integration into 6.19 - possibly with any tweaking you come up with. And any benchmarking should probably do exactly that "threaded 64-byte read" that Willy had a real use-case for. Then, if people come up with actual real loads where it would make sense to expand on this, we can do so later. Sounds like a plan? Linus ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: Optimizing small reads 2025-10-10 17:51 ` Linus Torvalds @ 2025-10-13 15:35 ` Kiryl Shutsemau 2025-10-13 15:39 ` Kiryl Shutsemau ` (2 more replies) 0 siblings, 3 replies; 24+ messages in thread From: Kiryl Shutsemau @ 2025-10-13 15:35 UTC (permalink / raw) To: Linus Torvalds; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Fri, Oct 10, 2025 at 10:51:40AM -0700, Linus Torvalds wrote: > Sounds like a plan? The patch is below. Can I use your Signed-off-by for it? Here are some numbers. I cannot explain some of this. Like, there should be zero changes for block sizes above 256, but... I am not confident enough in these measurements and will work on making them reproducible. Looks like there's a sizable boot-to-boot difference. I also need to run xfstests. Unless someone can help me with this? I don't have a ready-to-go setup. 16 threads, reads from 4k file(s), MiB/s --------------------------------------------------------- | Block | Baseline | Baseline | Patched | Patched | | size | same file | diff files | same file | diff files | --------------------------------------------------------- | 1 | 11.6 | 27.6 | 33.5 | 33.4 | | 32 | 375 | 880 | 1027 | 1028 | | 256 | 2940 | 6932 | 7872 | 7884 | | 1024 | 11500 | 26900 | 11400 | 28300 | | 4096 | 46500 | 103000 | 45700 | 108000 | --------------------------------------------------------- 16 threads, reads from 1M file(s), MiB/s --------------------------------------------------------- | Block | Baseline | Baseline | Patched | Patched | | size | same file | diff files | same file | diff files | --------------------------------------------------------- | 1 | 26.8 | 27.4 | 32.0 | 32.2 | | 32 | 840 | 872 | 1034 | 1033 | | 256 | 6606 | 6904 | 7872 | 7919 | | 1024 | 25700 | 26600 | 25300 | 28300 | | 4096 | 96900 | 99000 | 103000 | 106000 | --------------------------------------------------------- diff --git a/fs/inode.c b/fs/inode.c index ec9339024ac3..52163d28d630 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -482,6 +482,8 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); + seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt, + &mapping->i_pages.xa_lock); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->i_private_list); spin_lock_init(&mapping->i_private_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9e9d7c757efe..a900214f0f3a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -522,6 +522,7 @@ struct address_space { struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; void * i_private_data; + seqcount_spinlock_t i_pages_delete_seqcnt; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but diff --git a/mm/filemap.c b/mm/filemap.c index 751838ef05e5..8c39a9445471 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -138,8 +138,10 @@ static void page_cache_delete(struct address_space *mapping, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + write_seqcount_begin(&mapping->i_pages_delete_seqcnt); xas_store(&xas, shadow); xas_init_marks(&xas); + write_seqcount_end(&mapping->i_pages_delete_seqcnt); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies upon it */ @@ -2659,6 +2661,92 @@ static void filemap_end_dropbehind_read(struct folio *folio) } } +static inline unsigned long filemap_read_fast_rcu(struct address_space *mapping, + loff_t pos, char *buffer, + size_t size) +{ + 
XA_STATE(xas, &mapping->i_pages, pos >> PAGE_SHIFT); + struct folio *folio; + loff_t file_size; + unsigned int seq; + + lockdep_assert_in_rcu_read_lock(); + + seq = read_seqcount_begin(&mapping->i_pages_delete_seqcnt); + + xas_reset(&xas); + folio = xas_load(&xas); + if (xas_retry(&xas, folio)) + return 0; + + if (!folio || xa_is_value(folio)) + return 0; + + if (!folio_test_uptodate(folio)) + return 0; + + /* No fast-case if readahead is supposed to started */ + if (folio_test_readahead(folio)) + return 0; + /* .. or mark it accessed */ + if (!folio_test_referenced(folio)) + return 0; + + /* i_size check must be after folio_test_uptodate() */ + file_size = i_size_read(mapping->host); + if (unlikely(pos >= file_size)) + return 0; + if (size > file_size - pos) + size = file_size - pos; + + /* Do the data copy */ + size = memcpy_from_file_folio(buffer, folio, pos, size); + if (!size) + return 0; + + /* Give up and go to slow path if raced with page_cache_delete() */ + if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq)) + return 0; + + return size; +} + +static inline bool filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter, + ssize_t *already_read, + char *buffer, size_t buffer_size) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct file_ra_state *ra = &iocb->ki_filp->f_ra; + size_t count; + + if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE) + return false; + + if (iov_iter_count(iter) > buffer_size) + return false; + + count = iov_iter_count(iter); + + /* Let's see if we can just do the read under RCU */ + rcu_read_lock(); + count = filemap_read_fast_rcu(mapping, iocb->ki_pos, buffer, count); + rcu_read_unlock(); + + if (!count) + return false; + + count = copy_to_iter(buffer, count, iter); + if (unlikely(!count)) + return false; + + iocb->ki_pos += count; + ra->prev_pos = iocb->ki_pos; + file_accessed(iocb->ki_filp); + *already_read += count; + + return !iov_iter_count(iter); +} + /** * filemap_read - Read data from the page cache. * @iocb: The iocb to read. @@ -2679,7 +2767,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct folio_batch fbatch; + union { + struct folio_batch fbatch; + __DECLARE_FLEX_ARRAY(char, buffer); + } area __uninitialized; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; @@ -2693,7 +2784,20 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, return 0; iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); - folio_batch_init(&fbatch); + + /* + * Try a quick lockless read into the 'area' union. Note that + * this union is intentionally marked "__uninitialized", because + * any compiler initialization would be pointless since this + * can fill it will garbage. + */ + if (filemap_read_fast(iocb, iter, &already_read, area.buffer, sizeof(area))) + return already_read; + + /* + * This actually properly initializes the fbatch for the slow case + */ + folio_batch_init(&area.fbatch); do { cond_resched(); @@ -2709,7 +2813,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter->count, &fbatch, false); + error = filemap_get_pages(iocb, iter->count, &area.fbatch, false); if (error < 0) break; @@ -2737,11 +2841,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, * mark it as accessed the first time. 
*/ if (!pos_same_folio(iocb->ki_pos, last_pos - 1, - fbatch.folios[0])) - folio_mark_accessed(fbatch.folios[0]); + area.fbatch.folios[0])) + folio_mark_accessed(area.fbatch.folios[0]); - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; + for (i = 0; i < folio_batch_count(&area.fbatch); i++) { + struct folio *folio = area.fbatch.folios[i]; size_t fsize = folio_size(folio); size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, @@ -2772,13 +2876,13 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; + for (i = 0; i < folio_batch_count(&area.fbatch); i++) { + struct folio *folio = area.fbatch.folios[i]; filemap_end_dropbehind_read(folio); folio_put(folio); } - folio_batch_init(&fbatch); + folio_batch_init(&area.fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); -- Kiryl Shutsemau / Kirill A. Shutemov ^ permalink raw reply related [flat|nested] 24+ messages in thread
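A hypothetical sketch of a userspace harness in the shape described above -- 16 threads doing pread() of a fixed block size at random block-aligned offsets from a page-cache-hot file. The thread count, block size, iteration count and offset pattern are assumptions, since the original benchmark was not posted; for the "diff files" columns each thread would open its own file, and MiB/s comes from dividing bytes read by wall-clock time, e.g. under time(1). Build with something like gcc -O2 -pthread.

/*
 * Hypothetical benchmark sketch: NTHREADS readers each doing pread() of
 * BLOCK bytes at random block-aligned offsets from a page-cache-hot
 * file.  All of the constants and the offset pattern are assumptions;
 * the original harness was not posted.
 */
#include <fcntl.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define NTHREADS 16
#define BLOCK    256
#define ITERS    (1UL << 22)

static const char *path;

static void *worker(void *arg)
{
	char buf[BLOCK];
	unsigned int seed = (unsigned long)arg;
	off_t nblocks;
	int fd;

	fd = open(path, O_RDONLY);
	if (fd < 0) {
		perror(path);
		return NULL;
	}
	nblocks = lseek(fd, 0, SEEK_END) / BLOCK;
	if (nblocks < 1)
		nblocks = 1;

	for (unsigned long i = 0; i < ITERS; i++) {
		off_t off = (off_t)(rand_r(&seed) % nblocks) * BLOCK;

		if (pread(fd, buf, BLOCK, off) < 0)
			perror("pread");
	}
	close(fd);
	return NULL;
}

int main(int argc, char **argv)
{
	pthread_t tid[NTHREADS];
	long i;

	path = argc > 1 ? argv[1] : "testfile";
	for (i = 0; i < NTHREADS; i++)
		pthread_create(&tid[i], NULL, worker, (void *)i);
	for (i = 0; i < NTHREADS; i++)
		pthread_join(tid[i], NULL);
	return 0;
}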
* Re: Optimizing small reads 2025-10-13 15:35 ` Kiryl Shutsemau @ 2025-10-13 15:39 ` Kiryl Shutsemau 2025-10-13 16:19 ` Linus Torvalds 2025-10-13 16:06 ` Linus Torvalds 2025-10-13 17:26 ` Theodore Ts'o 2 siblings, 1 reply; 24+ messages in thread From: Kiryl Shutsemau @ 2025-10-13 15:39 UTC (permalink / raw) To: Linus Torvalds; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Mon, Oct 13, 2025 at 04:35:20PM +0100, Kiryl Shutsemau wrote: > On Fri, Oct 10, 2025 at 10:51:40AM -0700, Linus Torvalds wrote: > > Sounds like a plan? > > The patch is below. Can I use your Signed-off-by for it? And, for archiving purposes, here is the last version of the patch that supports large blocks. Do you think it makes sense to submit unsafe_copy_to_user() optimization as a standalone thingy? diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 3a7755c1a441..48bd31bac20e 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -607,15 +607,24 @@ _label: \ len -= sizeof(type); \ } -#define unsafe_copy_to_user(_dst,_src,_len,label) \ -do { \ - char __user *__ucu_dst = (_dst); \ - const char *__ucu_src = (_src); \ - size_t __ucu_len = (_len); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ - unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ +#define unsafe_copy_to_user(_dst,_src,_len,label) \ +do { \ + char __user *__ucu_dst = (_dst); \ + const char *__ucu_src = (_src); \ + size_t __ucu_len = (_len); \ + if (cpu_feature_enabled(X86_FEATURE_FSRM)) { \ + asm goto( \ + "1: rep movsb\n" \ + _ASM_EXTABLE_UA(1b, %l[label]) \ + : "+D" (__ucu_dst), "+S" (__ucu_src), \ + "+c" (__ucu_len) \ + : : "memory" : label); \ + } else { \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ + unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ + } \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT diff --git a/fs/inode.c b/fs/inode.c index ec9339024ac3..52163d28d630 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -482,6 +482,8 @@ EXPORT_SYMBOL(inc_nlink); static void __address_space_init_once(struct address_space *mapping) { xa_init_flags(&mapping->i_pages, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); + seqcount_spinlock_init(&mapping->i_pages_delete_seqcnt, + &mapping->i_pages->xa_lock); init_rwsem(&mapping->i_mmap_rwsem); INIT_LIST_HEAD(&mapping->i_private_list); spin_lock_init(&mapping->i_private_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 9e9d7c757efe..a900214f0f3a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -522,6 +522,7 @@ struct address_space { struct list_head i_private_list; struct rw_semaphore i_mmap_rwsem; void * i_private_data; + seqcount_spinlock_t i_pages_delete_seqcnt; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but diff --git a/mm/filemap.c b/mm/filemap.c index 751838ef05e5..08ace2cca696 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -138,8 +138,10 @@ static void page_cache_delete(struct address_space *mapping, VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + write_seqcount_begin(&mapping->i_pages_delete_seqcnt); xas_store(&xas, shadow); xas_init_marks(&xas); + 
write_seqcount_end(&mapping->i_pages_delete_seqcnt); folio->mapping = NULL; /* Leave folio->index set: truncation lookup relies upon it */ @@ -2659,41 +2661,132 @@ static void filemap_end_dropbehind_read(struct folio *folio) } } -/** - * filemap_read - Read data from the page cache. - * @iocb: The iocb to read. - * @iter: Destination for the data. - * @already_read: Number of bytes already read by the caller. - * - * Copies data from the page cache. If the data is not currently present, - * uses the readahead and read_folio address_space operations to fetch it. - * - * Return: Total number of bytes copied, including those already read by - * the caller. If an error happens before any bytes are copied, returns - * a negative error number. - */ -ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, - ssize_t already_read) +static size_t inner_read_loop(struct kiocb *iocb, struct folio *folio, + void __user *dst, size_t dst_size, + char *buffer, size_t buffer_size, + struct address_space *mapping, unsigned int seq) +{ + size_t read = 0; + + if (can_do_masked_user_access()) + dst = masked_user_access_begin(dst); + else if (!user_access_begin(dst, dst_size)) + return 0; + + do { + size_t to_read = min(dst_size, buffer_size); + + to_read = memcpy_from_file_folio(buffer, folio, iocb->ki_pos, to_read); + + /* Give up and go to slow path if raced with page_cache_delete() */ + if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq)) + break; + + unsafe_copy_to_user(dst, buffer, to_read, Efault); + + dst += to_read; + dst_size -= to_read; + + iocb->ki_pos += to_read; + read += to_read; + } while (dst_size && iocb->ki_pos % folio_size(folio)); + +Efault: + user_access_end(); + return read; +} + +static bool noinline filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter, + char *buffer, size_t buffer_size, + ssize_t *already_read) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct file_ra_state *ra = &iocb->ki_filp->f_ra; + loff_t last_pos = ra->prev_pos; + struct folio *folio; + loff_t file_size; + unsigned int seq; + + /* Don't bother with flush_dcache_folio() */ + if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE) + return false; + + if (!iter_is_ubuf(iter)) + return false; + + /* Give up and go to slow path if raced with page_cache_delete() */ + if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq)) + return false; + + rcu_read_lock(); + pagefault_disable(); + + do { + size_t to_read, read; + void __user *dst; + XA_STATE(xas, &mapping->i_pages, iocb->ki_pos >> PAGE_SHIFT); + + xas_reset(&xas); + folio = xas_load(&xas); + if (xas_retry(&xas, folio)) + break; + + if (!folio || xa_is_value(folio)) + break; + + if (!folio_test_uptodate(folio)) + break; + + /* No fast-case if readahead is supposed to started */ + if (folio_test_readahead(folio)) + break; + /* .. 
or mark it accessed */ + if (!folio_test_referenced(folio)) + break; + + /* i_size check must be after folio_test_uptodate() */ + file_size = i_size_read(mapping->host); + + if (unlikely(iocb->ki_pos >= file_size)) + break; + file_size -= iocb->ki_pos; + to_read = iov_iter_count(iter); + if (to_read > file_size) + to_read = file_size; + + dst = iter->ubuf + iter->iov_offset; + read = inner_read_loop(iocb, folio, + dst, to_read, buffer, buffer_size, + mapping, seq); + if (!read) + break; + iter->iov_offset += read; + iter->count -= read; + *already_read += read; + last_pos = iocb->ki_pos; + } while (iov_iter_count(iter)); + + pagefault_enable(); + rcu_read_unlock(); + + file_accessed(iocb->ki_filp); + ra->prev_pos = last_pos; + return !iov_iter_count(iter); +} + +static ssize_t filemap_read_slow(struct kiocb *iocb, struct iov_iter *iter, + struct folio_batch *fbatch, ssize_t already_read) { struct file *filp = iocb->ki_filp; struct file_ra_state *ra = &filp->f_ra; struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; - struct folio_batch fbatch; int i, error = 0; bool writably_mapped; loff_t isize, end_offset; loff_t last_pos = ra->prev_pos; - if (unlikely(iocb->ki_pos < 0)) - return -EINVAL; - if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) - return 0; - if (unlikely(!iov_iter_count(iter))) - return 0; - - iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); - folio_batch_init(&fbatch); + folio_batch_init(fbatch); do { cond_resched(); @@ -2709,7 +2802,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, if (unlikely(iocb->ki_pos >= i_size_read(inode))) break; - error = filemap_get_pages(iocb, iter->count, &fbatch, false); + error = filemap_get_pages(iocb, iter->count, fbatch, false); if (error < 0) break; @@ -2737,11 +2830,11 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, * mark it as accessed the first time. */ if (!pos_same_folio(iocb->ki_pos, last_pos - 1, - fbatch.folios[0])) - folio_mark_accessed(fbatch.folios[0]); + fbatch->folios[0])) + folio_mark_accessed(fbatch->folios[0]); - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; size_t fsize = folio_size(folio); size_t offset = iocb->ki_pos & (fsize - 1); size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos, @@ -2772,19 +2865,57 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } } put_folios: - for (i = 0; i < folio_batch_count(&fbatch); i++) { - struct folio *folio = fbatch.folios[i]; + for (i = 0; i < folio_batch_count(fbatch); i++) { + struct folio *folio = fbatch->folios[i]; filemap_end_dropbehind_read(folio); folio_put(folio); } - folio_batch_init(&fbatch); + folio_batch_init(fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); file_accessed(filp); ra->prev_pos = last_pos; return already_read ? already_read : error; } + +/** + * filemap_read - Read data from the page cache. + * @iocb: The iocb to read. + * @iter: Destination for the data. + * @already_read: Number of bytes already read by the caller. + * + * Copies data from the page cache. If the data is not currently present, + * uses the readahead and read_folio address_space operations to fetch it. + * + * Return: Total number of bytes copied, including those already read by + * the caller. If an error happens before any bytes are copied, returns + * a negative error number. 
+ */ +ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, + ssize_t already_read) +{ + struct inode *inode = iocb->ki_filp->f_mapping->host; + union { + struct folio_batch fbatch; + __DECLARE_FLEX_ARRAY(char, buffer); + //char __buffer[4096]; + } area __uninitialized; + + if (unlikely(iocb->ki_pos < 0)) + return -EINVAL; + if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) + return 0; + if (unlikely(!iov_iter_count(iter))) + return 0; + + iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); + + if (filemap_read_fast(iocb, iter, area.buffer, sizeof(area), &already_read)) + return already_read; + + return filemap_read_slow(iocb, iter, &area.fbatch, already_read); +} EXPORT_SYMBOL_GPL(filemap_read); int kiocb_write_and_wait(struct kiocb *iocb, size_t count) -- Kiryl Shutsemau / Kirill A. Shutemov ^ permalink raw reply related [flat|nested] 24+ messages in thread
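Both versions of the patch hinge on the same optimistic scheme: page_cache_delete() bumps a seqcount while it removes the folio from the xarray, and the lockless reader copies the data first and only afterwards checks that no deletion ran underneath it. The skeleton below is a condensed, non-compilable sketch of that pairing, pulled together from the hunks above, with the folio checks, EOF handling and locking context elided (the earlier version uses read_seqcount_begin() instead of the try variant):

	/* writer side: page_cache_delete(), under the i_pages xarray lock */
	write_seqcount_begin(&mapping->i_pages_delete_seqcnt);
	xas_store(&xas, shadow);		/* folio leaves the page cache */
	xas_init_marks(&xas);
	write_seqcount_end(&mapping->i_pages_delete_seqcnt);

	/* reader side: the lockless fast path */
	if (!raw_seqcount_try_begin(&mapping->i_pages_delete_seqcnt, seq))
		return false;			/* deletion in flight: use the slow path */

	rcu_read_lock();
	folio = xas_load(&xas);			/* may be going away; RCU keeps the memory valid */
	/* ... uptodate/readahead/referenced/i_size checks ... */
	memcpy_from_file_folio(buffer, folio, pos, size);

	copied = 0;
	if (!read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
		copied = size;			/* copy is only good if no deletion raced */
	rcu_read_unlock();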
* Re: Optimizing small reads 2025-10-13 15:39 ` Kiryl Shutsemau @ 2025-10-13 16:19 ` Linus Torvalds 2025-10-14 12:58 ` Kiryl Shutsemau 0 siblings, 1 reply; 24+ messages in thread From: Linus Torvalds @ 2025-10-13 16:19 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Mon, 13 Oct 2025 at 08:39, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > And, for archiving purposes, here is the last version of the patch that > supports large blocks. > > Do you think it makes sense to submit unsafe_copy_to_user() optimization > as a standalone thingy? Without a user, I'm not convinced. I think right now there are zero cases of unsafe_copy_to_user() that are large enough for this to matter. It looks like we have exactly two cases; the readdir() case I knew about (because I did that one), and one other user in put_cmsg(), which was introduced to networking with the commit message being "Calling two copy_to_user() for very small regions has very high overhead" so that one is very small too. HOWEVER. What I like in this patch is how you split up filemap_read() itself further, and I think that part might be worth it, except I think you did the "noinline" parts wrong: > +static bool noinline filemap_read_fast(struct kiocb *iocb, struct iov_iter *iter, ... > +static ssize_t filemap_read_slow(struct kiocb *iocb, struct iov_iter *iter, ... > +ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, > + ssize_t already_read) > +{ > + struct inode *inode = iocb->ki_filp->f_mapping->host; > + union { > + struct folio_batch fbatch; > + __DECLARE_FLEX_ARRAY(char, buffer); > + //char __buffer[4096]; > + } area __uninitialized; > + > + if (unlikely(iocb->ki_pos < 0)) > + return -EINVAL; > + if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes)) > + return 0; > + if (unlikely(!iov_iter_count(iter))) > + return 0; > + > + iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); > + > + if (filemap_read_fast(iocb, iter, area.buffer, sizeof(area), &already_read)) > + return already_read; > + > + return filemap_read_slow(iocb, iter, &area.fbatch, already_read); > +} > EXPORT_SYMBOL_GPL(filemap_read); Look, the reason you did this was presumably for stack usage when you have a big 4k allocation, but the part you *really* needed to protect was filemap_read_slow() that then has much deeper call chains. So filemap_read_slow() should *not* ever see the big 4k stack slot that it never needs or uses, and that eats into our fairly small kernel stack. BUT with this organization, what you could have done is: - don't share the buffers between filemap_read_slow/filemap_read_fast at all, so the 'union' trick with a flex array etc would go away - both filemap_read_slow and filemap_read_fast would be 'noinline' so that they don't share a stack frame - filemap_read_fast could have the on-stack byte buffer - I think 4k is still excessive for on-stack, but it could certainly be larger than 256 bytes - filemap_read_slow would have just the 'struct folio_batch' thing. Anyway, I think *that* part of this patch might actually be worth it independently of anything else. Of course, that re-organization does cause that extra level of function calls in order to not have excessive stack usage, and the CPU errata can make 'ret' instructions expensive, but whatever. So I don't know. 
I like how it splits up those two cases more clearly and makes the baseline 'filemap_read()' much simpler and very straightforward, and I like how it also splits up the stack usage and would make that union trick unnecessary. But it's not a big deal either way. Linus ^ permalink raw reply [flat|nested] 24+ messages in thread
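For concreteness, a hypothetical sketch of the organization being suggested here -- two noinline helpers that each own only the stack space they actually use, and a thin filemap_read() that does the common checks and dispatches. The signatures, the 1k buffer size and the elided bodies are placeholders, not the real implementation:

static noinline bool filemap_read_fast(struct kiocb *iocb,
				       struct iov_iter *iter,
				       ssize_t *already_read)
{
	char buffer[1024];		/* fast path's private bounce buffer */

	/* ... lockless RCU + seqcount copy as in the patch above ... */
	return false;			/* anything non-trivial: fall back */
}

static noinline ssize_t filemap_read_slow(struct kiocb *iocb,
					  struct iov_iter *iter,
					  ssize_t already_read)
{
	struct folio_batch fbatch;	/* slow path only needs the batch */

	folio_batch_init(&fbatch);
	/* ... existing filemap_get_pages() loop ... */
	return already_read;
}

ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
		     ssize_t already_read)
{
	/* ... common ki_pos/s_maxbytes/iov_iter_count checks ... */
	if (filemap_read_fast(iocb, iter, &already_read))
		return already_read;
	return filemap_read_slow(iocb, iter, already_read);
}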
* Re: Optimizing small reads
  2025-10-13 16:19 ` Linus Torvalds
@ 2025-10-14 12:58 ` Kiryl Shutsemau
  0 siblings, 0 replies; 24+ messages in thread
From: Kiryl Shutsemau @ 2025-10-14 12:58 UTC (permalink / raw)
  To: Linus Torvalds; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel

On Mon, Oct 13, 2025 at 09:19:47AM -0700, Linus Torvalds wrote:
> - both filemap_read_slow and filemap_read_fast would be 'noinline' so
> that they don't share a stack frame

clang-19 actually does a pretty good job of re-using stack space without
an additional function call.

noinline:

../mm/filemap.c:2883:filemap_read 32 static
../mm/filemap.c:2714:filemap_read_fast 392 static
../mm/filemap.c:2763:filemap_read_slow 408 static

no modifiers:

../mm/filemap.c:2883:filemap_read 456 static

And if we increase the buffer size to 1k, Clang uninlines it:

../mm/filemap.c:2870:9:filemap_read 32 static
../mm/filemap.c:2714:13:filemap_read_fast 1168 static
../mm/filemap.c:2750:16:filemap_read_slow 384 static

gcc-14, on the other hand, doesn't want to inline these functions, even
with 'inline' specified. And '__always_inline' doesn't look good.

no modifiers / inline:

../mm/filemap.c:2883:9:filemap_read 32 static
../mm/filemap.c:2714:13:filemap_read_fast 400 static
../mm/filemap.c:2763:16:filemap_read_slow 384 static

__always_inline:

../mm/filemap.c:2883:9:filemap_read 696 static

There's room for improvement for GCC.

I am inclined to leave it without modifiers. It gives reasonable results
for both compilers.

-- 
  Kiryl Shutsemau / Kirill A. Shutemov

^ permalink raw reply	[flat|nested] 24+ messages in thread
* Re: Optimizing small reads 2025-10-13 15:35 ` Kiryl Shutsemau 2025-10-13 15:39 ` Kiryl Shutsemau @ 2025-10-13 16:06 ` Linus Torvalds 2025-10-13 17:26 ` Theodore Ts'o 2 siblings, 0 replies; 24+ messages in thread From: Linus Torvalds @ 2025-10-13 16:06 UTC (permalink / raw) To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Mon, 13 Oct 2025 at 08:35, Kiryl Shutsemau <kirill@shutemov.name> wrote: > > The patch is below. Can I use your Signed-off-by for it? Sure. But you can also just take ownership. > Here's some numbers. I cannot explain some of this. Like, there should > be zero changes for block size above 256, but... I'd assume it's the usual "random cache/code placement" thing. The numbers seem reasonably consistent, and guessing from your table you did basically a "random file / random offset" thing. And even if you go "but with random files and random offsets, cache placement should be random too", that's not true for the I$ side. So you could have some unlucky placement of the helper routines (like the iov stuff) that hits one case but not the other. > 16 threads, reads from 4k file(s), MiB/s > > --------------------------------------------------------- > | Block | Baseline | Baseline | Patched | Patched | > | size | same file | diff files | same file | diff files | > --------------------------------------------------------- > | 1 | 11.6 | 27.6 | 33.5 | 33.4 | > | 32 | 375 | 880 | 1027 | 1028 | > | 256 | 2940 | 6932 | 7872 | 7884 | > | 1024 | 11500 | 26900 | 11400 | 28300 | > | 4096 | 46500 | 103000 | 45700 | 108000 | The high-level pattern is fairly clear, with the big improvement being in the "small reads hitting same file", so at least the numbers don't look crazy. Linus ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: Optimizing small reads 2025-10-13 15:35 ` Kiryl Shutsemau 2025-10-13 15:39 ` Kiryl Shutsemau 2025-10-13 16:06 ` Linus Torvalds @ 2025-10-13 17:26 ` Theodore Ts'o 2025-10-14 3:20 ` Theodore Ts'o 2 siblings, 1 reply; 24+ messages in thread From: Theodore Ts'o @ 2025-10-13 17:26 UTC (permalink / raw) To: Kiryl Shutsemau Cc: Linus Torvalds, Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Mon, Oct 13, 2025 at 04:35:17PM +0100, Kiryl Shutsemau wrote: > I also need to run xfstests. Unless someone can help me with this? > I don't have ready-to-go setup. Sure, I've kicked off xfstests with your patch for ext4 and xfs. Results should be available in 2-3 hours. - Ted ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: Optimizing small reads
  2025-10-13 17:26 ` Theodore Ts'o
@ 2025-10-14 3:20 ` Theodore Ts'o
  0 siblings, 0 replies; 24+ messages in thread
From: Theodore Ts'o @ 2025-10-14 3:20 UTC (permalink / raw)
  To: Kiryl Shutsemau
  Cc: Linus Torvalds, Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel

On Mon, Oct 13, 2025 at 01:26:54PM -0400, Theodore Ts'o wrote:
> On Mon, Oct 13, 2025 at 04:35:17PM +0100, Kiryl Shutsemau wrote:
> > I also need to run xfstests. Unless someone can help me with this?
> > I don't have ready-to-go setup.
>
> Sure, I've kicked off xfstests with your patch for ext4 and xfs.
> Results should be available in 2-3 hours.

No xfstests regressions were noted for ext4; the xfs test run didn't get
started due to a hiccup. I've restarted it, and can let you know
tomorrow morning.

					- Ted

^ permalink raw reply	[flat|nested] 24+ messages in thread
* Re: Optimizing small reads 2025-10-07 22:54 ` Linus Torvalds 2025-10-07 23:30 ` Linus Torvalds @ 2025-10-08 10:28 ` Kiryl Shutsemau 2025-10-08 16:24 ` Linus Torvalds 1 sibling, 1 reply; 24+ messages in thread From: Kiryl Shutsemau @ 2025-10-08 10:28 UTC (permalink / raw) To: Linus Torvalds; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel On Tue, Oct 07, 2025 at 03:54:19PM -0700, Linus Torvalds wrote: > On Tue, 7 Oct 2025 at 15:35, Linus Torvalds > <torvalds@linux-foundation.org> wrote: > > > > But I think I'll try to boot it next. Wish me luck. > > Bah. It boots - after you fix the stupid double increment of 'already_copied'. > > I didn't remove the update inside the loop when I made it update it > after the loop. > > So here's the slightly fixed patch that actually does boot - and that > I'm running right now. But I wouldn't call it exactly "tested". > > Caveat patchor. My take on the same is below. The biggest difference is that I drop RCU lock between iterations. But as you said, not sure if it is sensible. It allows page faults. Other thing that I noticed that we don't do flush_dcache_folio() in fast path. I bypassed fast path if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE. > mm/filemap.c | 33 +++++++++++++++++++++++++-------- > 1 file changed, 25 insertions(+), 8 deletions(-) > > diff --git a/mm/filemap.c b/mm/filemap.c > index 60a7b9275741..ba11f018ca6b 100644 > --- a/mm/filemap.c > +++ b/mm/filemap.c > @@ -2792,20 +2792,37 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, > * any compiler initialization would be pointless since this > * can fill it will garbage. > */ > - if (iov_iter_count(iter) <= sizeof(area)) { > + if (iov_iter_count(iter) <= PAGE_SIZE) { PAGE_SIZE is somewhat arbitrary here. We might want to see if we can do full length (or until the first failure). But holding RCU read lock whole time might not be a good idea in this case. > size_t count = iov_iter_count(iter); > + size_t fast_read = 0; > > /* Let's see if we can just do the read under RCU */ > rcu_read_lock(); > - count = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, count); > + pagefault_disable(); > + do { > + size_t copied = min(count, sizeof(area)); > + > + copied = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, copied); > + if (!copied) > + break; filemap_fast_read() will only read short on EOF. So if it reads short we don't need additional iterations. > + copied = copy_to_iter(area.buffer, copied, iter); > + if (!copied) > + break; > + fast_read += copied; > + iocb->ki_pos += copied; > + count -= copied; > + } while (count); > + pagefault_enable(); > rcu_read_unlock(); > - if (count) { > - size_t copied = copy_to_iter(area.buffer, count, iter); > - if (unlikely(!copied)) > - return already_read ? already_read : -EFAULT; > - ra->prev_pos = iocb->ki_pos += copied; > + > + if (fast_read) { > + ra->prev_pos += fast_read; > + already_read += fast_read; > file_accessed(filp); > - return copied + already_read; > + > + /* All done? */ > + if (!count) > + return already_read; > } > } > diff --git a/mm/filemap.c b/mm/filemap.c index d9fda3c3ae2c..6b9627cf47af 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2752,29 +2752,48 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, iov_iter_truncate(iter, inode->i_sb->s_maxbytes - iocb->ki_pos); + /* Don't bother with flush_dcache_folio() */ + if (ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE) + goto slowpath; + /* * Try a quick lockless read into the 'area' union. 
Note that * this union is intentionally marked "__uninitialized", because * any compiler initialization would be pointless since this * can fill it will garbage. */ - if (iov_iter_count(iter) <= sizeof(area)) { - size_t count = iov_iter_count(iter); + do { + size_t to_read, read, copied; + + to_read = min(iov_iter_count(iter), sizeof(area)); /* Let's see if we can just do the read under RCU */ rcu_read_lock(); - count = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, count); + read = filemap_fast_read(mapping, iocb->ki_pos, area.buffer, to_read); rcu_read_unlock(); - if (count) { - size_t copied = copy_to_iter(area.buffer, count, iter); - if (unlikely(!copied)) - return already_read ? already_read : -EFAULT; - ra->prev_pos = iocb->ki_pos += copied; - file_accessed(filp); - return copied + already_read; - } - } + if (!read) + break; + + copied = copy_to_iter(area.buffer, read, iter); + + already_read += copied; + iocb->ki_pos += copied; + last_pos = iocb->ki_pos; + + if (copied < read) { + error = -EFAULT; + goto out; + } + + /* filemap_fast_read() only reads short at EOF: Stop. */ + if (read != to_read) + goto out; + } while (iov_iter_count(iter)); + + if (!iov_iter_count(iter)) + goto out; +slowpath: /* * This actually properly initializes the fbatch for the slow case */ @@ -2865,7 +2884,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter, } folio_batch_init(&area.fbatch); } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error); - +out: file_accessed(filp); ra->prev_pos = last_pos; return already_read ? already_read : error; -- Kiryl Shutsemau / Kirill A. Shutemov ^ permalink raw reply related [flat|nested] 24+ messages in thread
* Re: Optimizing small reads
  2025-10-08 10:28 ` Kiryl Shutsemau
@ 2025-10-08 16:24 ` Linus Torvalds
  0 siblings, 0 replies; 24+ messages in thread
From: Linus Torvalds @ 2025-10-08 16:24 UTC (permalink / raw)
  To: Kiryl Shutsemau; +Cc: Matthew Wilcox, Luis Chamberlain, Linux-MM, linux-fsdevel

On Wed, 8 Oct 2025 at 03:28, Kiryl Shutsemau <kirill@shutemov.name> wrote:
>
> PAGE_SIZE is somewhat arbitrary here. We might want to see if we can do
> full length (or until the first failure). But holding RCU read lock whole
> time might not be a good idea in this case.

There are other considerations too. If you do large reads, then you
might want to do the gang lookup of pages etc. This low-latency path is
unlikely to be the best one for big reads, in other words.

I agree that the exact point where you do that is pretty arbitrary.

> filemap_fast_read() will only read short on EOF. So if it reads short we
> don't need additional iterations.

That's only true in the current stupid implementation. If we actually do
page-crossing reads etc, then filemap_fast_read() will return partial
reads when it hits a folio end etc.

Right now it does that

	/* No partial reads */
	return 0;

but that's a *current* limitation and only makes sense in the "oh, we're
going to have to fall back to the slow case anyway" model.

In the "we'll try again" model it's actually not a sensible thing.

             Linus

^ permalink raw reply	[flat|nested] 24+ messages in thread
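A sketch of what that would mean in filemap_fast_read(), assuming the caller loops over folios: accept whatever memcpy_from_file_folio() managed to copy (it stops at the folio end) and hand the partial count back instead of rejecting it -- which is essentially what the 2025-10-13 version earlier on this page ends up doing:

	copied = memcpy_from_file_folio(buffer, folio, pos, size);
	if (!copied)
		return 0;		/* nothing copied at all: slow path */
	if (read_seqcount_retry(&mapping->i_pages_delete_seqcnt, seq))
		return 0;		/* raced with page_cache_delete() */
	return copied;			/* may be short at a folio boundary; caller iterates */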