- * [PATCH v2 01/18] mm/filemap: Rename generic_file_buffered_read subfunctions
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-06  8:07   ` Christoph Hellwig
  2020-11-04 20:42 ` [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read Matthew Wilcox (Oracle)
                   ` (16 subsequent siblings)
  17 siblings, 1 reply; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
The recent split of generic_file_buffered_read() created some very
long function names which are hard to distinguish from each other.
Rename as follows:
generic_file_buffered_read_readpage -> filemap_read_page
generic_file_buffered_read_pagenotuptodate -> filemap_update_page
generic_file_buffered_read_no_cached_page -> filemap_create_page
generic_file_buffered_read_get_pages -> filemap_get_pages
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 mm/filemap.c | 44 +++++++++++++++-----------------------------
 1 file changed, 15 insertions(+), 29 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index a68516ddeddc..23e3781b3aef 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2176,11 +2176,8 @@ static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
 		return lock_page_killable(page);
 }
 
-static struct page *
-generic_file_buffered_read_readpage(struct kiocb *iocb,
-				    struct file *filp,
-				    struct address_space *mapping,
-				    struct page *page)
+static struct page *filemap_read_page(struct kiocb *iocb, struct file *filp,
+		struct address_space *mapping, struct page *page)
 {
 	struct file_ra_state *ra = &filp->f_ra;
 	int error;
@@ -2231,12 +2228,9 @@ generic_file_buffered_read_readpage(struct kiocb *iocb,
 	return page;
 }
 
-static struct page *
-generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb,
-					   struct file *filp,
-					   struct iov_iter *iter,
-					   struct page *page,
-					   loff_t pos, loff_t count)
+static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp,
+		struct iov_iter *iter, struct page *page, loff_t pos,
+		loff_t count)
 {
 	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
@@ -2299,12 +2293,11 @@ generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb,
 		return page;
 	}
 
-	return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+	return filemap_read_page(iocb, filp, mapping, page);
 }
 
-static struct page *
-generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
-					  struct iov_iter *iter)
+static struct page *filemap_create_page(struct kiocb *iocb,
+		struct iov_iter *iter)
 {
 	struct file *filp = iocb->ki_filp;
 	struct address_space *mapping = filp->f_mapping;
@@ -2315,10 +2308,6 @@ generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
 	if (iocb->ki_flags & IOCB_NOIO)
 		return ERR_PTR(-EAGAIN);
 
-	/*
-	 * Ok, it wasn't cached, so we need to create a new
-	 * page..
-	 */
 	page = page_cache_alloc(mapping);
 	if (!page)
 		return ERR_PTR(-ENOMEM);
@@ -2330,13 +2319,11 @@ generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
 		return error != -EEXIST ? ERR_PTR(error) : NULL;
 	}
 
-	return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+	return filemap_read_page(iocb, filp, mapping, page);
 }
 
-static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
-						struct iov_iter *iter,
-						struct page **pages,
-						unsigned int nr)
+static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
+		struct page **pages, unsigned int nr)
 {
 	struct file *filp = iocb->ki_filp;
 	struct address_space *mapping = filp->f_mapping;
@@ -2363,7 +2350,7 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
 	if (nr_got)
 		goto got_pages;
 
-	pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
+	pages[0] = filemap_create_page(iocb, iter);
 	err = PTR_ERR_OR_ZERO(pages[0]);
 	if (!IS_ERR_OR_NULL(pages[0]))
 		nr_got = 1;
@@ -2397,8 +2384,8 @@ static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
 				break;
 			}
 
-			page = generic_file_buffered_read_pagenotuptodate(iocb,
-					filp, iter, page, pg_pos, pg_count);
+			page = filemap_update_page(iocb, filp, iter, page,
+					pg_pos, pg_count);
 			if (IS_ERR_OR_NULL(page)) {
 				for (j = i + 1; j < nr_got; j++)
 					put_page(pages[j]);
@@ -2474,8 +2461,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 			iocb->ki_flags |= IOCB_NOWAIT;
 
 		i = 0;
-		pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
-							     pages, nr_pages);
+		pg_nr = filemap_get_pages(iocb, iter, pages, nr_pages);
 		if (pg_nr < 0) {
 			error = pg_nr;
 			break;
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 01/18] mm/filemap: Rename generic_file_buffered_read subfunctions Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-04 21:30   ` Kent Overstreet
  2020-11-06  8:07   ` Christoph Hellwig
  2020-11-04 20:42 ` [PATCH v2 03/18] mm/filemap: Convert filemap_get_pages to take a pagevec Matthew Wilcox (Oracle)
                   ` (15 subsequent siblings)
  17 siblings, 2 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
Increasing the batch size runs into diminishing returns.  It's probably
better to make, eg, three calls to filemap_get_pages() than it is to
call into kmalloc().
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/filemap.c | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 23e3781b3aef..bb1c42d0223c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2429,8 +2429,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 	struct file_ra_state *ra = &filp->f_ra;
 	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
-	struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
-	unsigned int nr_pages = min_t(unsigned int, 512,
+	struct page *pages[PAGEVEC_SIZE];
+	unsigned int nr_pages = min_t(unsigned int, PAGEVEC_SIZE,
 			((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
 			(iocb->ki_pos >> PAGE_SHIFT));
 	int i, pg_nr, error = 0;
@@ -2441,14 +2441,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		return 0;
 	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
 
-	if (nr_pages > ARRAY_SIZE(pages_onstack))
-		pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
-
-	if (!pages) {
-		pages = pages_onstack;
-		nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
-	}
-
 	do {
 		cond_resched();
 
@@ -2533,9 +2525,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
 	file_accessed(filp);
 
-	if (pages != pages_onstack)
-		kfree(pages);
-
 	return written ? written : error;
 }
 EXPORT_SYMBOL_GPL(generic_file_buffered_read);
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * Re: [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read
  2020-11-04 20:42 ` [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read Matthew Wilcox (Oracle)
@ 2020-11-04 21:30   ` Kent Overstreet
  2020-11-04 21:43     ` Amy Parker
                       ` (2 more replies)
  2020-11-06  8:07   ` Christoph Hellwig
  1 sibling, 3 replies; 41+ messages in thread
From: Kent Overstreet @ 2020-11-04 21:30 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-fsdevel, linux-mm, hch
On Wed, Nov 04, 2020 at 08:42:03PM +0000, Matthew Wilcox (Oracle) wrote:
> Increasing the batch size runs into diminishing returns.  It's probably
> better to make, eg, three calls to filemap_get_pages() than it is to
> call into kmalloc().
I have to disagree. Working with PAGEVEC_SIZE pages is eventually going to be
like working with 4k pages today, and have you actually read the slub code for
the kmalloc fast path? It's _really_ fast, there's no atomic operations and it
doesn't even have to disable preemption - which is why you never see it showing
up in profiles ever since we switched to slub.
It would however be better to have a standard abstraction for this rather than
open coding it - perhaps adding it to the pagevec code. Please don't just drop
it, though.
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  mm/filemap.c | 15 ++-------------
>  1 file changed, 2 insertions(+), 13 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 23e3781b3aef..bb1c42d0223c 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2429,8 +2429,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
>  	struct file_ra_state *ra = &filp->f_ra;
>  	struct address_space *mapping = filp->f_mapping;
>  	struct inode *inode = mapping->host;
> -	struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
> -	unsigned int nr_pages = min_t(unsigned int, 512,
> +	struct page *pages[PAGEVEC_SIZE];
> +	unsigned int nr_pages = min_t(unsigned int, PAGEVEC_SIZE,
>  			((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
>  			(iocb->ki_pos >> PAGE_SHIFT));
>  	int i, pg_nr, error = 0;
> @@ -2441,14 +2441,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
>  		return 0;
>  	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
>  
> -	if (nr_pages > ARRAY_SIZE(pages_onstack))
> -		pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
> -
> -	if (!pages) {
> -		pages = pages_onstack;
> -		nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
> -	}
> -
>  	do {
>  		cond_resched();
>  
> @@ -2533,9 +2525,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
>  
>  	file_accessed(filp);
>  
> -	if (pages != pages_onstack)
> -		kfree(pages);
> -
>  	return written ? written : error;
>  }
>  EXPORT_SYMBOL_GPL(generic_file_buffered_read);
> -- 
> 2.28.0
> 
^ permalink raw reply	[flat|nested] 41+ messages in thread
- * Re: [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read
  2020-11-04 21:30   ` Kent Overstreet
@ 2020-11-04 21:43     ` Amy Parker
  2020-11-05  0:13     ` Matthew Wilcox
  2020-11-05  4:52     ` [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read Matthew Wilcox
  2 siblings, 0 replies; 41+ messages in thread
From: Amy Parker @ 2020-11-04 21:43 UTC (permalink / raw)
  To: Kent Overstreet
  Cc: Matthew Wilcox (Oracle), linux-fsdevel, Linux MM,
	Christoph Hellwig
On Wed, Nov 4, 2020 at 1:34 PM Kent Overstreet
<kent.overstreet@gmail.com> wrote:
>
> On Wed, Nov 04, 2020 at 08:42:03PM +0000, Matthew Wilcox (Oracle) wrote:
> > Increasing the batch size runs into diminishing returns.  It's probably
> > better to make, eg, three calls to filemap_get_pages() than it is to
> > call into kmalloc().
>
> I have to disagree. Working with PAGEVEC_SIZE pages is eventually going to be
> like working with 4k pages today, and have you actually read the slub code for
> the kmalloc fast path? It's _really_ fast, there's no atomic operations and it
> doesn't even have to disable preemption - which is why you never see it showing
> up in profiles ever since we switched to slub.
Yeah, kmalloc's fast path is extremely fast. Let's stay off
PAGEVEC_SIZE for now.
>
> It would however be better to have a standard abstraction for this rather than
> open coding it - perhaps adding it to the pagevec code. Please don't just drop
> it, though.
>
>
> > Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> > ---
> >  mm/filemap.c | 15 ++-------------
> >  1 file changed, 2 insertions(+), 13 deletions(-)
> >
> > diff --git a/mm/filemap.c b/mm/filemap.c
> > index 23e3781b3aef..bb1c42d0223c 100644
> > --- a/mm/filemap.c
> > +++ b/mm/filemap.c
> > @@ -2429,8 +2429,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
> >       struct file_ra_state *ra = &filp->f_ra;
> >       struct address_space *mapping = filp->f_mapping;
> >       struct inode *inode = mapping->host;
> > -     struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
> > -     unsigned int nr_pages = min_t(unsigned int, 512,
> > +     struct page *pages[PAGEVEC_SIZE];
> > +     unsigned int nr_pages = min_t(unsigned int, PAGEVEC_SIZE,
> >                       ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
> >                       (iocb->ki_pos >> PAGE_SHIFT));
> >       int i, pg_nr, error = 0;
> > @@ -2441,14 +2441,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
> >               return 0;
> >       iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
> >
> > -     if (nr_pages > ARRAY_SIZE(pages_onstack))
> > -             pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
> > -
> > -     if (!pages) {
> > -             pages = pages_onstack;
> > -             nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
> > -     }
> > -
> >       do {
> >               cond_resched();
> >
> > @@ -2533,9 +2525,6 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
> >
> >       file_accessed(filp);
> >
> > -     if (pages != pages_onstack)
> > -             kfree(pages);
> > -
> >       return written ? written : error;
> >  }
> >  EXPORT_SYMBOL_GPL(generic_file_buffered_read);
> > --
> > 2.28.0
> >
Best regards,
Amy Parker
(they/them)
^ permalink raw reply	[flat|nested] 41+ messages in thread
- * Re: [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read
  2020-11-04 21:30   ` Kent Overstreet
  2020-11-04 21:43     ` Amy Parker
@ 2020-11-05  0:13     ` Matthew Wilcox
  2020-11-06  8:08       ` Christoph Hellwig
  2020-11-05  4:52     ` [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read Matthew Wilcox
  2 siblings, 1 reply; 41+ messages in thread
From: Matthew Wilcox @ 2020-11-05  0:13 UTC (permalink / raw)
  To: Kent Overstreet; +Cc: linux-fsdevel, linux-mm, hch
On Wed, Nov 04, 2020 at 04:30:05PM -0500, Kent Overstreet wrote:
> On Wed, Nov 04, 2020 at 08:42:03PM +0000, Matthew Wilcox (Oracle) wrote:
> > Increasing the batch size runs into diminishing returns.  It's probably
> > better to make, eg, three calls to filemap_get_pages() than it is to
> > call into kmalloc().
> 
> I have to disagree. Working with PAGEVEC_SIZE pages is eventually going to be
> like working with 4k pages today, and have you actually read the slub code for
> the kmalloc fast path? It's _really_ fast, there's no atomic operations and it
> doesn't even have to disable preemption - which is why you never see it showing
> up in profiles ever since we switched to slub.
> 
> It would however be better to have a standard abstraction for this rather than
> open coding it - perhaps adding it to the pagevec code. Please don't just drop
> it, though.
I have the beginnings of a patch for that, but I got busy with other stuff
and didn't finish it.
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 081d934eda64..b067d8aab874 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -18,13 +18,21 @@ struct page;
 struct address_space;
 
 struct pagevec {
-	unsigned char nr;
-	bool percpu_pvec_drained;
-	struct page *pages[PAGEVEC_SIZE];
+	union {
+		struct {
+			unsigned char sz;
+			unsigned char nr;
+			bool percpu_pvec_drained;
+			struct page *pages[];
+		};
+		void *__p[PAGEVEC_SIZE + 1];
+	};
 };
 
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
+struct pagevec *pagevec_alloc(unsigned int sz, gfp_t gfp);
+void pagevec_free(struct pagevec *);
 unsigned pagevec_lookup_entries(struct pagevec *pvec,
 				struct address_space *mapping,
 				pgoff_t start, unsigned nr_entries,
@@ -54,6 +62,7 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
 
 static inline void pagevec_init(struct pagevec *pvec)
 {
+	pvec->sz = PAGEVEC_SIZE;
 	pvec->nr = 0;
 	pvec->percpu_pvec_drained = false;
 }
@@ -63,6 +72,11 @@ static inline void pagevec_reinit(struct pagevec *pvec)
 	pvec->nr = 0;
 }
 
+static inline unsigned pagevec_size(struct pagevec *pvec)
+{
+	return pvec->sz;
+}
+
 static inline unsigned pagevec_count(struct pagevec *pvec)
 {
 	return pvec->nr;
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * Re: [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read
  2020-11-05  0:13     ` Matthew Wilcox
@ 2020-11-06  8:08       ` Christoph Hellwig
  2020-11-06 12:30         ` [PATCH 1/4] pagevec: Allow pagevecs to be different sizes Matthew Wilcox (Oracle)
  0 siblings, 1 reply; 41+ messages in thread
From: Christoph Hellwig @ 2020-11-06  8:08 UTC (permalink / raw)
  To: Matthew Wilcox; +Cc: Kent Overstreet, linux-fsdevel, linux-mm, hch
On Thu, Nov 05, 2020 at 12:13:02AM +0000, Matthew Wilcox wrote:
> I have the beginnings of a patch for that, but I got busy with other stuff
> and didn't finish it.
If we have numbers that we want larger batch sizes than the current
pagevec PAGEVEC_SIZE this is the way to go insted of special casing
it in one path.
^ permalink raw reply	[flat|nested] 41+ messages in thread 
- * [PATCH 1/4] pagevec: Allow pagevecs to be different sizes
  2020-11-06  8:08       ` Christoph Hellwig
@ 2020-11-06 12:30         ` Matthew Wilcox (Oracle)
  2020-11-06 12:30           ` [PATCH 2/4] pagevec: Increase the size of LRU pagevecs Matthew Wilcox (Oracle)
                             ` (3 more replies)
  0 siblings, 4 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-06 12:30 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm, hch, kent.overstreet; +Cc: Matthew Wilcox (Oracle)
Declaring a pagevec continues to create a pagevec which is the same size,
but functions which manipulate pagevecs no longer rely on this.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/pagevec.h | 20 ++++++++++++++++----
 mm/swap.c               |  8 ++++++++
 2 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 875a3f0d9dd2..ee5d3c4da8da 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -18,9 +18,15 @@ struct page;
 struct address_space;
 
 struct pagevec {
-	unsigned char nr;
-	bool percpu_pvec_drained;
-	struct page *pages[PAGEVEC_SIZE];
+	union {
+		struct {
+			unsigned char sz;
+			unsigned char nr;
+			bool percpu_pvec_drained;
+			struct page *pages[PAGEVEC_SIZE];
+		};
+		void *__p[PAGEVEC_SIZE + 1];
+	};
 };
 
 void __pagevec_release(struct pagevec *pvec);
@@ -41,6 +47,7 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
 
 static inline void pagevec_init(struct pagevec *pvec)
 {
+	pvec->sz = PAGEVEC_SIZE;
 	pvec->nr = 0;
 	pvec->percpu_pvec_drained = false;
 }
@@ -50,6 +57,11 @@ static inline void pagevec_reinit(struct pagevec *pvec)
 	pvec->nr = 0;
 }
 
+static inline unsigned pagevec_size(struct pagevec *pvec)
+{
+	return pvec->sz;
+}
+
 static inline unsigned pagevec_count(struct pagevec *pvec)
 {
 	return pvec->nr;
@@ -57,7 +69,7 @@ static inline unsigned pagevec_count(struct pagevec *pvec)
 
 static inline unsigned pagevec_space(struct pagevec *pvec)
 {
-	return PAGEVEC_SIZE - pvec->nr;
+	return pvec->sz - pvec->nr;
 }
 
 /*
diff --git a/mm/swap.c b/mm/swap.c
index 2ee3522a7170..d093fb30f038 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -52,6 +52,7 @@ struct lru_rotate {
 };
 static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
 	.lock = INIT_LOCAL_LOCK(lock),
+	.pvec.sz = PAGEVEC_SIZE,
 };
 
 /*
@@ -70,6 +71,13 @@ struct lru_pvecs {
 };
 static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
 	.lock = INIT_LOCAL_LOCK(lock),
+	.lru_add.sz = PAGEVEC_SIZE,
+	.lru_deactivate_file.sz = PAGEVEC_SIZE,
+	.lru_deactivate.sz = PAGEVEC_SIZE,
+	.lru_lazyfree.sz = PAGEVEC_SIZE,
+#ifdef CONFIG_SMP
+	.activate_page.sz = PAGEVEC_SIZE,
+#endif
 };
 
 /*
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH 2/4] pagevec: Increase the size of LRU pagevecs
  2020-11-06 12:30         ` [PATCH 1/4] pagevec: Allow pagevecs to be different sizes Matthew Wilcox (Oracle)
@ 2020-11-06 12:30           ` Matthew Wilcox (Oracle)
  2020-11-06 12:30           ` [PATCH 3/4] pagevec: Add dynamically allocated pagevecs Matthew Wilcox (Oracle)
                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-06 12:30 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm, hch, kent.overstreet; +Cc: Matthew Wilcox (Oracle)
Tim Chen reports that increasing the size of LRU pagevecs is advantageous
with a workload he has:
https://lore.kernel.org/linux-mm/d1cc9f12a8ad6c2a52cb600d93b06b064f2bbc57.1593205965.git.tim.c.chen@linux.intel.com/
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/pagevec.h |  5 +++++
 mm/swap.c               | 27 +++++++++++++++------------
 2 files changed, 20 insertions(+), 12 deletions(-)
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index ee5d3c4da8da..4dc45392d776 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -29,6 +29,11 @@ struct pagevec {
 	};
 };
 
+#define declare_pagevec(name, size) union {				\
+	struct pagevec name;						\
+	void *__p ##name [size + 1];					\
+}
+
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
 void pagevec_remove_exceptionals(struct pagevec *pvec);
diff --git a/mm/swap.c b/mm/swap.c
index d093fb30f038..1e6f50b312ea 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -45,14 +45,17 @@
 /* How many pages do we try to swap or page in/out together? */
 int page_cluster;
 
+#define LRU_PAGEVEC_SIZE 63
+#define lru_pagevec(name)	declare_pagevec(name, LRU_PAGEVEC_SIZE)
+
 /* Protecting only lru_rotate.pvec which requires disabling interrupts */
 struct lru_rotate {
 	local_lock_t lock;
-	struct pagevec pvec;
+	lru_pagevec(pvec);
 };
 static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
 	.lock = INIT_LOCAL_LOCK(lock),
-	.pvec.sz = PAGEVEC_SIZE,
+	.pvec.sz = LRU_PAGEVEC_SIZE,
 };
 
 /*
@@ -61,22 +64,22 @@ static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
  */
 struct lru_pvecs {
 	local_lock_t lock;
-	struct pagevec lru_add;
-	struct pagevec lru_deactivate_file;
-	struct pagevec lru_deactivate;
-	struct pagevec lru_lazyfree;
+	lru_pagevec(lru_add);
+	lru_pagevec(lru_deactivate_file);
+	lru_pagevec(lru_deactivate);
+	lru_pagevec(lru_lazyfree);
 #ifdef CONFIG_SMP
-	struct pagevec activate_page;
+	lru_pagevec(activate_page);
 #endif
 };
 static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
 	.lock = INIT_LOCAL_LOCK(lock),
-	.lru_add.sz = PAGEVEC_SIZE,
-	.lru_deactivate_file.sz = PAGEVEC_SIZE,
-	.lru_deactivate.sz = PAGEVEC_SIZE,
-	.lru_lazyfree.sz = PAGEVEC_SIZE,
+	.lru_add.sz = LRU_PAGEVEC_SIZE,
+	.lru_deactivate_file.sz = LRU_PAGEVEC_SIZE,
+	.lru_deactivate.sz = LRU_PAGEVEC_SIZE,
+	.lru_lazyfree.sz = LRU_PAGEVEC_SIZE,
 #ifdef CONFIG_SMP
-	.activate_page.sz = PAGEVEC_SIZE,
+	.activate_page.sz = LRU_PAGEVEC_SIZE,
 #endif
 };
 
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH 3/4] pagevec: Add dynamically allocated pagevecs
  2020-11-06 12:30         ` [PATCH 1/4] pagevec: Allow pagevecs to be different sizes Matthew Wilcox (Oracle)
  2020-11-06 12:30           ` [PATCH 2/4] pagevec: Increase the size of LRU pagevecs Matthew Wilcox (Oracle)
@ 2020-11-06 12:30           ` Matthew Wilcox (Oracle)
  2020-11-06 12:30           ` [PATCH 4/4] mm/filemap: Use a dynamically allocated pagevec in filemap_read Matthew Wilcox (Oracle)
  2020-11-07 17:08           ` [PATCH 1/4] pagevec: Allow pagevecs to be different sizes Kent Overstreet
  3 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-06 12:30 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm, hch, kent.overstreet; +Cc: Matthew Wilcox (Oracle)
Add pagevec_alloc() and pagevec_free() to allow for pagevecs up to 255
entries in size.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 include/linux/pagevec.h |  7 +++++++
 mm/swap.c               | 28 ++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+)
diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
index 4dc45392d776..4d5a48d7a372 100644
--- a/include/linux/pagevec.h
+++ b/include/linux/pagevec.h
@@ -36,6 +36,13 @@ struct pagevec {
 
 void __pagevec_release(struct pagevec *pvec);
 void __pagevec_lru_add(struct pagevec *pvec);
+struct pagevec *pagevec_alloc(unsigned long sz, gfp_t gfp);
+
+static inline void pagevec_free(struct pagevec *pvec)
+{
+	kfree(pvec);
+}
+
 void pagevec_remove_exceptionals(struct pagevec *pvec);
 
 unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
diff --git a/mm/swap.c b/mm/swap.c
index 1e6f50b312ea..3f856a272cb2 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -988,6 +988,34 @@ void __pagevec_release(struct pagevec *pvec)
 }
 EXPORT_SYMBOL(__pagevec_release);
 
+/**
+ * pagevec_alloc - Allocate a pagevec.
+ * @sz: Number of pages wanted.
+ * @gfp: Memory allocation flags.
+ *
+ * Allocates a new pagevec.  The @sz parameter is advisory; this function
+ * may allocate a pagevec that can contain fewer pages than requested.  If
+ * the caller cares how many were allocated, it can check pagevec_size(),
+ * but most callers will simply use as many as were allocated.
+ *
+ * Return: A new pagevec, or NULL if memory allocation failed.
+ */
+struct pagevec *pagevec_alloc(unsigned long sz, gfp_t gfp)
+{
+	struct pagevec *pvec;
+
+	if (sz > 255)
+		sz = 255;
+	pvec = kmalloc_array(sz + 1, sizeof(void *), gfp);
+	if (!pvec)
+		return NULL;
+	pvec->nr = 0;
+	pvec->sz = sz;
+
+	return pvec;
+}
+EXPORT_SYMBOL(pagevec_alloc);
+
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 /* used by __split_huge_page_refcount() */
 void lru_add_page_tail(struct page *page, struct page *page_tail,
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH 4/4] mm/filemap: Use a dynamically allocated pagevec in filemap_read
  2020-11-06 12:30         ` [PATCH 1/4] pagevec: Allow pagevecs to be different sizes Matthew Wilcox (Oracle)
  2020-11-06 12:30           ` [PATCH 2/4] pagevec: Increase the size of LRU pagevecs Matthew Wilcox (Oracle)
  2020-11-06 12:30           ` [PATCH 3/4] pagevec: Add dynamically allocated pagevecs Matthew Wilcox (Oracle)
@ 2020-11-06 12:30           ` Matthew Wilcox (Oracle)
  2020-11-07 17:08           ` [PATCH 1/4] pagevec: Allow pagevecs to be different sizes Kent Overstreet
  3 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-06 12:30 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm, hch, kent.overstreet; +Cc: Matthew Wilcox (Oracle)
Restore Kent's optimisation for I/O sizes between 64kB and 1MB.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/filemap.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index f41de0759e86..2b4d8ed241bd 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2427,7 +2427,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 	struct file_ra_state *ra = &filp->f_ra;
 	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
-	struct pagevec pvec;
+	struct pagevec pvec_stack, *pvec = NULL;
 	int i, error = 0;
 	bool writably_mapped;
 	loff_t isize, end_offset;
@@ -2436,6 +2436,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		return 0;
 	iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
 
+	if (iter->count / PAGE_SIZE > PAGEVEC_SIZE)
+		pvec = pagevec_alloc(iter->count / PAGE_SIZE + 1, GFP_KERNEL);
+	if (!pvec)
+		pvec = &pvec_stack;
 	do {
 		cond_resched();
 
@@ -2447,7 +2451,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
 			iocb->ki_flags |= IOCB_NOWAIT;
 
-		error = filemap_get_pages(iocb, iter, &pvec);
+		error = filemap_get_pages(iocb, iter, pvec);
 		if (error < 0)
 			break;
 
@@ -2476,10 +2480,10 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 		 */
 		if (iocb->ki_pos >> PAGE_SHIFT !=
 		    ra->prev_pos >> PAGE_SHIFT)
-			mark_page_accessed(pvec.pages[0]);
+			mark_page_accessed(pvec->pages[0]);
 
-		for (i = 0; i < pagevec_count(&pvec); i++) {
-			struct page *page = pvec.pages[i];
+		for (i = 0; i < pagevec_count(pvec); i++) {
+			struct page *page = pvec->pages[i];
 			size_t page_size = thp_size(page);
 			size_t offset = iocb->ki_pos & (page_size - 1);
 			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
@@ -2514,7 +2518,7 @@ ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
 			}
 		}
 put_pages:
-		pagevec_release(&pvec);
+		pagevec_release(pvec);
 	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
 
 	file_accessed(filp);
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * Re: [PATCH 1/4] pagevec: Allow pagevecs to be different sizes
  2020-11-06 12:30         ` [PATCH 1/4] pagevec: Allow pagevecs to be different sizes Matthew Wilcox (Oracle)
                             ` (2 preceding siblings ...)
  2020-11-06 12:30           ` [PATCH 4/4] mm/filemap: Use a dynamically allocated pagevec in filemap_read Matthew Wilcox (Oracle)
@ 2020-11-07 17:08           ` Kent Overstreet
  2020-11-07 17:20             ` Matthew Wilcox
  3 siblings, 1 reply; 41+ messages in thread
From: Kent Overstreet @ 2020-11-07 17:08 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-fsdevel, linux-mm, hch
On Fri, Nov 06, 2020 at 12:30:37PM +0000, Matthew Wilcox (Oracle) wrote:
> Declaring a pagevec continues to create a pagevec which is the same size,
> but functions which manipulate pagevecs no longer rely on this.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> ---
>  include/linux/pagevec.h | 20 ++++++++++++++++----
>  mm/swap.c               |  8 ++++++++
>  2 files changed, 24 insertions(+), 4 deletions(-)
> 
> diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h
> index 875a3f0d9dd2..ee5d3c4da8da 100644
> --- a/include/linux/pagevec.h
> +++ b/include/linux/pagevec.h
> @@ -18,9 +18,15 @@ struct page;
>  struct address_space;
>  
>  struct pagevec {
> -	unsigned char nr;
> -	bool percpu_pvec_drained;
> -	struct page *pages[PAGEVEC_SIZE];
> +	union {
> +		struct {
> +			unsigned char sz;
> +			unsigned char nr;
> +			bool percpu_pvec_drained;
This should probably be removed, it's only used by the swap code and I don't
think it belongs in the generic data structure. That would mean nr and size (and
let's please use more standard naming...) can be u32, not u8s.
> +			struct page *pages[PAGEVEC_SIZE];
> +		};
> +		void *__p[PAGEVEC_SIZE + 1];
What's up with this union?
> +	};
>  };
>  
>  void __pagevec_release(struct pagevec *pvec);
> @@ -41,6 +47,7 @@ static inline unsigned pagevec_lookup_tag(struct pagevec *pvec,
>  
>  static inline void pagevec_init(struct pagevec *pvec)
>  {
> +	pvec->sz = PAGEVEC_SIZE;
>  	pvec->nr = 0;
>  	pvec->percpu_pvec_drained = false;
>  }
> @@ -50,6 +57,11 @@ static inline void pagevec_reinit(struct pagevec *pvec)
>  	pvec->nr = 0;
>  }
>  
> +static inline unsigned pagevec_size(struct pagevec *pvec)
> +{
> +	return pvec->sz;
> +}
> +
>  static inline unsigned pagevec_count(struct pagevec *pvec)
>  {
>  	return pvec->nr;
> @@ -57,7 +69,7 @@ static inline unsigned pagevec_count(struct pagevec *pvec)
>  
>  static inline unsigned pagevec_space(struct pagevec *pvec)
>  {
> -	return PAGEVEC_SIZE - pvec->nr;
> +	return pvec->sz - pvec->nr;
>  }
>  
>  /*
> diff --git a/mm/swap.c b/mm/swap.c
> index 2ee3522a7170..d093fb30f038 100644
> --- a/mm/swap.c
> +++ b/mm/swap.c
> @@ -52,6 +52,7 @@ struct lru_rotate {
>  };
>  static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
>  	.lock = INIT_LOCAL_LOCK(lock),
> +	.pvec.sz = PAGEVEC_SIZE,
>  };
>  
>  /*
> @@ -70,6 +71,13 @@ struct lru_pvecs {
>  };
>  static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
>  	.lock = INIT_LOCAL_LOCK(lock),
> +	.lru_add.sz = PAGEVEC_SIZE,
> +	.lru_deactivate_file.sz = PAGEVEC_SIZE,
> +	.lru_deactivate.sz = PAGEVEC_SIZE,
> +	.lru_lazyfree.sz = PAGEVEC_SIZE,
> +#ifdef CONFIG_SMP
> +	.activate_page.sz = PAGEVEC_SIZE,
> +#endif
>  };
>  
>  /*
> -- 
> 2.28.0
> 
^ permalink raw reply	[flat|nested] 41+ messages in thread
- * Re: [PATCH 1/4] pagevec: Allow pagevecs to be different sizes
  2020-11-07 17:08           ` [PATCH 1/4] pagevec: Allow pagevecs to be different sizes Kent Overstreet
@ 2020-11-07 17:20             ` Matthew Wilcox
  0 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox @ 2020-11-07 17:20 UTC (permalink / raw)
  To: Kent Overstreet; +Cc: linux-fsdevel, linux-mm, hch
On Sat, Nov 07, 2020 at 12:08:13PM -0500, Kent Overstreet wrote:
> On Fri, Nov 06, 2020 at 12:30:37PM +0000, Matthew Wilcox (Oracle) wrote:
> >  struct pagevec {
> > -	unsigned char nr;
> > -	bool percpu_pvec_drained;
> > -	struct page *pages[PAGEVEC_SIZE];
> > +	union {
> > +		struct {
> > +			unsigned char sz;
> > +			unsigned char nr;
> > +			bool percpu_pvec_drained;
> This should probably be removed, it's only used by the swap code and I don't
> think it belongs in the generic data structure. That would mean nr and size (and
> let's please use more standard naming...) can be u32, not u8s.
Nevertheless, that's a very important user of pagevecs!  You and I have no
need for it in the fs code, but it doesn't hurt to leave it here.
I really don't think that increasing the size above 255 is worth anything.
See my earlir analysis of the effect of increasing the batch size.
> > +			struct page *pages[PAGEVEC_SIZE];
> > +		};
> > +		void *__p[PAGEVEC_SIZE + 1];
> 
> What's up with this union?
*blink*.  That was supposed to be 'struct page *pages[];'
And the reason to do it that way is to avoid overly-clever instrumentation
like kmsan from saying "Hey, nr is bigger than 16, this is a buffer
overrun".  Clearly I didn't build a kernel with the various sanitisers
enabled, but I'll do that now.
^ permalink raw reply	[flat|nested] 41+ messages in thread
 
 
 
 
- * Re: [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read
  2020-11-04 21:30   ` Kent Overstreet
  2020-11-04 21:43     ` Amy Parker
  2020-11-05  0:13     ` Matthew Wilcox
@ 2020-11-05  4:52     ` Matthew Wilcox
  2 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox @ 2020-11-05  4:52 UTC (permalink / raw)
  To: Kent Overstreet; +Cc: linux-fsdevel, linux-mm, hch
On Wed, Nov 04, 2020 at 04:30:05PM -0500, Kent Overstreet wrote:
> On Wed, Nov 04, 2020 at 08:42:03PM +0000, Matthew Wilcox (Oracle) wrote:
> > Increasing the batch size runs into diminishing returns.  It's probably
> > better to make, eg, three calls to filemap_get_pages() than it is to
> > call into kmalloc().
> 
> I have to disagree. Working with PAGEVEC_SIZE pages is eventually going to be
> like working with 4k pages today, and have you actually read the slub code for
> the kmalloc fast path? It's _really_ fast, there's no atomic operations and it
> doesn't even have to disable preemption - which is why you never see it showing
> up in profiles ever since we switched to slub.
I've been puzzling over this, and trying to run some benchmarks to figure
it out.  My test VM is too noisy though; the error bars are too large to
get solid data.
There are three reasons why I think we hit diminishing returns:
1. Cost of going into the slab allocator (one alloc, one free).
Maybe that's not as high as I think it is.
2. Let's say the per-page overhead of walking i_pages is 10% of the
CPU time for a 128kB I/O with a batch size of 1.  Increasing the batch
size to 15 means we walk the array 3 times instead of 32 times, or 0.7%
of the CPU time -- total reduction in CPU time of 9.3%.  Increasing the
batch size to 32 means we only walk the array once, which cuts it down
from 10% to 0.3% -- reduction in CPU time of 9.7%.
If we are doing 2MB I/Os (and most applications I've looked at recently
only do 128kB), and the 10% remains constant, then the batch-size-15
case walks the tree 17 times instead of 512 times -- 0.6%, whereas the
batch-size-512 case walks the tree once -- 0.02%.  But that only loks
like an overall savings of 9.98% versus 9.4%.  And is an extra 0.6%
saving worth it?
3. By the time we're doing such large I/Os, we're surely dominated by
memcpy() and not walking the tree.  Even if the file you're working on
is a terabyte in size, the radix tree is only 5 layers deep.  So that's
five pointer dereferences to find the struct page, and they should stay
in cache (maybe they'd fall out to L2, but surely not as far as L3).
And generally radix tree cachelines stay clean so there shouldn't be any
contention on them from other CPUs unless they're dirtying the pages or
writing them back.
^ permalink raw reply	[flat|nested] 41+ messages in thread 
 
- * Re: [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read
  2020-11-04 20:42 ` [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read Matthew Wilcox (Oracle)
  2020-11-04 21:30   ` Kent Overstreet
@ 2020-11-06  8:07   ` Christoph Hellwig
  1 sibling, 0 replies; 41+ messages in thread
From: Christoph Hellwig @ 2020-11-06  8:07 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-fsdevel, linux-mm, hch, kent.overstreet
On Wed, Nov 04, 2020 at 08:42:03PM +0000, Matthew Wilcox (Oracle) wrote:
> Increasing the batch size runs into diminishing returns.  It's probably
> better to make, eg, three calls to filemap_get_pages() than it is to
> call into kmalloc().
Yes, this always looked weird.
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply	[flat|nested] 41+ messages in thread 
 
- * [PATCH v2 03/18] mm/filemap: Convert filemap_get_pages to take a pagevec
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 01/18] mm/filemap: Rename generic_file_buffered_read subfunctions Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 02/18] mm/filemap: Remove dynamically allocated array from filemap_read Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-06  8:08   ` Christoph Hellwig
  2020-11-04 20:42 ` [PATCH v2 04/18] mm/filemap: Use THPs in generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (14 subsequent siblings)
  17 siblings, 1 reply; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
Using a pagevec lets us keep the pages and the number of pages together
which simplifies a lot of the calling conventions.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
---
 mm/filemap.c | 82 ++++++++++++++++++++++++----------------------------
 1 file changed, 38 insertions(+), 44 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index bb1c42d0223c..bd02820601f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2323,22 +2323,22 @@ static struct page *filemap_create_page(struct kiocb *iocb,
 }
 
 static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
-		struct page **pages, unsigned int nr)
+		struct pagevec *pvec)
 {
 	struct file *filp = iocb->ki_filp;
 	struct address_space *mapping = filp->f_mapping;
 	struct file_ra_state *ra = &filp->f_ra;
 	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
 	pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
-	int i, j, nr_got, err = 0;
+	unsigned int nr = min_t(unsigned long, last_index - index, PAGEVEC_SIZE);
+	int i, j, err = 0;
 
-	nr = min_t(unsigned long, last_index - index, nr);
 find_page:
 	if (fatal_signal_pending(current))
 		return -EINTR;
 
-	nr_got = find_get_pages_contig(mapping, index, nr, pages);
-	if (nr_got)
+	pvec->nr = find_get_pages_contig(mapping, index, nr, pvec->pages);
+	if (pvec->nr)
 		goto got_pages;
 
 	if (iocb->ki_flags & IOCB_NOIO)
@@ -2346,17 +2346,17 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 
 	page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
 
-	nr_got = find_get_pages_contig(mapping, index, nr, pages);
-	if (nr_got)
+	pvec->nr = find_get_pages_contig(mapping, index, nr, pvec->pages);
+	if (pvec->nr)
 		goto got_pages;
 
-	pages[0] = filemap_create_page(iocb, iter);
-	err = PTR_ERR_OR_ZERO(pages[0]);
-	if (!IS_ERR_OR_NULL(pages[0]))
-		nr_got = 1;
+	pvec->pages[0] = filemap_create_page(iocb, iter);
+	err = PTR_ERR_OR_ZERO(pvec->pages[0]);
+	if (!IS_ERR_OR_NULL(pvec->pages[0]))
+		pvec->nr = 1;
 got_pages:
-	for (i = 0; i < nr_got; i++) {
-		struct page *page = pages[i];
+	for (i = 0; i < pvec->nr; i++) {
+		struct page *page = pvec->pages[i];
 		pgoff_t pg_index = index + i;
 		loff_t pg_pos = max(iocb->ki_pos,
 				    (loff_t) pg_index << PAGE_SHIFT);
@@ -2364,9 +2364,9 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 
 		if (PageReadahead(page)) {
 			if (iocb->ki_flags & IOCB_NOIO) {
-				for (j = i; j < nr_got; j++)
-					put_page(pages[j]);
-				nr_got = i;
+				for (j = i; j < pvec->nr; j++)
+					put_page(pvec->pages[j]);
+				pvec->nr = i;
 				err = -EAGAIN;
 				break;
 			}
@@ -2377,9 +2377,9 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 		if (!PageUptodate(page)) {
 			if ((iocb->ki_flags & IOCB_NOWAIT) ||
 			    ((iocb->ki_flags & IOCB_WAITQ) && i)) {
-				for (j = i; j < nr_got; j++)
-					put_page(pages[j]);
-				nr_got = i;
+				for (j = i; j < pvec->nr; j++)
+					put_page(pvec->pages[j]);
+				pvec->nr = i;
 				err = -EAGAIN;
 				break;
 			}
@@ -2387,17 +2387,17 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 			page = filemap_update_page(iocb, filp, iter, page,
 					pg_pos, pg_count);
 			if (IS_ERR_OR_NULL(page)) {
-				for (j = i + 1; j < nr_got; j++)
-					put_page(pages[j]);
-				nr_got = i;
+				for (j = i + 1; j < pvec->nr; j++)
+					put_page(pvec->pages[j]);
+				pvec->nr = i;
 				err = PTR_ERR_OR_ZERO(page);
 				break;
 			}
 		}
 	}
 
-	if (likely(nr_got))
-		return nr_got;
+	if (likely(pvec->nr))
+		return 0;
 	if (err)
 		return err;
 	/*
@@ -2429,11 +2429,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 	struct file_ra_state *ra = &filp->f_ra;
 	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
-	struct page *pages[PAGEVEC_SIZE];
-	unsigned int nr_pages = min_t(unsigned int, PAGEVEC_SIZE,
-			((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
-			(iocb->ki_pos >> PAGE_SHIFT));
-	int i, pg_nr, error = 0;
+	struct pagevec pvec;
+	int i, error = 0;
 	bool writably_mapped;
 	loff_t isize, end_offset;
 
@@ -2452,12 +2449,9 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		if ((iocb->ki_flags & IOCB_WAITQ) && written)
 			iocb->ki_flags |= IOCB_NOWAIT;
 
-		i = 0;
-		pg_nr = filemap_get_pages(iocb, iter, pages, nr_pages);
-		if (pg_nr < 0) {
-			error = pg_nr;
+		error = filemap_get_pages(iocb, iter, &pvec);
+		if (error < 0)
 			break;
-		}
 
 		/*
 		 * i_size must be checked after we know the pages are Uptodate.
@@ -2473,9 +2467,9 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
 		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
 
-		while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
+		while ((iocb->ki_pos >> PAGE_SHIFT) + pvec.nr >
 		       (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
-			put_page(pages[--pg_nr]);
+			put_page(pvec.pages[--pvec.nr]);
 
 		/*
 		 * Once we start copying data, we don't want to be touching any
@@ -2489,11 +2483,11 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		 */
 		if (iocb->ki_pos >> PAGE_SHIFT !=
 		    ra->prev_pos >> PAGE_SHIFT)
-			mark_page_accessed(pages[0]);
-		for (i = 1; i < pg_nr; i++)
-			mark_page_accessed(pages[i]);
+			mark_page_accessed(pvec.pages[0]);
+		for (i = 1; i < pagevec_count(&pvec); i++)
+			mark_page_accessed(pvec.pages[i]);
 
-		for (i = 0; i < pg_nr; i++) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
 			unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
 			unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
 						   PAGE_SIZE - offset);
@@ -2505,9 +2499,9 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 			 * before reading the page on the kernel side.
 			 */
 			if (writably_mapped)
-				flush_dcache_page(pages[i]);
+				flush_dcache_page(pvec.pages[i]);
 
-			copied = copy_page_to_iter(pages[i], offset, bytes, iter);
+			copied = copy_page_to_iter(pvec.pages[i], offset, bytes, iter);
 
 			written += copied;
 			iocb->ki_pos += copied;
@@ -2519,8 +2513,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 			}
 		}
 put_pages:
-		for (i = 0; i < pg_nr; i++)
-			put_page(pages[i]);
+		for (i = 0; i < pagevec_count(&pvec); i++)
+			put_page(pvec.pages[i]);
 	} while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
 
 	file_accessed(filp);
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 04/18] mm/filemap: Use THPs in generic_file_buffered_read
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (2 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 03/18] mm/filemap: Convert filemap_get_pages to take a pagevec Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 05/18] mm/filemap: Pass a sleep state to put_and_wait_on_page_locked Matthew Wilcox (Oracle)
                   ` (13 subsequent siblings)
  17 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
Add filemap_get_read_batch() which returns the THPs which represent a
contiguous array of bytes in the file.  It also stops when encountering
a page marked as Readahead or !Uptodate (but does return that page)
so it can be handled appropriately by filemap_get_pages().  That lets us
remove the loop in filemap_get_pages() and check only the last page.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 mm/filemap.c | 121 +++++++++++++++++++++++++++++++++++----------------
 1 file changed, 84 insertions(+), 37 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index bd02820601f8..def9c5513975 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2176,6 +2176,51 @@ static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
 		return lock_page_killable(page);
 }
 
+/*
+ * filemap_get_read_batch - Get a batch of pages for read
+ *
+ * Get a batch of pages which represent a contiguous range of bytes
+ * in the file.  No tail pages will be returned.  If @index is in the
+ * middle of a THP, the entire THP will be returned.  The last page in
+ * the batch may have Readahead set or be not Uptodate so that the
+ * caller can take the appropriate action.
+ */
+static void filemap_get_read_batch(struct address_space *mapping,
+		pgoff_t index, pgoff_t max, struct pagevec *pvec)
+{
+	XA_STATE(xas, &mapping->i_pages, index);
+	struct page *head;
+
+	rcu_read_lock();
+	for (head = xas_load(&xas); head; head = xas_next(&xas)) {
+		if (xas_retry(&xas, head))
+			continue;
+		if (xas.xa_index > max || xa_is_value(head))
+			break;
+		if (!page_cache_get_speculative(head))
+			goto retry;
+
+		/* Has the page moved or been split? */
+		if (unlikely(head != xas_reload(&xas)))
+			goto put_page;
+
+		if (!pagevec_add(pvec, head))
+			break;
+		if (!PageUptodate(head))
+			break;
+		if (PageReadahead(head))
+			break;
+		xas.xa_index = head->index + thp_nr_pages(head) - 1;
+		xas.xa_offset = (xas.xa_index >> xas.xa_shift) & XA_CHUNK_MASK;
+		continue;
+put_page:
+		put_page(head);
+retry:
+		xas_reset(&xas);
+	}
+	rcu_read_unlock();
+}
+
 static struct page *filemap_read_page(struct kiocb *iocb, struct file *filp,
 		struct address_space *mapping, struct page *page)
 {
@@ -2329,15 +2374,16 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 	struct address_space *mapping = filp->f_mapping;
 	struct file_ra_state *ra = &filp->f_ra;
 	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
-	pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
-	unsigned int nr = min_t(unsigned long, last_index - index, PAGEVEC_SIZE);
-	int i, j, err = 0;
+	pgoff_t last_index;
+	int err = 0;
 
+	last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
 find_page:
 	if (fatal_signal_pending(current))
 		return -EINTR;
 
-	pvec->nr = find_get_pages_contig(mapping, index, nr, pvec->pages);
+	pagevec_init(pvec);
+	filemap_get_read_batch(mapping, index, last_index, pvec);
 	if (pvec->nr)
 		goto got_pages;
 
@@ -2346,29 +2392,30 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 
 	page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
 
-	pvec->nr = find_get_pages_contig(mapping, index, nr, pvec->pages);
+	filemap_get_read_batch(mapping, index, last_index, pvec);
 	if (pvec->nr)
 		goto got_pages;
 
 	pvec->pages[0] = filemap_create_page(iocb, iter);
 	err = PTR_ERR_OR_ZERO(pvec->pages[0]);
-	if (!IS_ERR_OR_NULL(pvec->pages[0]))
-		pvec->nr = 1;
+	if (IS_ERR_OR_NULL(pvec->pages[0]))
+		goto err;
+	pvec->nr = 1;
+	return 0;
 got_pages:
-	for (i = 0; i < pvec->nr; i++) {
-		struct page *page = pvec->pages[i];
-		pgoff_t pg_index = index + i;
+	{
+		struct page *page = pvec->pages[pvec->nr - 1];
+		pgoff_t pg_index = page->index;
 		loff_t pg_pos = max(iocb->ki_pos,
 				    (loff_t) pg_index << PAGE_SHIFT);
 		loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;
 
 		if (PageReadahead(page)) {
 			if (iocb->ki_flags & IOCB_NOIO) {
-				for (j = i; j < pvec->nr; j++)
-					put_page(pvec->pages[j]);
-				pvec->nr = i;
+				put_page(page);
+				pvec->nr--;
 				err = -EAGAIN;
-				break;
+				goto err;
 			}
 			page_cache_async_readahead(mapping, ra, filp, page,
 					pg_index, last_index - pg_index);
@@ -2376,26 +2423,23 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 
 		if (!PageUptodate(page)) {
 			if ((iocb->ki_flags & IOCB_NOWAIT) ||
-			    ((iocb->ki_flags & IOCB_WAITQ) && i)) {
-				for (j = i; j < pvec->nr; j++)
-					put_page(pvec->pages[j]);
-				pvec->nr = i;
+			    ((iocb->ki_flags & IOCB_WAITQ) && pvec->nr > 1)) {
+				put_page(page);
+				pvec->nr--;
 				err = -EAGAIN;
-				break;
+				goto err;
 			}
 
 			page = filemap_update_page(iocb, filp, iter, page,
 					pg_pos, pg_count);
 			if (IS_ERR_OR_NULL(page)) {
-				for (j = i + 1; j < pvec->nr; j++)
-					put_page(pvec->pages[j]);
-				pvec->nr = i;
+				pvec->nr--;
 				err = PTR_ERR_OR_ZERO(page);
-				break;
 			}
 		}
 	}
 
+err:
 	if (likely(pvec->nr))
 		return 0;
 	if (err)
@@ -2464,13 +2508,8 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		isize = i_size_read(inode);
 		if (unlikely(iocb->ki_pos >= isize))
 			goto put_pages;
-
 		end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
 
-		while ((iocb->ki_pos >> PAGE_SHIFT) + pvec.nr >
-		       (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
-			put_page(pvec.pages[--pvec.nr]);
-
 		/*
 		 * Once we start copying data, we don't want to be touching any
 		 * cachelines that might be contended:
@@ -2484,24 +2523,32 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		if (iocb->ki_pos >> PAGE_SHIFT !=
 		    ra->prev_pos >> PAGE_SHIFT)
 			mark_page_accessed(pvec.pages[0]);
-		for (i = 1; i < pagevec_count(&pvec); i++)
-			mark_page_accessed(pvec.pages[i]);
 
 		for (i = 0; i < pagevec_count(&pvec); i++) {
-			unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
-			unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
-						   PAGE_SIZE - offset);
-			unsigned int copied;
+			struct page *page = pvec.pages[i];
+			size_t page_size = thp_size(page);
+			size_t offset = iocb->ki_pos & (page_size - 1);
+			size_t bytes = min_t(loff_t, end_offset - iocb->ki_pos,
+					     page_size - offset);
+			size_t copied;
 
+			if (end_offset < page_offset(page))
+				break;
+			if (i > 0)
+				mark_page_accessed(page);
 			/*
 			 * If users can be writing to this page using arbitrary
 			 * virtual addresses, take care about potential aliasing
 			 * before reading the page on the kernel side.
 			 */
-			if (writably_mapped)
-				flush_dcache_page(pvec.pages[i]);
+			if (writably_mapped) {
+				int j;
+
+				for (j = 0; j < thp_nr_pages(page); j++)
+					flush_dcache_page(page + j);
+			}
 
-			copied = copy_page_to_iter(pvec.pages[i], offset, bytes, iter);
+			copied = copy_page_to_iter(page, offset, bytes, iter);
 
 			written += copied;
 			iocb->ki_pos += copied;
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 05/18] mm/filemap: Pass a sleep state to put_and_wait_on_page_locked
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (3 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 04/18] mm/filemap: Use THPs in generic_file_buffered_read Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 06/18] mm/filemap: Support readpage splitting a page Matthew Wilcox (Oracle)
                   ` (12 subsequent siblings)
  17 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
This is prep work for the next patch, but I think at least one of the
current callers would prefer a killable sleep to an uninterruptible one.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 include/linux/pagemap.h | 3 +--
 mm/filemap.c            | 7 +++++--
 mm/huge_memory.c        | 4 ++--
 mm/migrate.c            | 4 ++--
 4 files changed, 10 insertions(+), 8 deletions(-)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 00288ed24698..71b36b275e4d 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -681,8 +681,7 @@ static inline int wait_on_page_locked_killable(struct page *page)
 	return wait_on_page_bit_killable(compound_head(page), PG_locked);
 }
 
-extern void put_and_wait_on_page_locked(struct page *page);
-
+int put_and_wait_on_page_locked(struct page *page, int state);
 void wait_on_page_writeback(struct page *page);
 extern void end_page_writeback(struct page *page);
 void wait_for_stable_page(struct page *page);
diff --git a/mm/filemap.c b/mm/filemap.c
index def9c5513975..bc35de079dd0 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1358,20 +1358,23 @@ static int wait_on_page_locked_async(struct page *page,
 /**
  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
  * @page: The page to wait for.
+ * @state: The sleep state (TASK_KILLABLE, TASK_UNINTERRUPTIBLE, etc).
  *
  * The caller should hold a reference on @page.  They expect the page to
  * become unlocked relatively soon, but do not wish to hold up migration
  * (for example) by holding the reference while waiting for the page to
  * come unlocked.  After this function returns, the caller should not
  * dereference @page.
+ *
+ * Return: 0 if the page was unlocked or -EINTR if interrupted by a signal.
  */
-void put_and_wait_on_page_locked(struct page *page)
+int put_and_wait_on_page_locked(struct page *page, int state)
 {
 	wait_queue_head_t *q;
 
 	page = compound_head(page);
 	q = page_waitqueue(page);
-	wait_on_page_bit_common(q, page, PG_locked, TASK_UNINTERRUPTIBLE, DROP);
+	return wait_on_page_bit_common(q, page, PG_locked, state, DROP);
 }
 
 /**
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 616102ba3682..ac114d265950 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1432,7 +1432,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 		if (!get_page_unless_zero(page))
 			goto out_unlock;
 		spin_unlock(vmf->ptl);
-		put_and_wait_on_page_locked(page);
+		put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 		goto out;
 	}
 
@@ -1468,7 +1468,7 @@ vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd)
 		if (!get_page_unless_zero(page))
 			goto out_unlock;
 		spin_unlock(vmf->ptl);
-		put_and_wait_on_page_locked(page);
+		put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 		goto out;
 	}
 
diff --git a/mm/migrate.c b/mm/migrate.c
index 39663dfbc273..a50bbb0e029b 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -335,7 +335,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
 	if (!get_page_unless_zero(page))
 		goto out;
 	pte_unmap_unlock(ptep, ptl);
-	put_and_wait_on_page_locked(page);
+	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 	return;
 out:
 	pte_unmap_unlock(ptep, ptl);
@@ -369,7 +369,7 @@ void pmd_migration_entry_wait(struct mm_struct *mm, pmd_t *pmd)
 	if (!get_page_unless_zero(page))
 		goto unlock;
 	spin_unlock(ptl);
-	put_and_wait_on_page_locked(page);
+	put_and_wait_on_page_locked(page, TASK_UNINTERRUPTIBLE);
 	return;
 unlock:
 	spin_unlock(ptl);
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 06/18] mm/filemap: Support readpage splitting a page
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (4 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 05/18] mm/filemap: Pass a sleep state to put_and_wait_on_page_locked Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 07/18] mm/filemap: Inline __wait_on_page_locked_async into caller Matthew Wilcox (Oracle)
                   ` (11 subsequent siblings)
  17 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
For page splitting to succeed, the thread asking to split the
page has to be the only one with a reference to the page.  Calling
wait_on_page_locked() while holding a reference to the page will
effectively prevent this from happening with sufficient threads waiting
on the same page.  Use put_and_wait_on_page_locked() to sleep without
holding a reference to the page, then retry the page lookup after the
page is unlocked.
Since we now get the page lock a little earlier in filemap_update_page(),
we can eliminate a number of duplicate checks.  The original intent
(commit ebded02788b5 ("avoid unnecessary calls to lock_page when waiting
for IO to complete during a read")) behind getting the page lock later
was to avoid re-locking the page after it has been brought uptodate by
another thread.  We still avoid that because we go through the normal
lookup path again after the winning thread has brought the page uptodate.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 mm/filemap.c | 76 ++++++++++++++++------------------------------------
 1 file changed, 23 insertions(+), 53 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index bc35de079dd0..02fe068c434b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1347,14 +1347,6 @@ static int __wait_on_page_locked_async(struct page *page,
 	return ret;
 }
 
-static int wait_on_page_locked_async(struct page *page,
-				     struct wait_page_queue *wait)
-{
-	if (!PageLocked(page))
-		return 0;
-	return __wait_on_page_locked_async(compound_head(page), wait, false);
-}
-
 /**
  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
  * @page: The page to wait for.
@@ -2284,64 +2276,42 @@ static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp,
 	struct inode *inode = mapping->host;
 	int error;
 
-	/*
-	 * See comment in do_read_cache_page on why
-	 * wait_on_page_locked is used to avoid unnecessarily
-	 * serialisations and why it's safe.
-	 */
 	if (iocb->ki_flags & IOCB_WAITQ) {
-		error = wait_on_page_locked_async(page,
-						iocb->ki_waitq);
+		error = lock_page_async(page, iocb->ki_waitq);
+		if (error) {
+			put_page(page);
+			return ERR_PTR(error);
+		}
 	} else {
-		error = wait_on_page_locked_killable(page);
-	}
-	if (unlikely(error)) {
-		put_page(page);
-		return ERR_PTR(error);
+		if (!trylock_page(page)) {
+			put_and_wait_on_page_locked(page, TASK_KILLABLE);
+			return NULL;
+		}
 	}
-	if (PageUptodate(page))
-		return page;
 
+	if (!page->mapping)
+		goto truncated;
+	if (PageUptodate(page))
+		goto uptodate;
 	if (inode->i_blkbits == PAGE_SHIFT ||
 			!mapping->a_ops->is_partially_uptodate)
-		goto page_not_up_to_date;
+		goto readpage;
 	/* pipes can't handle partially uptodate pages */
 	if (unlikely(iov_iter_is_pipe(iter)))
-		goto page_not_up_to_date;
-	if (!trylock_page(page))
-		goto page_not_up_to_date;
-	/* Did it get truncated before we got the lock? */
-	if (!page->mapping)
-		goto page_not_up_to_date_locked;
+		goto readpage;
 	if (!mapping->a_ops->is_partially_uptodate(page,
-				pos & ~PAGE_MASK, count))
-		goto page_not_up_to_date_locked;
+				pos & (thp_size(page) - 1), count))
+		goto readpage;
+uptodate:
 	unlock_page(page);
 	return page;
 
-page_not_up_to_date:
-	/* Get exclusive access to the page ... */
-	error = lock_page_for_iocb(iocb, page);
-	if (unlikely(error)) {
-		put_page(page);
-		return ERR_PTR(error);
-	}
-
-page_not_up_to_date_locked:
-	/* Did it get truncated before we got the lock? */
-	if (!page->mapping) {
-		unlock_page(page);
-		put_page(page);
-		return NULL;
-	}
-
-	/* Did somebody else fill it already? */
-	if (PageUptodate(page)) {
-		unlock_page(page);
-		return page;
-	}
-
+readpage:
 	return filemap_read_page(iocb, filp, mapping, page);
+truncated:
+	unlock_page(page);
+	put_page(page);
+	return NULL;
 }
 
 static struct page *filemap_create_page(struct kiocb *iocb,
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 07/18] mm/filemap: Inline __wait_on_page_locked_async into caller
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (5 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 06/18] mm/filemap: Support readpage splitting a page Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 08/18] mm/filemap: Don't call ->readpage if IOCB_WAITQ is set Matthew Wilcox (Oracle)
                   ` (10 subsequent siblings)
  17 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
The previous patch removed wait_on_page_locked_async(), so inline
__wait_on_page_locked_async into __lock_page_async().
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 mm/filemap.c | 53 ++++++++++++++++++++++------------------------------
 1 file changed, 22 insertions(+), 31 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 02fe068c434b..6ad067fe5d47 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1317,36 +1317,6 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
 }
 EXPORT_SYMBOL(wait_on_page_bit_killable);
 
-static int __wait_on_page_locked_async(struct page *page,
-				       struct wait_page_queue *wait, bool set)
-{
-	struct wait_queue_head *q = page_waitqueue(page);
-	int ret = 0;
-
-	wait->page = page;
-	wait->bit_nr = PG_locked;
-
-	spin_lock_irq(&q->lock);
-	__add_wait_queue_entry_tail(q, &wait->wait);
-	SetPageWaiters(page);
-	if (set)
-		ret = !trylock_page(page);
-	else
-		ret = PageLocked(page);
-	/*
-	 * If we were succesful now, we know we're still on the
-	 * waitqueue as we're still under the lock. This means it's
-	 * safe to remove and return success, we know the callback
-	 * isn't going to trigger.
-	 */
-	if (!ret)
-		__remove_wait_queue(q, &wait->wait);
-	else
-		ret = -EIOCBQUEUED;
-	spin_unlock_irq(&q->lock);
-	return ret;
-}
-
 /**
  * put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
  * @page: The page to wait for.
@@ -1514,7 +1484,28 @@ EXPORT_SYMBOL_GPL(__lock_page_killable);
 
 int __lock_page_async(struct page *page, struct wait_page_queue *wait)
 {
-	return __wait_on_page_locked_async(page, wait, true);
+	struct wait_queue_head *q = page_waitqueue(page);
+	int ret = 0;
+
+	wait->page = page;
+	wait->bit_nr = PG_locked;
+
+	spin_lock_irq(&q->lock);
+	__add_wait_queue_entry_tail(q, &wait->wait);
+	SetPageWaiters(page);
+	ret = !trylock_page(page);
+	/*
+	 * If we were succesful now, we know we're still on the
+	 * waitqueue as we're still under the lock. This means it's
+	 * safe to remove and return success, we know the callback
+	 * isn't going to trigger.
+	 */
+	if (!ret)
+		__remove_wait_queue(q, &wait->wait);
+	else
+		ret = -EIOCBQUEUED;
+	spin_unlock_irq(&q->lock);
+	return ret;
 }
 
 /*
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 08/18] mm/filemap: Don't call ->readpage if IOCB_WAITQ is set
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (6 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 07/18] mm/filemap: Inline __wait_on_page_locked_async into caller Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 09/18] mm/filemap: Change filemap_read_page calling conventions Matthew Wilcox (Oracle)
                   ` (9 subsequent siblings)
  17 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
The readpage operation can block in many (most?) filesystems, so we
should punt to a work queue instead of calling it.  This was the last
caller of lock_page_async(), so remove it.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 mm/filemap.c | 14 ++------------
 1 file changed, 2 insertions(+), 12 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 6ad067fe5d47..6699581bd4f5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2152,16 +2152,6 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra)
 	ra->ra_pages /= 4;
 }
 
-static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
-{
-	if (iocb->ki_flags & IOCB_WAITQ)
-		return lock_page_async(page, iocb->ki_waitq);
-	else if (iocb->ki_flags & IOCB_NOWAIT)
-		return trylock_page(page) ? 0 : -EAGAIN;
-	else
-		return lock_page_killable(page);
-}
-
 /*
  * filemap_get_read_batch - Get a batch of pages for read
  *
@@ -2213,7 +2203,7 @@ static struct page *filemap_read_page(struct kiocb *iocb, struct file *filp,
 	struct file_ra_state *ra = &filp->f_ra;
 	int error;
 
-	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
+	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) {
 		unlock_page(page);
 		put_page(page);
 		return ERR_PTR(-EAGAIN);
@@ -2234,7 +2224,7 @@ static struct page *filemap_read_page(struct kiocb *iocb, struct file *filp,
 	}
 
 	if (!PageUptodate(page)) {
-		error = lock_page_for_iocb(iocb, page);
+		error = lock_page_killable(page);
 		if (unlikely(error)) {
 			put_page(page);
 			return ERR_PTR(error);
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 09/18] mm/filemap: Change filemap_read_page calling conventions
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (7 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 08/18] mm/filemap: Don't call ->readpage if IOCB_WAITQ is set Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-06  8:11   ` Christoph Hellwig
  2020-11-04 20:42 ` [PATCH v2 10/18] mm/filemap: Change filemap_create_page " Matthew Wilcox (Oracle)
                   ` (8 subsequent siblings)
  17 siblings, 1 reply; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
Make this function more generic by passing the file instead of the iocb.
Check in the callers whether we should call readpage or not.  Also make
it return an errno / 0 / AOP_TRUNCATED_PAGE, and make calling put_page()
the caller's responsibility.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 mm/filemap.c | 89 +++++++++++++++++++++++++---------------------------
 1 file changed, 42 insertions(+), 47 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 6699581bd4f5..8c6444a6296c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2197,56 +2197,38 @@ static void filemap_get_read_batch(struct address_space *mapping,
 	rcu_read_unlock();
 }
 
-static struct page *filemap_read_page(struct kiocb *iocb, struct file *filp,
-		struct address_space *mapping, struct page *page)
+static int filemap_read_page(struct file *file, struct address_space *mapping,
+		struct page *page)
 {
-	struct file_ra_state *ra = &filp->f_ra;
 	int error;
 
-	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) {
-		unlock_page(page);
-		put_page(page);
-		return ERR_PTR(-EAGAIN);
-	}
-
 	/*
-	 * A previous I/O error may have been due to temporary
-	 * failures, eg. multipath errors.
-	 * PG_error will be set again if readpage fails.
+	 * A previous I/O error may have been due to temporary failures,
+	 * eg. multipath errors.  PG_error will be set again if readpage
+	 * fails.
 	 */
 	ClearPageError(page);
 	/* Start the actual read. The read will unlock the page. */
-	error = mapping->a_ops->readpage(filp, page);
-
-	if (unlikely(error)) {
-		put_page(page);
-		return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
-	}
+	error = mapping->a_ops->readpage(file, page);
+	if (error)
+		return error;
+	if (PageUptodate(page))
+		return 0;
 
+	error = lock_page_killable(page);
+	if (error)
+		return error;
 	if (!PageUptodate(page)) {
-		error = lock_page_killable(page);
-		if (unlikely(error)) {
-			put_page(page);
-			return ERR_PTR(error);
-		}
-		if (!PageUptodate(page)) {
-			if (page->mapping == NULL) {
-				/*
-				 * invalidate_mapping_pages got it
-				 */
-				unlock_page(page);
-				put_page(page);
-				return NULL;
-			}
-			unlock_page(page);
-			shrink_readahead_size_eio(ra);
-			put_page(page);
-			return ERR_PTR(-EIO);
+		if (page->mapping == NULL) {
+			/* page truncated */
+			error = AOP_TRUNCATED_PAGE;
+		} else {
+			shrink_readahead_size_eio(&file->f_ra);
+			error = -EIO;
 		}
-		unlock_page(page);
 	}
-
-	return page;
+	unlock_page(page);
+	return error;
 }
 
 static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp,
@@ -2288,7 +2270,18 @@ static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp,
 	return page;
 
 readpage:
-	return filemap_read_page(iocb, filp, mapping, page);
+	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) {
+		unlock_page(page);
+		put_page(page);
+		return ERR_PTR(-EAGAIN);
+	}
+	error = filemap_read_page(iocb->ki_filp, mapping, page);
+	if (!error)
+		return page;
+	put_page(page);
+	if (error == AOP_TRUNCATED_PAGE)
+		return NULL;
+	return ERR_PTR(error);
 truncated:
 	unlock_page(page);
 	put_page(page);
@@ -2304,7 +2297,7 @@ static struct page *filemap_create_page(struct kiocb *iocb,
 	struct page *page;
 	int error;
 
-	if (iocb->ki_flags & IOCB_NOIO)
+	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
 		return ERR_PTR(-EAGAIN);
 
 	page = page_cache_alloc(mapping);
@@ -2313,12 +2306,14 @@ static struct page *filemap_create_page(struct kiocb *iocb,
 
 	error = add_to_page_cache_lru(page, mapping, index,
 				      mapping_gfp_constraint(mapping, GFP_KERNEL));
-	if (error) {
-		put_page(page);
-		return error != -EEXIST ? ERR_PTR(error) : NULL;
-	}
-
-	return filemap_read_page(iocb, filp, mapping, page);
+	if (!error)
+		error = filemap_read_page(iocb->ki_filp, mapping, page);
+	if (!error)
+		return page;
+	put_page(page);
+	if (error == -EEXIST || error == AOP_TRUNCATED_PAGE)
+		return NULL;
+	return ERR_PTR(error);
 }
 
 static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * Re: [PATCH v2 09/18] mm/filemap: Change filemap_read_page calling conventions
  2020-11-04 20:42 ` [PATCH v2 09/18] mm/filemap: Change filemap_read_page calling conventions Matthew Wilcox (Oracle)
@ 2020-11-06  8:11   ` Christoph Hellwig
  0 siblings, 0 replies; 41+ messages in thread
From: Christoph Hellwig @ 2020-11-06  8:11 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-fsdevel, linux-mm, hch, kent.overstreet
On Wed, Nov 04, 2020 at 08:42:10PM +0000, Matthew Wilcox (Oracle) wrote:
> @@ -2288,7 +2270,18 @@ static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp,
>  	return page;
>  
>  readpage:
> +	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) {
> +		unlock_page(page);
> +		put_page(page);
> +		return ERR_PTR(-EAGAIN);
> +	}
> +	error = filemap_read_page(iocb->ki_filp, mapping, page);
> +	if (!error)
I still find the eraly return here weird..
Otherwise looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply	[flat|nested] 41+ messages in thread
 
- * [PATCH v2 10/18] mm/filemap: Change filemap_create_page calling conventions
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (8 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 09/18] mm/filemap: Change filemap_read_page calling conventions Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 11/18] mm/filemap: Convert filemap_update_page to return an errno Matthew Wilcox (Oracle)
                   ` (7 subsequent siblings)
  17 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
By moving the iocb flag checks to the caller, we can pass the
file and the page index instead of the iocb.  It never needed the iter.
By passing the pagevec, we can return an errno (or AOP_TRUNCATED_PAGE)
instead of an ERR_PTR.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 mm/filemap.c | 53 ++++++++++++++++++++++++++--------------------------
 1 file changed, 27 insertions(+), 26 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 8c6444a6296c..25945fefdd39 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2288,32 +2288,33 @@ static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp,
 	return NULL;
 }
 
-static struct page *filemap_create_page(struct kiocb *iocb,
-		struct iov_iter *iter)
+static int filemap_create_page(struct file *file,
+		struct address_space *mapping, pgoff_t index,
+		struct pagevec *pvec)
 {
-	struct file *filp = iocb->ki_filp;
-	struct address_space *mapping = filp->f_mapping;
-	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
 	struct page *page;
 	int error;
 
-	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
-		return ERR_PTR(-EAGAIN);
-
 	page = page_cache_alloc(mapping);
 	if (!page)
-		return ERR_PTR(-ENOMEM);
+		return -ENOMEM;
 
 	error = add_to_page_cache_lru(page, mapping, index,
-				      mapping_gfp_constraint(mapping, GFP_KERNEL));
-	if (!error)
-		error = filemap_read_page(iocb->ki_filp, mapping, page);
-	if (!error)
-		return page;
+			mapping_gfp_constraint(mapping, GFP_KERNEL));
+	if (error == -EEXIST)
+		error = AOP_TRUNCATED_PAGE;
+	if (error)
+		goto error;
+
+	error = filemap_read_page(file, mapping, page);
+	if (error)
+		goto error;
+
+	pagevec_add(pvec, page);
+	return 0;
+error:
 	put_page(page);
-	if (error == -EEXIST || error == AOP_TRUNCATED_PAGE)
-		return NULL;
-	return ERR_PTR(error);
+	return error;
 }
 
 static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
@@ -2342,15 +2343,15 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 	page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
 
 	filemap_get_read_batch(mapping, index, last_index, pvec);
-	if (pvec->nr)
-		goto got_pages;
-
-	pvec->pages[0] = filemap_create_page(iocb, iter);
-	err = PTR_ERR_OR_ZERO(pvec->pages[0]);
-	if (IS_ERR_OR_NULL(pvec->pages[0]))
-		goto err;
-	pvec->nr = 1;
-	return 0;
+	if (!pagevec_count(pvec)) {
+		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
+			return -EAGAIN;
+		err = filemap_create_page(filp, mapping,
+				iocb->ki_pos >> PAGE_SHIFT, pvec);
+		if (err == AOP_TRUNCATED_PAGE)
+			goto find_page;
+		return err;
+	}
 got_pages:
 	{
 		struct page *page = pvec->pages[pvec->nr - 1];
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 11/18] mm/filemap: Convert filemap_update_page to return an errno
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (9 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 10/18] mm/filemap: Change filemap_create_page " Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-06  8:14   ` Christoph Hellwig
  2020-11-04 20:42 ` [PATCH v2 12/18] mm/filemap: Move the iocb checks into filemap_update_page Matthew Wilcox (Oracle)
                   ` (6 subsequent siblings)
  17 siblings, 1 reply; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
Use AOP_TRUNCATED_PAGE to indicate that no error occurred, but the
page we looked up is no longer valid.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 mm/filemap.c | 42 +++++++++++++++++++-----------------------
 1 file changed, 19 insertions(+), 23 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 25945fefdd39..93c054f51677 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2231,24 +2231,21 @@ static int filemap_read_page(struct file *file, struct address_space *mapping,
 	return error;
 }
 
-static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp,
-		struct iov_iter *iter, struct page *page, loff_t pos,
-		loff_t count)
+static int filemap_update_page(struct kiocb *iocb,
+		struct address_space *mapping, struct iov_iter *iter,
+		struct page *page, loff_t pos, loff_t count)
 {
-	struct address_space *mapping = filp->f_mapping;
 	struct inode *inode = mapping->host;
 	int error;
 
 	if (iocb->ki_flags & IOCB_WAITQ) {
 		error = lock_page_async(page, iocb->ki_waitq);
-		if (error) {
-			put_page(page);
-			return ERR_PTR(error);
-		}
+		if (error)
+			goto error;
 	} else {
 		if (!trylock_page(page)) {
 			put_and_wait_on_page_locked(page, TASK_KILLABLE);
-			return NULL;
+			return AOP_TRUNCATED_PAGE;
 		}
 	}
 
@@ -2267,25 +2264,24 @@ static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp,
 		goto readpage;
 uptodate:
 	unlock_page(page);
-	return page;
+	return 0;
 
 readpage:
 	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) {
 		unlock_page(page);
-		put_page(page);
-		return ERR_PTR(-EAGAIN);
+		error = -EAGAIN;
+	} else {
+		error = filemap_read_page(iocb->ki_filp, mapping, page);
+		if (!error)
+			return 0;
 	}
-	error = filemap_read_page(iocb->ki_filp, mapping, page);
-	if (!error)
-		return page;
+error:
 	put_page(page);
-	if (error == AOP_TRUNCATED_PAGE)
-		return NULL;
-	return ERR_PTR(error);
+	return error;
 truncated:
 	unlock_page(page);
 	put_page(page);
-	return NULL;
+	return AOP_TRUNCATED_PAGE;
 }
 
 static int filemap_create_page(struct file *file,
@@ -2380,18 +2376,18 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 				goto err;
 			}
 
-			page = filemap_update_page(iocb, filp, iter, page,
+			err = filemap_update_page(iocb, mapping, iter, page,
 					pg_pos, pg_count);
-			if (IS_ERR_OR_NULL(page)) {
+			if (err)
 				pvec->nr--;
-				err = PTR_ERR_OR_ZERO(page);
-			}
 		}
 	}
 
 err:
 	if (likely(pvec->nr))
 		return 0;
+	if (err == AOP_TRUNCATED_PAGE)
+		goto find_page;
 	if (err)
 		return err;
 	/*
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * Re: [PATCH v2 11/18] mm/filemap: Convert filemap_update_page to return an errno
  2020-11-04 20:42 ` [PATCH v2 11/18] mm/filemap: Convert filemap_update_page to return an errno Matthew Wilcox (Oracle)
@ 2020-11-06  8:14   ` Christoph Hellwig
  2020-11-06  8:37     ` Christoph Hellwig
  0 siblings, 1 reply; 41+ messages in thread
From: Christoph Hellwig @ 2020-11-06  8:14 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-fsdevel, linux-mm, hch, kent.overstreet
On Wed, Nov 04, 2020 at 08:42:12PM +0000, Matthew Wilcox (Oracle) wrote:
> Use AOP_TRUNCATED_PAGE to indicate that no error occurred, but the
> page we looked up is no longer valid.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
> ---
>  mm/filemap.c | 42 +++++++++++++++++++-----------------------
>  1 file changed, 19 insertions(+), 23 deletions(-)
> 
> diff --git a/mm/filemap.c b/mm/filemap.c
> index 25945fefdd39..93c054f51677 100644
> --- a/mm/filemap.c
> +++ b/mm/filemap.c
> @@ -2231,24 +2231,21 @@ static int filemap_read_page(struct file *file, struct address_space *mapping,
>  	return error;
>  }
>  
> +static int filemap_update_page(struct kiocb *iocb,
> +		struct address_space *mapping, struct iov_iter *iter,
> +		struct page *page, loff_t pos, loff_t count)
>  {
>  	struct inode *inode = mapping->host;
>  	int error;
>  
>  	if (iocb->ki_flags & IOCB_WAITQ) {
>  		error = lock_page_async(page, iocb->ki_waitq);
> +		if (error)
> +			goto error;
>  	} else {
>  		if (!trylock_page(page)) {
>  			put_and_wait_on_page_locked(page, TASK_KILLABLE);
> +			return AOP_TRUNCATED_PAGE;
>  		}
>  	}
>  
> @@ -2267,25 +2264,24 @@ static struct page *filemap_update_page(struct kiocb *iocb, struct file *filp,
>  		goto readpage;
>  uptodate:
>  	unlock_page(page);
> +	return 0;
>  
>  readpage:
>  	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) {
>  		unlock_page(page);
> +		error = -EAGAIN;
> +	} else {
> +		error = filemap_read_page(iocb->ki_filp, mapping, page);
> +		if (!error)
> +			return 0;
>  	}
> +error:
>  	put_page(page);
> +	return error;
>  truncated:
>  	unlock_page(page);
>  	put_page(page);
> +	return AOP_TRUNCATED_PAGE;
We could still consolidate the page unlocking by having another label.
Or even better move the put_page into the caller like I did in my
series, which would conceputally fit in pretty nicely here:
> +			err = filemap_update_page(iocb, mapping, iter, page,
>  					pg_pos, pg_count);
> +			if (err)
>  				pvec->nr--;
But otherwise this looks good:
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply	[flat|nested] 41+ messages in thread
- * Re: [PATCH v2 11/18] mm/filemap: Convert filemap_update_page to return an errno
  2020-11-06  8:14   ` Christoph Hellwig
@ 2020-11-06  8:37     ` Christoph Hellwig
  2020-11-09 13:29       ` Matthew Wilcox
  0 siblings, 1 reply; 41+ messages in thread
From: Christoph Hellwig @ 2020-11-06  8:37 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-fsdevel, linux-mm, hch, kent.overstreet
On Fri, Nov 06, 2020 at 09:14:20AM +0100, Christoph Hellwig wrote:
> We could still consolidate the page unlocking by having another label.
> Or even better move the put_page into the caller like I did in my
> series, which would conceputally fit in pretty nicely here:
FYI, this is what we should do for putting the page (on top of your
whole series), which also catches the leak for the readahead NOIO case:
diff --git a/mm/filemap.c b/mm/filemap.c
index 500e9fd4232cf9..dacee60b92d3d9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2268,40 +2268,35 @@ static int filemap_update_page(struct kiocb *iocb,
 		struct address_space *mapping, struct iov_iter *iter,
 		struct page *page)
 {
-	int error = -EAGAIN;
+	int error = 0;
 
 	if (!trylock_page(page)) {
 		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
-			goto error;
+			return -EAGAIN;
 		if (!(iocb->ki_flags & IOCB_WAITQ)) {
 			put_and_wait_on_page_locked(page, TASK_KILLABLE);
 			return AOP_TRUNCATED_PAGE;
 		}
 		error = __lock_page_async(page, iocb->ki_waitq);
 		if (error)
-			goto error;
+			return error;
 	}
 
-	error = AOP_TRUNCATED_PAGE;
-	if (!page->mapping)
-		goto unlock;
-	if (filemap_range_uptodate(iocb, mapping, iter, page)) {
+	if (!page->mapping) {
 		unlock_page(page);
-		return 0;
+		put_page(page);
+		return AOP_TRUNCATED_PAGE;
 	}
 
-	error = -EAGAIN;
-	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
-		goto unlock;
+	if (!filemap_range_uptodate(iocb, mapping, iter, page)) {
+		error = -EAGAIN;
+		if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
+			goto unlock;
+		return filemap_read_page(iocb->ki_filp, mapping, page);
+	}
 
-	error = filemap_read_page(iocb->ki_filp, mapping, page);
-	if (error)
-		goto error;
-	return 0;
 unlock:
 	unlock_page(page);
-error:
-	put_page(page);
 	return error;
 }
 
@@ -2396,8 +2391,9 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 
 	return 0;
 err:
-	pvec->nr--;
-	if (likely(pvec->nr))
+	if (err < 0)
+		put_page(page);
+	if (likely(--pvec->nr))
 		return 0;
 	if (err == AOP_TRUNCATED_PAGE)
 		goto retry;
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * Re: [PATCH v2 11/18] mm/filemap: Convert filemap_update_page to return an errno
  2020-11-06  8:37     ` Christoph Hellwig
@ 2020-11-09 13:29       ` Matthew Wilcox
  0 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox @ 2020-11-09 13:29 UTC (permalink / raw)
  To: Christoph Hellwig; +Cc: linux-fsdevel, linux-mm, kent.overstreet
On Fri, Nov 06, 2020 at 09:37:46AM +0100, Christoph Hellwig wrote:
>  
> -	error = AOP_TRUNCATED_PAGE;
> -	if (!page->mapping)
> -		goto unlock;
> -	if (filemap_range_uptodate(iocb, mapping, iter, page)) {
> +	if (!page->mapping) {
>  		unlock_page(page);
> -		return 0;
> +		put_page(page);
> +		return AOP_TRUNCATED_PAGE;
>  	}
>  
> -	error = -EAGAIN;
> -	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
> -		goto unlock;
> +	if (!filemap_range_uptodate(iocb, mapping, iter, page)) {
> +		error = -EAGAIN;
> +		if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
> +			goto unlock;
> +		return filemap_read_page(iocb->ki_filp, mapping, page);
> +	}
>  
> -	error = filemap_read_page(iocb->ki_filp, mapping, page);
> -	if (error)
> -		goto error;
> -	return 0;
>  unlock:
>  	unlock_page(page);
> -error:
> -	put_page(page);
>  	return error;
>  }
It works out to be a little more complicated than that because
filemap_read_page() can also return AOP_TRUNCATED_PAGE.  Here's what I
currently have:
        if (!page->mapping)
                goto truncated;
        error = 0;
        if (filemap_range_uptodate(iocb, mapping, iter, page))
                goto unlock;
        error = -EAGAIN;
        if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
                goto unlock;
        error = filemap_read_page(iocb->ki_filp, mapping, page);
        if (error == AOP_TRUNCATED_PAGE)
                put_page(page);
        return error;
truncated:
        error = AOP_TRUNCATED_PAGE;
        put_page(page);
unlock:
        unlock_page(page);
        return error;
}
^ permalink raw reply	[flat|nested] 41+ messages in thread
 
 
 
- * [PATCH v2 12/18] mm/filemap: Move the iocb checks into filemap_update_page
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (10 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 11/18] mm/filemap: Convert filemap_update_page to return an errno Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 13/18] mm/filemap: Add filemap_range_uptodate Matthew Wilcox (Oracle)
                   ` (5 subsequent siblings)
  17 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
We don't need to give up when a non-blocking request sees a !Uptodate page.
We may be able to satisfy the read from a partially-uptodate page.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 mm/filemap.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 93c054f51677..9db94876122c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2236,17 +2236,18 @@ static int filemap_update_page(struct kiocb *iocb,
 		struct page *page, loff_t pos, loff_t count)
 {
 	struct inode *inode = mapping->host;
-	int error;
+	int error = -EAGAIN;
 
-	if (iocb->ki_flags & IOCB_WAITQ) {
-		error = lock_page_async(page, iocb->ki_waitq);
-		if (error)
+	if (!trylock_page(page)) {
+		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_NOIO))
 			goto error;
-	} else {
-		if (!trylock_page(page)) {
+		if (!(iocb->ki_flags & IOCB_WAITQ)) {
 			put_and_wait_on_page_locked(page, TASK_KILLABLE);
 			return AOP_TRUNCATED_PAGE;
 		}
+		error = __lock_page_async(page, iocb->ki_waitq);
+		if (error)
+			goto error;
 	}
 
 	if (!page->mapping)
@@ -2368,14 +2369,9 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 		}
 
 		if (!PageUptodate(page)) {
-			if ((iocb->ki_flags & IOCB_NOWAIT) ||
-			    ((iocb->ki_flags & IOCB_WAITQ) && pvec->nr > 1)) {
-				put_page(page);
-				pvec->nr--;
-				err = -EAGAIN;
-				goto err;
-			}
-
+			if ((iocb->ki_flags & IOCB_WAITQ) &&
+			    pagevec_count(pvec) > 1)
+				iocb->ki_flags |= IOCB_NOWAIT;
 			err = filemap_update_page(iocb, mapping, iter, page,
 					pg_pos, pg_count);
 			if (err)
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 13/18] mm/filemap: Add filemap_range_uptodate
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (11 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 12/18] mm/filemap: Move the iocb checks into filemap_update_page Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-06  8:16   ` Christoph Hellwig
  2020-11-04 20:42 ` [PATCH v2 14/18] mm/filemap: Split filemap_readahead out of filemap_get_pages Matthew Wilcox (Oracle)
                   ` (4 subsequent siblings)
  17 siblings, 1 reply; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
Move the complicated condition and the calculations out of
filemap_update_page() into its own function.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 mm/filemap.c | 80 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 46 insertions(+), 34 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 9db94876122c..281538a05dc1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2231,11 +2231,39 @@ static int filemap_read_page(struct file *file, struct address_space *mapping,
 	return error;
 }
 
+static bool filemap_range_uptodate(struct kiocb *iocb,
+		struct address_space *mapping, struct iov_iter *iter,
+		struct page *page)
+{
+	loff_t pos;
+	int count;
+
+	if (PageUptodate(page))
+		return true;
+	/* pipes can't handle partially uptodate pages */
+	if (iov_iter_is_pipe(iter))
+		return false;
+	if (!mapping->a_ops->is_partially_uptodate)
+		return false;
+	if (mapping->host->i_blkbits >= (PAGE_SHIFT + thp_order(page)))
+		return false;
+
+	pos = (loff_t) page->index << PAGE_SHIFT;
+	if (pos > iocb->ki_pos) {
+		count = iocb->ki_pos + iter->count - pos;
+		pos = 0;
+	} else {
+		count = iter->count;
+		pos = iocb->ki_pos & (thp_size(page) - 1);
+	}
+
+	return mapping->a_ops->is_partially_uptodate(page, pos, count);
+}
+
 static int filemap_update_page(struct kiocb *iocb,
 		struct address_space *mapping, struct iov_iter *iter,
-		struct page *page, loff_t pos, loff_t count)
+		struct page *page)
 {
-	struct inode *inode = mapping->host;
 	int error = -EAGAIN;
 
 	if (!trylock_page(page)) {
@@ -2250,39 +2278,27 @@ static int filemap_update_page(struct kiocb *iocb,
 			goto error;
 	}
 
+	error = AOP_TRUNCATED_PAGE;
 	if (!page->mapping)
-		goto truncated;
-	if (PageUptodate(page))
-		goto uptodate;
-	if (inode->i_blkbits == PAGE_SHIFT ||
-			!mapping->a_ops->is_partially_uptodate)
-		goto readpage;
-	/* pipes can't handle partially uptodate pages */
-	if (unlikely(iov_iter_is_pipe(iter)))
-		goto readpage;
-	if (!mapping->a_ops->is_partially_uptodate(page,
-				pos & (thp_size(page) - 1), count))
-		goto readpage;
-uptodate:
-	unlock_page(page);
-	return 0;
-
-readpage:
-	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ)) {
+		goto unlock;
+	if (filemap_range_uptodate(iocb, mapping, iter, page)) {
 		unlock_page(page);
-		error = -EAGAIN;
-	} else {
-		error = filemap_read_page(iocb->ki_filp, mapping, page);
-		if (!error)
-			return 0;
+		return 0;
 	}
+
+	error = -EAGAIN;
+	if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT | IOCB_WAITQ))
+		goto unlock;
+
+	error = filemap_read_page(iocb->ki_filp, mapping, page);
+	if (error)
+		goto error;
+	return 0;
+unlock:
+	unlock_page(page);
 error:
 	put_page(page);
 	return error;
-truncated:
-	unlock_page(page);
-	put_page(page);
-	return AOP_TRUNCATED_PAGE;
 }
 
 static int filemap_create_page(struct file *file,
@@ -2353,9 +2369,6 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 	{
 		struct page *page = pvec->pages[pvec->nr - 1];
 		pgoff_t pg_index = page->index;
-		loff_t pg_pos = max(iocb->ki_pos,
-				    (loff_t) pg_index << PAGE_SHIFT);
-		loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;
 
 		if (PageReadahead(page)) {
 			if (iocb->ki_flags & IOCB_NOIO) {
@@ -2372,8 +2385,7 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 			if ((iocb->ki_flags & IOCB_WAITQ) &&
 			    pagevec_count(pvec) > 1)
 				iocb->ki_flags |= IOCB_NOWAIT;
-			err = filemap_update_page(iocb, mapping, iter, page,
-					pg_pos, pg_count);
+			err = filemap_update_page(iocb, mapping, iter, page);
 			if (err)
 				pvec->nr--;
 		}
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 14/18] mm/filemap: Split filemap_readahead out of filemap_get_pages
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (12 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 13/18] mm/filemap: Add filemap_range_uptodate Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-06  8:19   ` Christoph Hellwig
  2020-11-04 20:42 ` [PATCH v2 15/18] mm/filemap: Restructure filemap_get_pages Matthew Wilcox (Oracle)
                   ` (3 subsequent siblings)
  17 siblings, 1 reply; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
This simplifies the error handling.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 mm/filemap.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 281538a05dc1..7af0e656c5f6 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2330,6 +2330,17 @@ static int filemap_create_page(struct file *file,
 	return error;
 }
 
+static int filemap_readahead(struct kiocb *iocb, struct file *file,
+		struct address_space *mapping, struct page *page,
+		pgoff_t last_index)
+{
+	if (iocb->ki_flags & IOCB_NOIO)
+		return -EAGAIN;
+	page_cache_async_readahead(mapping, &file->f_ra, file, page,
+			page->index, last_index - page->index);
+	return 0;
+}
+
 static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 		struct pagevec *pvec)
 {
@@ -2368,17 +2379,14 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 got_pages:
 	{
 		struct page *page = pvec->pages[pvec->nr - 1];
-		pgoff_t pg_index = page->index;
 
 		if (PageReadahead(page)) {
-			if (iocb->ki_flags & IOCB_NOIO) {
-				put_page(page);
+			err = filemap_readahead(iocb, filp, mapping, page,
+					last_index);
+			if (err) {
 				pvec->nr--;
-				err = -EAGAIN;
 				goto err;
 			}
-			page_cache_async_readahead(mapping, ra, filp, page,
-					pg_index, last_index - pg_index);
 		}
 
 		if (!PageUptodate(page)) {
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 15/18] mm/filemap: Restructure filemap_get_pages
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (13 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 14/18] mm/filemap: Split filemap_readahead out of filemap_get_pages Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-06  8:21   ` Christoph Hellwig
  2020-11-04 20:42 ` [PATCH v2 16/18] mm/filemap: Don't relock the page after calling readpage Matthew Wilcox (Oracle)
                   ` (2 subsequent siblings)
  17 siblings, 1 reply; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
Remove the got_pages label, remove indentation, rename find_page to retry,
simplify error handling.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 mm/filemap.c | 64 +++++++++++++++++++++-------------------------------
 1 file changed, 26 insertions(+), 38 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 7af0e656c5f6..22716f4bc977 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2349,67 +2349,55 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 	struct file_ra_state *ra = &filp->f_ra;
 	pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
 	pgoff_t last_index;
+	struct page *page;
 	int err = 0;
 
 	last_index = DIV_ROUND_UP(iocb->ki_pos + iter->count, PAGE_SIZE);
-find_page:
+retry:
 	if (fatal_signal_pending(current))
 		return -EINTR;
 
 	pagevec_init(pvec);
 	filemap_get_read_batch(mapping, index, last_index, pvec);
-	if (pvec->nr)
-		goto got_pages;
-
-	if (iocb->ki_flags & IOCB_NOIO)
-		return -EAGAIN;
-
-	page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
-
-	filemap_get_read_batch(mapping, index, last_index, pvec);
+	if (!pagevec_count(pvec)) {
+		if (iocb->ki_flags & IOCB_NOIO)
+			return -EAGAIN;
+		page_cache_sync_readahead(mapping, ra, filp, index,
+				last_index - index);
+		filemap_get_read_batch(mapping, index, last_index, pvec);
+	}
 	if (!pagevec_count(pvec)) {
 		if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
 			return -EAGAIN;
 		err = filemap_create_page(filp, mapping,
 				iocb->ki_pos >> PAGE_SHIFT, pvec);
 		if (err == AOP_TRUNCATED_PAGE)
-			goto find_page;
+			goto retry;
 		return err;
 	}
-got_pages:
-	{
-		struct page *page = pvec->pages[pvec->nr - 1];
-
-		if (PageReadahead(page)) {
-			err = filemap_readahead(iocb, filp, mapping, page,
-					last_index);
-			if (err) {
-				pvec->nr--;
-				goto err;
-			}
-		}
 
-		if (!PageUptodate(page)) {
-			if ((iocb->ki_flags & IOCB_WAITQ) &&
-			    pagevec_count(pvec) > 1)
-				iocb->ki_flags |= IOCB_NOWAIT;
-			err = filemap_update_page(iocb, mapping, iter, page);
-			if (err)
-				pvec->nr--;
-		}
+	page = pvec->pages[pagevec_count(pvec) - 1];
+	if (PageReadahead(page)) {
+		err = filemap_readahead(iocb, filp, mapping, page, last_index);
+		if (err)
+			goto err;
+	}
+	if (!PageUptodate(page)) {
+		if ((iocb->ki_flags & IOCB_WAITQ) && pagevec_count(pvec) > 1)
+			iocb->ki_flags |= IOCB_NOWAIT;
+		err = filemap_update_page(iocb, mapping, iter, page);
+		if (err)
+			goto err;
 	}
 
+	return 0;
 err:
+	pvec->nr--;
 	if (likely(pvec->nr))
 		return 0;
 	if (err == AOP_TRUNCATED_PAGE)
-		goto find_page;
-	if (err)
-		return err;
-	/*
-	 * No pages and no error means we raced and should retry:
-	 */
-	goto find_page;
+		goto retry;
+	return err;
 }
 
 /**
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * Re: [PATCH v2 15/18] mm/filemap: Restructure filemap_get_pages
  2020-11-04 20:42 ` [PATCH v2 15/18] mm/filemap: Restructure filemap_get_pages Matthew Wilcox (Oracle)
@ 2020-11-06  8:21   ` Christoph Hellwig
  0 siblings, 0 replies; 41+ messages in thread
From: Christoph Hellwig @ 2020-11-06  8:21 UTC (permalink / raw)
  To: Matthew Wilcox (Oracle); +Cc: linux-fsdevel, linux-mm, hch, kent.overstreet
On Wed, Nov 04, 2020 at 08:42:16PM +0000, Matthew Wilcox (Oracle) wrote:
> Remove the got_pages label, remove indentation, rename find_page to retry,
> simplify error handling.
> 
> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
> Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
Looks good,
Reviewed-by: Christoph Hellwig <hch@lst.de>
^ permalink raw reply	[flat|nested] 41+ messages in thread 
 
- * [PATCH v2 16/18] mm/filemap: Don't relock the page after calling readpage
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (14 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 15/18] mm/filemap: Restructure filemap_get_pages Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-06  8:21   ` Christoph Hellwig
  2020-11-04 20:42 ` [PATCH v2 17/18] mm/filemap: Rename generic_file_buffered_read to filemap_read Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 18/18] mm/filemap: Simplify generic_file_read_iter Matthew Wilcox (Oracle)
  17 siblings, 1 reply; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm; +Cc: Matthew Wilcox (Oracle), hch, kent.overstreet
We don't need to get the page lock again; we just need to wait for
the I/O to finish, so use wait_on_page_locked_killable() like the
other callers of ->readpage.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 mm/filemap.c | 21 +++++++--------------
 1 file changed, 7 insertions(+), 14 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 22716f4bc977..721dcb580657 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2212,23 +2212,16 @@ static int filemap_read_page(struct file *file, struct address_space *mapping,
 	error = mapping->a_ops->readpage(file, page);
 	if (error)
 		return error;
-	if (PageUptodate(page))
-		return 0;
 
-	error = lock_page_killable(page);
+	error = wait_on_page_locked_killable(page);
 	if (error)
 		return error;
-	if (!PageUptodate(page)) {
-		if (page->mapping == NULL) {
-			/* page truncated */
-			error = AOP_TRUNCATED_PAGE;
-		} else {
-			shrink_readahead_size_eio(&file->f_ra);
-			error = -EIO;
-		}
-	}
-	unlock_page(page);
-	return error;
+	if (PageUptodate(page))
+		return 0;
+	if (!page->mapping)	/* page truncated */
+		return AOP_TRUNCATED_PAGE;
+	shrink_readahead_size_eio(&file->f_ra);
+	return -EIO;
 }
 
 static bool filemap_range_uptodate(struct kiocb *iocb,
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 17/18] mm/filemap: Rename generic_file_buffered_read to filemap_read
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (15 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 16/18] mm/filemap: Don't relock the page after calling readpage Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  2020-11-04 20:42 ` [PATCH v2 18/18] mm/filemap: Simplify generic_file_read_iter Matthew Wilcox (Oracle)
  17 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm
  Cc: Christoph Hellwig, kent.overstreet, Matthew Wilcox
From: Christoph Hellwig <hch@lst.de>
Rename generic_file_buffered_read to match the naming of filemap_fault,
also update the written parameter to a more descriptive name and
improve the kerneldoc comment.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 fs/btrfs/file.c    |  2 +-
 include/linux/fs.h |  4 ++--
 mm/filemap.c       | 35 ++++++++++++++++-------------------
 3 files changed, 19 insertions(+), 22 deletions(-)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 87355a38a654..1a4913e1fd12 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -3633,7 +3633,7 @@ static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
 			return ret;
 	}
 
-	return generic_file_buffered_read(iocb, to, ret);
+	return filemap_read(iocb, to, ret);
 }
 
 const struct file_operations btrfs_file_operations = {
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 4ccc879ae845..413e327fa1c6 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2946,8 +2946,8 @@ extern ssize_t generic_write_checks(struct kiocb *, struct iov_iter *);
 extern int generic_write_check_limits(struct file *file, loff_t pos,
 		loff_t *count);
 extern int generic_file_rw_checks(struct file *file_in, struct file *file_out);
-extern ssize_t generic_file_buffered_read(struct kiocb *iocb,
-		struct iov_iter *to, ssize_t already_read);
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *to,
+		ssize_t already_read);
 extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *);
 extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *);
diff --git a/mm/filemap.c b/mm/filemap.c
index 721dcb580657..d24c25345bae 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2394,23 +2394,20 @@ static int filemap_get_pages(struct kiocb *iocb, struct iov_iter *iter,
 }
 
 /**
- * generic_file_buffered_read - generic file read routine
- * @iocb:	the iocb to read
- * @iter:	data destination
- * @written:	already copied
+ * filemap_read - Read data from the page cache.
+ * @iocb: The iocb to read.
+ * @iter: Destination for the data.
+ * @already_read: Number of bytes already read by the caller.
  *
- * This is a generic file read routine, and uses the
- * mapping->a_ops->readpage() function for the actual low-level stuff.
+ * Copies data from the page cache.  If the data is not currently present,
+ * uses the readahead and readpage address_space operations to fetch it.
  *
- * This is really ugly. But the goto's actually try to clarify some
- * of the logic when it comes to error handling etc.
- *
- * Return:
- * * total number of bytes copied, including those the were already @written
- * * negative error code if nothing was copied
+ * Return: Total number of bytes copied, including those already read by
+ * the caller.  If an error happens before any bytes are copied, returns
+ * a negative error number.
  */
-ssize_t generic_file_buffered_read(struct kiocb *iocb,
-		struct iov_iter *iter, ssize_t written)
+ssize_t filemap_read(struct kiocb *iocb, struct iov_iter *iter,
+		ssize_t already_read)
 {
 	struct file *filp = iocb->ki_filp;
 	struct file_ra_state *ra = &filp->f_ra;
@@ -2433,7 +2430,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 		 * can no longer safely return -EIOCBQUEUED. Hence mark
 		 * an async read NOWAIT at that point.
 		 */
-		if ((iocb->ki_flags & IOCB_WAITQ) && written)
+		if ((iocb->ki_flags & IOCB_WAITQ) && already_read)
 			iocb->ki_flags |= IOCB_NOWAIT;
 
 		error = filemap_get_pages(iocb, iter, &pvec);
@@ -2493,7 +2490,7 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
 			copied = copy_page_to_iter(page, offset, bytes, iter);
 
-			written += copied;
+			already_read += copied;
 			iocb->ki_pos += copied;
 			ra->prev_pos = iocb->ki_pos;
 
@@ -2509,9 +2506,9 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
 
 	file_accessed(filp);
 
-	return written ? written : error;
+	return already_read ? already_read : error;
 }
-EXPORT_SYMBOL_GPL(generic_file_buffered_read);
+EXPORT_SYMBOL_GPL(filemap_read);
 
 /**
  * generic_file_read_iter - generic filesystem read routine
@@ -2585,7 +2582,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 			goto out;
 	}
 
-	retval = generic_file_buffered_read(iocb, iter, retval);
+	retval = filemap_read(iocb, iter, retval);
 out:
 	return retval;
 }
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread
- * [PATCH v2 18/18] mm/filemap: Simplify generic_file_read_iter
  2020-11-04 20:42 [PATCH v2 00/18] Refactor generic_file_buffered_read Matthew Wilcox (Oracle)
                   ` (16 preceding siblings ...)
  2020-11-04 20:42 ` [PATCH v2 17/18] mm/filemap: Rename generic_file_buffered_read to filemap_read Matthew Wilcox (Oracle)
@ 2020-11-04 20:42 ` Matthew Wilcox (Oracle)
  17 siblings, 0 replies; 41+ messages in thread
From: Matthew Wilcox (Oracle) @ 2020-11-04 20:42 UTC (permalink / raw)
  To: linux-fsdevel, linux-mm
  Cc: Christoph Hellwig, kent.overstreet, Matthew Wilcox
From: Christoph Hellwig <hch@lst.de>
Avoid the pointless goto out just for returning retval.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Reviewed-by: Kent Overstreet <kent.overstreet@gmail.com>
---
 mm/filemap.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index d24c25345bae..e84845ec7cd4 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2538,7 +2538,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	ssize_t retval = 0;
 
 	if (!count)
-		goto out; /* skip atime */
+		return 0; /* skip atime */
 
 	if (iocb->ki_flags & IOCB_DIRECT) {
 		struct file *file = iocb->ki_filp;
@@ -2556,7 +2556,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 						iocb->ki_pos,
 					        iocb->ki_pos + count - 1);
 			if (retval < 0)
-				goto out;
+				return retval;
 		}
 
 		file_accessed(file);
@@ -2579,12 +2579,10 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 		 */
 		if (retval < 0 || !count || iocb->ki_pos >= size ||
 		    IS_DAX(inode))
-			goto out;
+			return retval;
 	}
 
-	retval = filemap_read(iocb, iter, retval);
-out:
-	return retval;
+	return filemap_read(iocb, iter, retval);
 }
 EXPORT_SYMBOL(generic_file_read_iter);
 
-- 
2.28.0
^ permalink raw reply related	[flat|nested] 41+ messages in thread