* [PATCH 03/12] readahead: Put pages in cache earlier
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
@ 2020-01-25 1:35 ` Matthew Wilcox
2020-01-25 19:44 ` Matthew Wilcox
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
` (2 subsequent siblings)
3 siblings, 1 reply; 11+ messages in thread
From: Matthew Wilcox @ 2020-01-25 1:35 UTC (permalink / raw)
To: linux-fsdevel
Cc: Matthew Wilcox (Oracle), linux-mm, linux-kernel, linux-btrfs,
linux-erofs, linux-ext4, linux-f2fs-devel, linux-xfs,
cluster-devel, ocfs2-devel
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
At allocation time, put the pages in the cache unless we're using
->readpages.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-btrfs@vger.kernel.org
Cc: linux-erofs@lists.ozlabs.org
Cc: linux-ext4@vger.kernel.org
Cc: linux-f2fs-devel@lists.sourceforge.net
Cc: linux-xfs@vger.kernel.org
Cc: cluster-devel@redhat.com
Cc: ocfs2-devel@oss.oracle.com
---
mm/readahead.c | 51 +++++++++++++++++++++++++++++++-------------------
1 file changed, 32 insertions(+), 19 deletions(-)
diff --git a/mm/readahead.c b/mm/readahead.c
index fc77d13af556..5a6676640f20 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -114,10 +114,10 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
EXPORT_SYMBOL(read_cache_pages);
static void read_pages(struct address_space *mapping, struct file *filp,
- struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
+ struct list_head *pages, pgoff_t start,
+ unsigned int nr_pages)
{
struct blk_plug plug;
- unsigned page_idx;
blk_start_plug(&plug);
@@ -125,18 +125,17 @@ static void read_pages(struct address_space *mapping, struct file *filp,
mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
- goto out;
- }
+ } else {
+ struct page *page;
+ unsigned long index;
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = lru_to_page(pages);
- list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
+ xa_for_each_range(&mapping->i_pages, index, page, start,
+ start + nr_pages - 1) {
mapping->a_ops->readpage(filp, page);
- put_page(page);
+ put_page(page);
+ }
}
-out:
blk_finish_plug(&plug);
}
@@ -157,9 +156,11 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
int page_idx;
+ pgoff_t page_offset;
unsigned long nr_pages = 0;
loff_t isize = i_size_read(inode);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
+ bool use_list = mapping->a_ops->readpages;
if (isize == 0)
goto out;
@@ -170,7 +171,7 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
* Preallocate as many pages as we will need.
*/
for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
- pgoff_t page_offset = offset + page_idx;
+ page_offset = offset + page_idx;
if (page_offset > end_index)
break;
@@ -178,13 +179,14 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
page = xa_load(&mapping->i_pages, page_offset);
if (page && !xa_is_value(page)) {
/*
- * Page already present? Kick off the current batch of
- * contiguous pages before continuing with the next
- * batch.
+ * Page already present? Kick off the current batch
+ * of contiguous pages before continuing with the
+ * next batch.
*/
if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages,
- gfp_mask);
+ read_pages(mapping, filp, &page_pool,
+ page_offset - nr_pages,
+ nr_pages);
nr_pages = 0;
continue;
}
@@ -192,8 +194,18 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
- page->index = page_offset;
- list_add(&page->lru, &page_pool);
+ if (use_list) {
+ page->index = page_offset;
+ list_add(&page->lru, &page_pool);
+ } else if (!add_to_page_cache_lru(page, mapping, page_offset,
+ gfp_mask)) {
+ if (nr_pages)
+ read_pages(mapping, filp, &page_pool,
+ page_offset - nr_pages,
+ nr_pages);
+ nr_pages = 0;
+ continue;
+ }
if (page_idx == nr_to_read - lookahead_size)
SetPageReadahead(page);
nr_pages++;
@@ -205,7 +217,8 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
* will then handle the error.
*/
if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
+ read_pages(mapping, filp, &page_pool, page_offset - nr_pages,
+ nr_pages);
BUG_ON(!list_empty(&page_pool));
out:
return nr_pages;
--
2.24.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH 03/12] readahead: Put pages in cache earlier
2020-01-25 1:35 ` [PATCH 03/12] readahead: Put pages in cache earlier Matthew Wilcox
@ 2020-01-25 19:44 ` Matthew Wilcox
0 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2020-01-25 19:44 UTC (permalink / raw)
To: linux-fsdevel
Cc: linux-mm, linux-kernel, linux-btrfs, linux-erofs, linux-ext4,
linux-f2fs-devel, linux-xfs, cluster-devel, ocfs2-devel
On Fri, Jan 24, 2020 at 05:35:44PM -0800, Matthew Wilcox wrote:
> @@ -192,8 +194,18 @@ unsigned long __do_page_cache_readahead(struct address_space *mapping,
> page = __page_cache_alloc(gfp_mask);
> if (!page)
> break;
> - page->index = page_offset;
> - list_add(&page->lru, &page_pool);
> + if (use_list) {
> + page->index = page_offset;
> + list_add(&page->lru, &page_pool);
> + } else if (!add_to_page_cache_lru(page, mapping, page_offset,
> + gfp_mask)) {
> + if (nr_pages)
> + read_pages(mapping, filp, &page_pool,
> + page_offset - nr_pages,
> + nr_pages);
> + nr_pages = 0;
This is missing a call to put_page().
> + continue;
> + }
> if (page_idx == nr_to_read - lookahead_size)
> SetPageReadahead(page);
> nr_pages++;
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
2020-01-25 1:35 ` [PATCH 03/12] readahead: Put pages in cache earlier Matthew Wilcox
@ 2020-01-25 1:35 ` Matthew Wilcox
2020-01-25 3:57 ` Randy Dunlap
2020-01-29 0:24 ` Dave Chinner
2020-01-25 1:35 ` [PATCH 09/12] ext4: Convert from readpages to readahead Matthew Wilcox
2020-02-13 4:38 ` [PATCH 00/12] Change readahead API Andrew Morton
3 siblings, 2 replies; 11+ messages in thread
From: Matthew Wilcox @ 2020-01-25 1:35 UTC (permalink / raw)
To: linux-fsdevel
Cc: Matthew Wilcox (Oracle), linux-mm, linux-kernel, linux-btrfs,
linux-erofs, linux-ext4, linux-f2fs-devel, linux-xfs,
cluster-devel, ocfs2-devel
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
This replaces ->readpages with a saner interface:
- Return the number of pages not read instead of an ignored error code.
- Pages are already in the page cache when ->readahead is called.
- Implementation looks up the pages in the page cache instead of
having them passed in a linked list.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-btrfs@vger.kernel.org
Cc: linux-erofs@lists.ozlabs.org
Cc: linux-ext4@vger.kernel.org
Cc: linux-f2fs-devel@lists.sourceforge.net
Cc: linux-xfs@vger.kernel.org
Cc: cluster-devel@redhat.com
Cc: ocfs2-devel@oss.oracle.com
---
Documentation/filesystems/locking.rst | 7 ++++++-
Documentation/filesystems/vfs.rst | 11 +++++++++++
include/linux/fs.h | 2 ++
include/linux/pagemap.h | 12 ++++++++++++
mm/readahead.c | 13 ++++++++++++-
5 files changed, 43 insertions(+), 2 deletions(-)
diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst
index 5057e4d9dcd1..d8a5dde914b5 100644
--- a/Documentation/filesystems/locking.rst
+++ b/Documentation/filesystems/locking.rst
@@ -239,6 +239,8 @@ prototypes::
int (*readpage)(struct file *, struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
+ unsigned (*readahead)(struct file *, struct address_space *,
+ pgoff_t start, unsigned nr_pages);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
@@ -271,7 +273,8 @@ writepage: yes, unlocks (see below)
readpage: yes, unlocks
writepages:
set_page_dirty no
-readpages:
+readahead: yes, unlocks
+readpages: no
write_begin: locks the page exclusive
write_end: yes, unlocks exclusive
bmap:
@@ -295,6 +298,8 @@ the request handler (/dev/loop).
->readpage() unlocks the page, either synchronously or via I/O
completion.
+->readahead() unlocks the page like ->readpage().
+
->readpages() populates the pagecache with the passed pages and starts
I/O against them. They come unlocked upon I/O completion.
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 7d4d09dd5e6d..bb06fb7b120b 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -706,6 +706,8 @@ cache in your filesystem. The following members are defined:
int (*readpage)(struct file *, struct page *);
int (*writepages)(struct address_space *, struct writeback_control *);
int (*set_page_dirty)(struct page *page);
+ unsigned (*readahead)(struct file *filp, struct address_space *mapping,
+ pgoff_t start, unsigned nr_pages);
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
@@ -781,6 +783,15 @@ cache in your filesystem. The following members are defined:
If defined, it should set the PageDirty flag, and the
PAGECACHE_TAG_DIRTY tag in the radix tree.
+``readahead``
+ called by the VM to read pages associated with the address_space
+ object. The pages are consecutive in the page cache and are
+ locked. The implementation should decrement the page refcount after
+ attempting I/O on each page. Usually the page will be unlocked by
+ the I/O completion handler. If the function does not attempt I/O on
+ some pages, return the number of pages which were not read so the
+ common code can unlock the pages for you.
+
``readpages``
called by the VM to read pages associated with the address_space
object. This is essentially just a vector version of readpage.
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 98e0349adb52..a10f3a72e5ac 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -375,6 +375,8 @@ struct address_space_operations {
*/
int (*readpages)(struct file *filp, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages);
+ unsigned (*readahead)(struct file *, struct address_space *,
+ pgoff_t start, unsigned nr_pages);
int (*write_begin)(struct file *, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 37a4d9e32cd3..2baafd236a82 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -630,6 +630,18 @@ static inline int add_to_page_cache(struct page *page,
return error;
}
+/*
+ * Only call this from a ->readahead implementation.
+ */
+static inline
+struct page *readahead_page(struct address_space *mapping, pgoff_t index)
+{
+ struct page *page = xa_load(&mapping->i_pages, index);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
+ return page;
+}
+
static inline unsigned long dir_pages(struct inode *inode)
{
return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >>
diff --git a/mm/readahead.c b/mm/readahead.c
index 5a6676640f20..6d65dae6dad0 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -121,7 +121,18 @@ static void read_pages(struct address_space *mapping, struct file *filp,
blk_start_plug(&plug);
- if (mapping->a_ops->readpages) {
+ if (mapping->a_ops->readahead) {
+ unsigned left = mapping->a_ops->readahead(filp, mapping,
+ start, nr_pages);
+
+ while (left) {
+ struct page *page = readahead_page(mapping,
+ start + nr_pages - left - 1);
+ unlock_page(page);
+ put_page(page);
+ left--;
+ }
+ } else if (mapping->a_ops->readpages) {
mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
/* Clean up the remaining pages */
put_pages_list(pages);
--
2.24.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
@ 2020-01-25 3:57 ` Randy Dunlap
2020-02-01 0:25 ` Matthew Wilcox
2020-01-29 0:24 ` Dave Chinner
1 sibling, 1 reply; 11+ messages in thread
From: Randy Dunlap @ 2020-01-25 3:57 UTC (permalink / raw)
To: Matthew Wilcox, linux-fsdevel
Cc: linux-mm, linux-kernel, linux-btrfs, linux-erofs, linux-ext4,
linux-f2fs-devel, linux-xfs, cluster-devel, ocfs2-devel
On 1/24/20 5:35 PM, Matthew Wilcox wrote:
> diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
> index 7d4d09dd5e6d..bb06fb7b120b 100644
> --- a/Documentation/filesystems/vfs.rst
> +++ b/Documentation/filesystems/vfs.rst
> @@ -706,6 +706,8 @@ cache in your filesystem. The following members are defined:
> int (*readpage)(struct file *, struct page *);
> int (*writepages)(struct address_space *, struct writeback_control *);
> int (*set_page_dirty)(struct page *page);
> + unsigned (*readahead)(struct file *filp, struct address_space *mapping,
> + pgoff_t start, unsigned nr_pages);
> int (*readpages)(struct file *filp, struct address_space *mapping,
> struct list_head *pages, unsigned nr_pages);
> int (*write_begin)(struct file *, struct address_space *mapping,
> @@ -781,6 +783,15 @@ cache in your filesystem. The following members are defined:
> If defined, it should set the PageDirty flag, and the
> PAGECACHE_TAG_DIRTY tag in the radix tree.
>
> +``readahead``
> + called by the VM to read pages associated with the address_space
> + object. The pages are consecutive in the page cache and are
> + locked. The implementation should decrement the page refcount after
> + attempting I/O on each page. Usually the page will be unlocked by
> + the I/O completion handler. If the function does not attempt I/O on
> + some pages, return the number of pages which were not read so the
> + common code can unlock the pages for you.
> +
Please use consistent indentation (tabs).
> ``readpages``
> called by the VM to read pages associated with the address_space
> object. This is essentially just a vector version of readpage.
cheers.
--
~Randy
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 3:57 ` Randy Dunlap
@ 2020-02-01 0:25 ` Matthew Wilcox
0 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2020-02-01 0:25 UTC (permalink / raw)
To: Randy Dunlap
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Fri, Jan 24, 2020 at 07:57:40PM -0800, Randy Dunlap wrote:
> > +``readahead``
> > + called by the VM to read pages associated with the address_space
> > + object. The pages are consecutive in the page cache and are
> > + locked. The implementation should decrement the page refcount after
> > + attempting I/O on each page. Usually the page will be unlocked by
> > + the I/O completion handler. If the function does not attempt I/O on
> > + some pages, return the number of pages which were not read so the
> > + common code can unlock the pages for you.
> > +
>
> Please use consistent indentation (tabs).
This turned out to be not my fault. The vim rst ... mode? plugin?
Whatever it is, it's converting tabs to spaces! To fix it, I had to
rename the file to .txt, make the edits, then rename it back. This is
very poor behaviour.
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
2020-01-25 3:57 ` Randy Dunlap
@ 2020-01-29 0:24 ` Dave Chinner
2020-01-30 8:00 ` Matthew Wilcox
1 sibling, 1 reply; 11+ messages in thread
From: Dave Chinner @ 2020-01-29 0:24 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Fri, Jan 24, 2020 at 05:35:45PM -0800, Matthew Wilcox wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
>
> This replaces ->readpages with a saner interface:
> - Return the number of pages not read instead of an ignored error code.
> - Pages are already in the page cache when ->readahead is called.
> - Implementation looks up the pages in the page cache instead of
> having them passed in a linked list.
....
> diff --git a/mm/readahead.c b/mm/readahead.c
> index 5a6676640f20..6d65dae6dad0 100644
> --- a/mm/readahead.c
> +++ b/mm/readahead.c
> @@ -121,7 +121,18 @@ static void read_pages(struct address_space *mapping, struct file *filp,
>
> blk_start_plug(&plug);
>
> - if (mapping->a_ops->readpages) {
> + if (mapping->a_ops->readahead) {
> + unsigned left = mapping->a_ops->readahead(filp, mapping,
> + start, nr_pages);
> +
> + while (left) {
> + struct page *page = readahead_page(mapping,
> + start + nr_pages - left - 1);
Off by one? start = 2, nr_pages = 2, left = 1, this looks up the
page at index 2, which is the one we issued IO on, not the one we
"left behind" which is at index 3.
Cheers,
Dave.
--
Dave Chinner
david@fromorbit.com
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 04/12] mm: Add readahead address space operation
2020-01-29 0:24 ` Dave Chinner
@ 2020-01-30 8:00 ` Matthew Wilcox
0 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2020-01-30 8:00 UTC (permalink / raw)
To: Dave Chinner
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Wed, Jan 29, 2020 at 11:24:56AM +1100, Dave Chinner wrote:
> On Fri, Jan 24, 2020 at 05:35:45PM -0800, Matthew Wilcox wrote:
> > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> >
> > This replaces ->readpages with a saner interface:
> > - Return the number of pages not read instead of an ignored error code.
> > - Pages are already in the page cache when ->readahead is called.
> > - Implementation looks up the pages in the page cache instead of
> > having them passed in a linked list.
> ....
> > diff --git a/mm/readahead.c b/mm/readahead.c
> > index 5a6676640f20..6d65dae6dad0 100644
> > --- a/mm/readahead.c
> > +++ b/mm/readahead.c
> > @@ -121,7 +121,18 @@ static void read_pages(struct address_space *mapping, struct file *filp,
> >
> > blk_start_plug(&plug);
> >
> > - if (mapping->a_ops->readpages) {
> > + if (mapping->a_ops->readahead) {
> > + unsigned left = mapping->a_ops->readahead(filp, mapping,
> > + start, nr_pages);
> > +
> > + while (left) {
> > + struct page *page = readahead_page(mapping,
> > + start + nr_pages - left - 1);
>
> Off by one? start = 2, nr_pages = 2, left = 1, this looks up the
> page at index 2, which is the one we issued IO on, not the one we
> "left behind" which is at index 3.
Yup. I originally had:
while (left--) ...
decided that was too confusing and didn't quite complete that thought.
^ permalink raw reply [flat|nested] 11+ messages in thread
* [PATCH 09/12] ext4: Convert from readpages to readahead
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
2020-01-25 1:35 ` [PATCH 03/12] readahead: Put pages in cache earlier Matthew Wilcox
2020-01-25 1:35 ` [PATCH 04/12] mm: Add readahead address space operation Matthew Wilcox
@ 2020-01-25 1:35 ` Matthew Wilcox
2020-02-13 4:38 ` [PATCH 00/12] Change readahead API Andrew Morton
3 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2020-01-25 1:35 UTC (permalink / raw)
To: linux-fsdevel; +Cc: Matthew Wilcox (Oracle), linux-mm, linux-kernel, linux-ext4
From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Use the new readahead operation in ext4.
Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: linux-ext4@vger.kernel.org
---
fs/ext4/ext4.h | 2 +-
fs/ext4/inode.c | 24 ++++++++++++------------
fs/ext4/readpage.c | 20 +++++++-------------
3 files changed, 20 insertions(+), 26 deletions(-)
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index f8578caba40d..a035694f3d9b 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -3216,7 +3216,7 @@ static inline void ext4_set_de_type(struct super_block *sb,
/* readpages.c */
extern int ext4_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
+ pgoff_t start, struct page *page,
unsigned nr_pages, bool is_readahead);
extern int __init ext4_init_post_read_processing(void);
extern void ext4_exit_post_read_processing(void);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 629a25d999f0..4afefc991b01 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -3217,7 +3217,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block)
static int ext4_readpage(struct file *file, struct page *page)
{
int ret = -EAGAIN;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = file_inode(file);
trace_ext4_readpage(page);
@@ -3225,23 +3225,23 @@ static int ext4_readpage(struct file *file, struct page *page)
ret = ext4_readpage_inline(inode, page);
if (ret == -EAGAIN)
- return ext4_mpage_readpages(page->mapping, NULL, page, 1,
- false);
+ return ext4_mpage_readpages(page->mapping, 0, page, 1, false);
return ret;
}
-static int
-ext4_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static unsigned
+ext4_readahead(struct file *file, struct address_space *mapping,
+ pgoff_t start, unsigned nr_pages)
{
struct inode *inode = mapping->host;
- /* If the file has inline data, no need to do readpages. */
+ /* If the file has inline data, no need to do readahead. */
if (ext4_has_inline_data(inode))
- return 0;
+ return nr_pages;
- return ext4_mpage_readpages(mapping, pages, NULL, nr_pages, true);
+ ext4_mpage_readpages(mapping, start, NULL, nr_pages, true);
+ return 0;
}
static void ext4_invalidatepage(struct page *page, unsigned int offset,
@@ -3565,7 +3565,7 @@ static int ext4_set_page_dirty(struct page *page)
static const struct address_space_operations ext4_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
@@ -3582,7 +3582,7 @@ static const struct address_space_operations ext4_aops = {
static const struct address_space_operations ext4_journalled_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_write_begin,
@@ -3598,7 +3598,7 @@ static const struct address_space_operations ext4_journalled_aops = {
static const struct address_space_operations ext4_da_aops = {
.readpage = ext4_readpage,
- .readpages = ext4_readpages,
+ .readahead = ext4_readahead,
.writepage = ext4_writepage,
.writepages = ext4_writepages,
.write_begin = ext4_da_write_begin,
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index fef7755300c3..aa3f46a237ef 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -7,8 +7,8 @@
*
* This was originally taken from fs/mpage.c
*
- * The intent is the ext4_mpage_readpages() function here is intended
- * to replace mpage_readpages() in the general case, not just for
+ * The ext4_mpage_readpages() function here is intended to
+ * replace mpage_readahead() in the general case, not just for
* encrypted files. It has some limitations (see below), where it
* will fall back to read_block_full_page(), but these limitations
* should only be hit when page_size != block_size.
@@ -209,9 +209,8 @@ static inline loff_t ext4_readpage_limit(struct inode *inode)
return i_size_read(inode);
}
-int ext4_mpage_readpages(struct address_space *mapping,
- struct list_head *pages, struct page *page,
- unsigned nr_pages, bool is_readahead)
+int ext4_mpage_readpages(struct address_space *mapping, pgoff_t start,
+ struct page *page, unsigned nr_pages, bool is_readahead)
{
struct bio *bio = NULL;
sector_t last_block_in_bio = 0;
@@ -239,14 +238,10 @@ int ext4_mpage_readpages(struct address_space *mapping,
int fully_mapped = 1;
unsigned first_hole = blocks_per_page;
- if (pages) {
- page = lru_to_page(pages);
+ if (is_readahead) {
+ page = readahead_page(mapping, start++);
prefetchw(&page->flags);
- list_del(&page->lru);
- if (add_to_page_cache_lru(page, mapping, page->index,
- readahead_gfp_mask(mapping)))
- goto next_page;
}
if (page_has_buffers(page))
@@ -402,10 +397,9 @@ int ext4_mpage_readpages(struct address_space *mapping,
else
unlock_page(page);
next_page:
- if (pages)
+ if (is_readahead)
put_page(page);
}
- BUG_ON(pages && !list_empty(pages));
if (bio)
submit_bio(bio);
return 0;
--
2.24.1
^ permalink raw reply related [flat|nested] 11+ messages in thread
* Re: [PATCH 00/12] Change readahead API
2020-01-25 1:35 [PATCH 00/12] Change readahead API Matthew Wilcox
` (2 preceding siblings ...)
2020-01-25 1:35 ` [PATCH 09/12] ext4: Convert from readpages to readahead Matthew Wilcox
@ 2020-02-13 4:38 ` Andrew Morton
2020-02-13 13:43 ` Matthew Wilcox
3 siblings, 1 reply; 11+ messages in thread
From: Andrew Morton @ 2020-02-13 4:38 UTC (permalink / raw)
To: Matthew Wilcox
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel
On Fri, 24 Jan 2020 17:35:41 -0800 Matthew Wilcox <willy@infradead.org> wrote:
> From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
>
> This series adds a readahead address_space operation to eventually
> replace the readpages operation. The key difference is that
> pages are added to the page cache as they are allocated (and
> then looked up by the filesystem) instead of passing them on a
> list to the readpages operation and having the filesystem add
> them to the page cache. It's a net reduction in code for each
> implementation, more efficient than walking a list, and solves
> the direct-write vs buffered-read problem reported by yu kuai at
> https://lore.kernel.org/linux-fsdevel/20200116063601.39201-1-yukuai3@huawei.com/
Unclear which patch fixes this and how it did it?
^ permalink raw reply [flat|nested] 11+ messages in thread
* Re: [PATCH 00/12] Change readahead API
2020-02-13 4:38 ` [PATCH 00/12] Change readahead API Andrew Morton
@ 2020-02-13 13:43 ` Matthew Wilcox
0 siblings, 0 replies; 11+ messages in thread
From: Matthew Wilcox @ 2020-02-13 13:43 UTC (permalink / raw)
To: Andrew Morton
Cc: linux-fsdevel, linux-mm, linux-kernel, linux-btrfs, linux-erofs,
linux-ext4, linux-f2fs-devel, linux-xfs, cluster-devel,
ocfs2-devel, Mark Fasheh, Joel Becker, Joseph Qi, Bob Peterson,
Andreas Gruenbacher
On Wed, Feb 12, 2020 at 08:38:52PM -0800, Andrew Morton wrote:
> On Fri, 24 Jan 2020 17:35:41 -0800 Matthew Wilcox <willy@infradead.org> wrote:
>
> > From: "Matthew Wilcox (Oracle)" <willy@infradead.org>
> >
> > This series adds a readahead address_space operation to eventually
> > replace the readpages operation. The key difference is that
> > pages are added to the page cache as they are allocated (and
> > then looked up by the filesystem) instead of passing them on a
> > list to the readpages operation and having the filesystem add
> > them to the page cache. It's a net reduction in code for each
> > implementation, more efficient than walking a list, and solves
> > the direct-write vs buffered-read problem reported by yu kuai at
> > https://lore.kernel.org/linux-fsdevel/20200116063601.39201-1-yukuai3@huawei.com/
>
> Unclear which patch fixes this and how it did it?
I suppose the problem isn't fixed until patch 13/13 is applied.
What yu kuai is seeing is a race where readahead allocates a page,
then passes it to iomap_readpages, which calls xfs_read_iomap_begin()
which looks up the extent. Then thread 2 does DIO which modifies the
extent, because there's nothing to say that thread 1 is still using it.
With this patch series, the readpages code puts the locked pages in the
cache before calling iomap_readpages, so any racing write will block on
the locked page until readahead is completed.
If you're tempted to put this into -mm, I have a couple of new changes;
one to fix a kernel-doc warning for mpage_readahead() and one to add
kernel-doc for iomap_readahead():
+++ b/fs/mpage.c
@@ -339,9 +339,7 @@
/**
* mpage_readahead - start reads against pages
- * @mapping: the address_space
- * @start: The number of the first page to read.
- * @nr_pages: The number of consecutive pages to read.
+ * @rac: Describes which pages to read.
* @get_block: The filesystem's block mapper function.
*
* This function walks the pages and the blocks within each page, building and
+++ b/fs/iomap/buffered-io.c
@@ -395,6 +395,21 @@
return done;
}
+/**
+ * iomap_readahead - Attempt to read pages from a file.
+ * @rac: Describes the pages to be read.
+ * @ops: The operations vector for the filesystem.
+ *
+ * This function is for filesystems to call to implement their readahead
+ * address_space operation.
+ *
+ * Context: The file is pinned by the caller, and the pages to be read are
+ * all locked and have an elevated refcount. This function will unlock
+ * the pages (once I/O has completed on them, or I/O has been determined to
+ * not be necessary). It will also decrease the refcount once the pages
+ * have been submitted for I/O. After this point, the page may be removed
+ * from the page cache, and should not be referenced.
+ */
void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops)
{
struct inode *inode = rac->mapping->host;
I'll do a v6 with those changes soon, but I would really like a bit more
review from filesystem people, particularly ocfs2 and gfs2.
^ permalink raw reply [flat|nested] 11+ messages in thread