From: Matthew Wilcox <willy@infradead.org>
To: Wang Yugui <wangyugui@e16-tech.com>
Cc: Dave Chinner <david@fromorbit.com>,
linux-xfs@vger.kernel.org, linux-fsdevel@vger.kernel.org
Subject: Re: Creating large folios in iomap buffered write path
Date: Thu, 18 May 2023 22:46:43 +0100 [thread overview]
Message-ID: <ZGacw+1cu49qnttj@casper.infradead.org> (raw)
In-Reply-To: <ZGZwNqYhttjREl0V@casper.infradead.org>
On Thu, May 18, 2023 at 07:36:38PM +0100, Matthew Wilcox wrote:
> Not so much a "folio problem" as "an enhancement nobody got around to doing
> yet". Here's a first attempt. It's still churning through an xfstests
> run for me. I have seen this warning trigger:
>
> WARN_ON_ONCE(!folio_test_uptodate(folio) &&
> folio_test_dirty(folio));
>
> in iomap_invalidate_folio() as it's now possible to create a folio
> for write that is larger than the write, and therefore we won't
> mark it uptodate. Maybe we should create slightly smaller folios.
Here's one that does. A couple of other small problems also fixed.
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c739b258a2d9..3702e5e47b0f 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -971,7 +971,7 @@ gfs2_iomap_get_folio(struct iomap_iter *iter, loff_t pos, unsigned len)
if (status)
return ERR_PTR(status);
- folio = iomap_get_folio(iter, pos);
+ folio = iomap_get_folio(iter, pos, len);
if (IS_ERR(folio))
gfs2_trans_end(sdp);
return folio;
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index 063133ec77f4..32ddddf9f35c 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -461,19 +461,25 @@ EXPORT_SYMBOL_GPL(iomap_is_partially_uptodate);
* iomap_get_folio - get a folio reference for writing
* @iter: iteration structure
* @pos: start offset of write
+ * @len: length of write
*
* Returns a locked reference to the folio at @pos, or an error pointer if the
* folio could not be obtained.
*/
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos)
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len)
{
unsigned fgp = FGP_WRITEBEGIN | FGP_NOFS;
+ struct folio *folio;
if (iter->flags & IOMAP_NOWAIT)
fgp |= FGP_NOWAIT;
+ fgp |= fgp_order(len);
- return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
+ folio = __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT,
fgp, mapping_gfp_mask(iter->inode->i_mapping));
+ if (!IS_ERR(folio) && folio_test_large(folio))
+ printk("index:%lu len:%zu order:%u\n", (unsigned long)(pos / PAGE_SIZE), len, folio_order(folio));
+ return folio;
}
EXPORT_SYMBOL_GPL(iomap_get_folio);
@@ -510,8 +516,8 @@ void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len)
iomap_page_release(folio);
} else if (folio_test_large(folio)) {
/* Must release the iop so the page can be split */
- WARN_ON_ONCE(!folio_test_uptodate(folio) &&
- folio_test_dirty(folio));
+ VM_WARN_ON_ONCE_FOLIO(!folio_test_uptodate(folio) &&
+ folio_test_dirty(folio), folio);
iomap_page_release(folio);
}
}
@@ -603,7 +609,7 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos,
if (folio_ops && folio_ops->get_folio)
return folio_ops->get_folio(iter, pos, len);
else
- return iomap_get_folio(iter, pos);
+ return iomap_get_folio(iter, pos, len);
}
static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret,
diff --git a/include/linux/iomap.h b/include/linux/iomap.h
index e2b836c2e119..80facb9c9e5b 100644
--- a/include/linux/iomap.h
+++ b/include/linux/iomap.h
@@ -261,7 +261,7 @@ int iomap_file_buffered_write_punch_delalloc(struct inode *inode,
int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops);
void iomap_readahead(struct readahead_control *, const struct iomap_ops *ops);
bool iomap_is_partially_uptodate(struct folio *, size_t from, size_t count);
-struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos);
+struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len);
bool iomap_release_folio(struct folio *folio, gfp_t gfp_flags);
void iomap_invalidate_folio(struct folio *folio, size_t offset, size_t len);
int iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len,
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index a56308a9d1a4..f4d05beb64eb 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -466,6 +466,19 @@ static inline void *detach_page_private(struct page *page)
return folio_detach_private(page_folio(page));
}
+/*
+ * There are some parts of the kernel which assume that PMD entries
+ * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
+ * limit the maximum allocation order to PMD size. I'm not aware of any
+ * assumptions about maximum order if THP are disabled, but 8 seems like
+ * a good order (that's 1MB if you're using 4kB pages)
+ */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
+#else
+#define MAX_PAGECACHE_ORDER 8
+#endif
+
#ifdef CONFIG_NUMA
struct folio *filemap_alloc_folio(gfp_t gfp, unsigned int order);
#else
@@ -505,14 +518,24 @@ pgoff_t page_cache_prev_miss(struct address_space *mapping,
#define FGP_NOWAIT 0x00000020
#define FGP_FOR_MMAP 0x00000040
#define FGP_STABLE 0x00000080
+#define FGP_ORDER(fgp) ((fgp) >> 26) /* top 6 bits */
#define FGP_WRITEBEGIN (FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE)
+static inline unsigned fgp_order(size_t size)
+{
+ unsigned int shift = ilog2(size);
+
+ if (shift <= PAGE_SHIFT)
+ return 0;
+ return (shift - PAGE_SHIFT) << 26;
+}
+
void *filemap_get_entry(struct address_space *mapping, pgoff_t index);
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp);
+ unsigned fgp_flags, gfp_t gfp);
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp);
+ unsigned fgp_flags, gfp_t gfp);
/**
* filemap_get_folio - Find and get a folio.
@@ -586,7 +609,7 @@ static inline struct page *find_get_page(struct address_space *mapping,
}
static inline struct page *find_get_page_flags(struct address_space *mapping,
- pgoff_t offset, int fgp_flags)
+ pgoff_t offset, unsigned fgp_flags)
{
return pagecache_get_page(mapping, offset, fgp_flags, 0);
}
diff --git a/mm/filemap.c b/mm/filemap.c
index b4c9bd368b7e..7abbb072d4d9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1910,7 +1910,7 @@ void *filemap_get_entry(struct address_space *mapping, pgoff_t index)
* Return: The found folio or an ERR_PTR() otherwise.
*/
struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp)
+ unsigned fgp_flags, gfp_t gfp)
{
struct folio *folio;
@@ -1952,7 +1952,9 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
folio_wait_stable(folio);
no_page:
if (!folio && (fgp_flags & FGP_CREAT)) {
+ unsigned order = FGP_ORDER(fgp_flags);
int err;
+
if ((fgp_flags & FGP_WRITE) && mapping_can_writeback(mapping))
gfp |= __GFP_WRITE;
if (fgp_flags & FGP_NOFS)
@@ -1961,26 +1963,38 @@ struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
gfp &= ~GFP_KERNEL;
gfp |= GFP_NOWAIT | __GFP_NOWARN;
}
-
- folio = filemap_alloc_folio(gfp, 0);
- if (!folio)
- return ERR_PTR(-ENOMEM);
-
if (WARN_ON_ONCE(!(fgp_flags & (FGP_LOCK | FGP_FOR_MMAP))))
fgp_flags |= FGP_LOCK;
- /* Init accessed so avoid atomic mark_page_accessed later */
- if (fgp_flags & FGP_ACCESSED)
- __folio_set_referenced(folio);
+ if (order > MAX_PAGECACHE_ORDER)
+ order = MAX_PAGECACHE_ORDER;
+ /* If we're not aligned, allocate a smaller folio */
+ if (index & ((1UL << order) - 1))
+ order = __ffs(index);
- err = filemap_add_folio(mapping, folio, index, gfp);
- if (unlikely(err)) {
+ do {
+ err = -ENOMEM;
+ if (order == 1)
+ order = 0;
+ folio = filemap_alloc_folio(gfp, order);
+ if (!folio)
+ continue;
+
+ /* Init accessed so avoid atomic mark_page_accessed later */
+ if (fgp_flags & FGP_ACCESSED)
+ __folio_set_referenced(folio);
+
+ err = filemap_add_folio(mapping, folio, index, gfp);
+ if (!err)
+ break;
folio_put(folio);
folio = NULL;
- if (err == -EEXIST)
- goto repeat;
- }
+ } while (order-- > 0);
+ if (err == -EEXIST)
+ goto repeat;
+ if (err)
+ return ERR_PTR(err);
/*
* filemap_add_folio locks the page, and for mmap
* we expect an unlocked page.
diff --git a/mm/folio-compat.c b/mm/folio-compat.c
index c6f056c20503..c96e88d9a262 100644
--- a/mm/folio-compat.c
+++ b/mm/folio-compat.c
@@ -92,7 +92,7 @@ EXPORT_SYMBOL(add_to_page_cache_lru);
noinline
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t index,
- int fgp_flags, gfp_t gfp)
+ unsigned fgp_flags, gfp_t gfp)
{
struct folio *folio;
diff --git a/mm/readahead.c b/mm/readahead.c
index 47afbca1d122..59a071badb90 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -462,19 +462,6 @@ static int try_context_readahead(struct address_space *mapping,
return 1;
}
-/*
- * There are some parts of the kernel which assume that PMD entries
- * are exactly HPAGE_PMD_ORDER. Those should be fixed, but until then,
- * limit the maximum allocation order to PMD size. I'm not aware of any
- * assumptions about maximum order if THP are disabled, but 8 seems like
- * a good order (that's 1MB if you're using 4kB pages)
- */
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-#define MAX_PAGECACHE_ORDER HPAGE_PMD_ORDER
-#else
-#define MAX_PAGECACHE_ORDER 8
-#endif
-
static inline int ra_alloc_folio(struct readahead_control *ractl, pgoff_t index,
pgoff_t mark, unsigned int order, gfp_t gfp)
{
next prev parent reply other threads:[~2023-05-18 21:46 UTC|newest]
Thread overview: 7+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <20230510165055.01D5.409509F4@e16-tech.com>
[not found] ` <20230511013410.GY3223426@dread.disaster.area>
[not found] ` <20230517210740.6464.409509F4@e16-tech.com>
2023-05-18 18:36 ` Creating large folios in iomap buffered write path Matthew Wilcox
2023-05-18 21:46 ` Matthew Wilcox [this message]
2023-05-18 22:03 ` Matthew Wilcox
2023-05-19 2:55 ` Wang Yugui
2023-05-19 15:38 ` Matthew Wilcox
2023-05-20 13:35 ` Wang Yugui
2023-05-20 16:35 ` Matthew Wilcox
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=ZGacw+1cu49qnttj@casper.infradead.org \
--to=willy@infradead.org \
--cc=david@fromorbit.com \
--cc=linux-fsdevel@vger.kernel.org \
--cc=linux-xfs@vger.kernel.org \
--cc=wangyugui@e16-tech.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).