From: Matthew Wilcox <willy@infradead.org>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Amir Goldstein <amir73il@gmail.com>,
Hugh Dickins <hughd@google.com>,
Michael Larabel <Michael@michaellarabel.com>,
Ted Ts'o <tytso@google.com>,
Andreas Dilger <adilger.kernel@dilger.ca>,
Ext4 Developers List <linux-ext4@vger.kernel.org>,
Jan Kara <jack@suse.cz>,
linux-fsdevel <linux-fsdevel@vger.kernel.org>
Subject: Re: Kernel Benchmarking
Date: Sat, 12 Sep 2020 23:32:07 +0100 [thread overview]
Message-ID: <20200912223207.GD6583@casper.infradead.org> (raw)
In-Reply-To: <CAHk-=whjhYa3ig0U_mtpoU5Zok_2Y5zTCw8f-THkf1vHRBDNuA@mail.gmail.com>
On Sat, Sep 12, 2020 at 10:59:40AM -0700, Linus Torvalds wrote:
> Anyway, I don't have a great solution. I have a few options (roughly
> ordered by "simplest to most complex"):
>
> (a) just revert
> (b) add some busy-spinning
> (c) reader-writer page lock
> (d) try to de-emphasize the page lock
>
> Option (d) is "we already have a locking in many filesystems that give
> us exclusion between faulting in a page, and the truncate/hole punch,
> so we shouldn't use the page lock at all".
>
> I do think that the locking that filesystems do is in many ways
> inferior - it's done on a per-inode basis rather than on a per-page
> basis. But if the filesystems end up doing that *anyway*, what's the
> advantage of the finer granularity one? And *because* the common case
> is all about the reading case, the bigger granularity tends to work
> very well in practice, and basically never sees contention.
I guess this is option (e). Completely untested; not even compiled,
but it might be a design that means filesystems don't need to take
per-inode locks. I probably screwed up the drop-mmap-lock-for-io
parts of filemap_fault. I definitely didn't update DAX for the
new parameter for finish_fault(), and now I think about it, I didn't
update the header file either, so it definitely won't compile.
diff --git a/mm/filemap.c b/mm/filemap.c
index 1aaea26556cc..3909613f1c9c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2602,8 +2602,22 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
}
}
+ if (fpin)
+ goto out_retry;
+ if (likely(PageUptodate(page)))
+ goto uptodate;
+
if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
goto out_retry;
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+
+ /* Did somebody else update it for us? */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ if (fpin)
+ goto out_retry;
+ goto uptodate;
+ }
/* Did it get truncated? */
if (unlikely(compound_head(page)->mapping != mapping)) {
@@ -2611,14 +2625,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
put_page(page);
goto retry_find;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
-
- /*
- * We have a locked page in the page cache, now we need to check
- * that it's up-to-date. If not, it is going to be due to an error.
- */
- if (unlikely(!PageUptodate(page)))
- goto page_not_uptodate;
/*
* We've made it this far and we had to drop our mmap_lock, now is the
@@ -2641,10 +2647,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
- vmf->page = page;
- return ret | VM_FAULT_LOCKED;
-
-page_not_uptodate:
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
@@ -2680,6 +2682,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
if (fpin)
fput(fpin);
return ret | VM_FAULT_RETRY;
+
+uptodate:
+ vmf->page = page;
+ return ret | VM_FAULT_UPTODATE;
}
EXPORT_SYMBOL(filemap_fault);
diff --git a/mm/memory.c b/mm/memory.c
index 469af373ae76..48fb04e75a3a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3460,6 +3460,8 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
return VM_FAULT_HWPOISON;
}
+ if (ret & VM_FAULT_UPTODATE)
+ return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf->page);
else
@@ -3684,7 +3686,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
*
* Return: %0 on success, %VM_FAULT_ code in case of error.
*/
-vm_fault_t finish_fault(struct vm_fault *vmf)
+vm_fault_t finish_fault(struct vm_fault *vmf, vm_fault_t ret2)
{
struct page *page;
vm_fault_t ret = 0;
@@ -3704,9 +3706,17 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
ret = check_stable_address_space(vmf->vma->vm_mm);
if (!ret)
ret = alloc_set_pte(vmf, page);
+ if (ret2 & VM_FAULT_UPTODATE) {
+ if (!PageUptodate(page)) {
+ /* probably other things to do here */
+ page_remove_rmap(page);
+ pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
+ put_page(page);
+ }
+ }
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
- return ret;
+ return ret | ret2;
}
static unsigned long fault_around_bytes __read_mostly =
@@ -3844,8 +3854,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
- ret |= finish_fault(vmf);
- unlock_page(vmf->page);
+ ret = finish_fault(vmf, ret);
+ if (!(ret & VM_FAULT_UPTODATE))
+ unlock_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
put_page(vmf->page);
return ret;
@@ -3878,8 +3889,9 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
__SetPageUptodate(vmf->cow_page);
- ret |= finish_fault(vmf);
- unlock_page(vmf->page);
+ ret = finish_fault(vmf, ret);
+ if (!(ret & VM_FAULT_UPTODATE))
+ unlock_page(vmf->page);
put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
@@ -3912,10 +3924,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
}
}
- ret |= finish_fault(vmf);
+ ret = finish_fault(vmf, ret);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(vmf->page);
+ if (!(ret & VM_FAULT_UPTODATE))
+ unlock_page(vmf->page);
put_page(vmf->page);
return ret;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index dd9ebc1da356..649381703f31 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -176,6 +176,7 @@ void do_invalidatepage(struct page *page, unsigned int offset,
static void
truncate_cleanup_page(struct address_space *mapping, struct page *page)
{
+ ClearPageUptodate(page);
if (page_mapped(page)) {
pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
unmap_mapping_pages(mapping, page->index, nr, false);
@@ -738,7 +739,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
1, false);
}
}
- BUG_ON(page_mapped(page));
ret2 = do_launder_page(mapping, page);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
next prev parent reply other threads:[~2020-09-12 22:32 UTC|newest]
Thread overview: 59+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <CAHk-=wiZnE409WkTOG6fbF_eV1LgrHBvMtyKkpTqM9zT5hpf9A@mail.gmail.com>
[not found] ` <aa90f272-1186-f9e1-8fdb-eefd332fdae8@MichaelLarabel.com>
[not found] ` <CAHk-=wh_31_XBNHbdF7EUJceLpEpwRxVF+_1TONzyBUym6Pw4w@mail.gmail.com>
[not found] ` <e24ef34d-7b1d-dd99-082d-28ca285a79ff@MichaelLarabel.com>
[not found] ` <CAHk-=wgEE4GuNjcRaaAvaS97tW+239-+tjcPjTq2FGhEuM8HYg@mail.gmail.com>
[not found] ` <6e1d8740-2594-c58b-ff02-a04df453d53c@MichaelLarabel.com>
[not found] ` <CAHk-=wgJ3-cEkU-5zXFPvRCHKkCCuKxVauYWGphjePEhJJgtgQ@mail.gmail.com>
[not found] ` <d2023f4c-ef14-b877-b5bb-e4f8af332abc@MichaelLarabel.com>
[not found] ` <CAHk-=wiz=J=8mJ=zRG93nuJ9GtQAm5bSRAbWJbWZuN4Br38+EQ@mail.gmail.com>
[not found] ` <CAHk-=wimM2kckaYj7spUJwehZkSYxK9RQqu3G392BE=73dyKtg@mail.gmail.com>
[not found] ` <8bb582d2-2841-94eb-8862-91d1225d5ebc@MichaelLarabel.com>
[not found] ` <CAHk-=wjqE_a6bpZyDQ4DCrvj_Dv2RwQoY7wN91kj8y-tZFRvEA@mail.gmail.com>
[not found] ` <0cbc959e-1b8d-8d7e-1dc6-672cf5b3899a@MichaelLarabel.com>
[not found] ` <CAHk-=whP-7Uw9WgWgjRgF1mCg+NnkOPpWjVw+a9M3F9C52DrVg@mail.gmail.com>
[not found] ` <CAHk-=wjfw3U5eTGWLaisPHg1+jXsCX=xLZgqPx4KJeHhEqRnEQ@mail.gmail.com>
[not found] ` <a2369108-7103-278c-9f10-6309a0a9dc3b@MichaelLarabel.com>
2020-09-12 7:28 ` Kernel Benchmarking Amir Goldstein
2020-09-12 10:32 ` Michael Larabel
2020-09-12 14:37 ` Matthew Wilcox
2020-09-12 14:44 ` Michael Larabel
2020-09-15 3:32 ` Matthew Wilcox
2020-09-15 10:39 ` Jan Kara
2020-09-15 13:52 ` Matthew Wilcox
[not found] ` <658ae026-32d9-0a25-5a59-9c510d6898d5@MichaelLarabel.com>
2020-09-14 17:47 ` Linus Torvalds
2020-09-14 20:21 ` Matthieu Baerts
2020-09-14 20:53 ` Linus Torvalds
2020-09-15 0:42 ` Linus Torvalds
2020-09-15 15:34 ` Matthieu Baerts
2020-09-15 18:27 ` Linus Torvalds
2020-09-15 18:47 ` Linus Torvalds
2020-09-15 19:26 ` Matthieu Baerts
2020-09-15 19:32 ` Linus Torvalds
2020-09-15 19:56 ` Matthieu Baerts
2020-09-15 23:35 ` Linus Torvalds
2020-09-16 10:34 ` Jan Kara
2020-09-16 18:47 ` Linus Torvalds
[not found] ` <9a92bf16-02c5-ba38-33c7-f350588ac874@tessares.net>
2020-09-15 19:24 ` Linus Torvalds
2020-09-15 19:38 ` Matthieu Baerts
2020-09-15 18:31 ` Linus Torvalds
2020-09-15 14:21 ` Michael Larabel
2020-09-15 17:52 ` Linus Torvalds
2020-09-17 17:51 ` Linus Torvalds
2020-09-17 18:23 ` Matthew Wilcox
2020-09-17 18:30 ` Linus Torvalds
2020-09-17 18:50 ` Matthew Wilcox
2020-09-17 19:00 ` Linus Torvalds
2020-09-17 19:27 ` Matthew Wilcox
2020-09-17 19:47 ` Linus Torvalds
2020-09-18 0:39 ` Sedat Dilek
2020-09-18 0:40 ` Sedat Dilek
2020-09-18 20:25 ` Sedat Dilek
2020-09-20 17:06 ` Linus Torvalds
2020-09-20 17:14 ` Sedat Dilek
2020-09-20 17:40 ` Linus Torvalds
2020-09-20 18:00 ` Sedat Dilek
2020-09-20 23:23 ` Dave Chinner
2020-09-20 23:31 ` Linus Torvalds
2020-09-20 23:40 ` Linus Torvalds
2020-09-21 1:20 ` Dave Chinner
2020-09-12 15:53 ` Matthew Wilcox
2020-09-12 17:59 ` Linus Torvalds
2020-09-12 20:32 ` Rogério Brito
2020-09-14 9:33 ` Jan Kara
2020-09-12 20:58 ` Josh Triplett
2020-09-12 20:59 ` James Bottomley
2020-09-12 21:15 ` Linus Torvalds
2020-09-12 22:32 ` Matthew Wilcox [this message]
2020-09-13 0:40 ` Dave Chinner
2020-09-13 2:39 ` Linus Torvalds
2020-09-13 3:40 ` Matthew Wilcox
2020-09-13 23:45 ` Dave Chinner
2020-09-14 3:31 ` Matthew Wilcox
2020-09-15 14:28 ` Chris Mason
2020-09-15 9:27 ` Jan Kara
2020-09-13 3:18 ` Matthew Wilcox
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200912223207.GD6583@casper.infradead.org \
--to=willy@infradead.org \
--cc=Michael@michaellarabel.com \
--cc=adilger.kernel@dilger.ca \
--cc=amir73il@gmail.com \
--cc=hughd@google.com \
--cc=jack@suse.cz \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
--cc=tytso@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).