From: Matthew Wilcox <willy@infradead.org>
To: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Amir Goldstein <amir73il@gmail.com>,
Hugh Dickins <hughd@google.com>,
Michael Larabel <Michael@michaellarabel.com>,
Ted Ts'o <tytso@google.com>,
Andreas Dilger <adilger.kernel@dilger.ca>,
Ext4 Developers List <linux-ext4@vger.kernel.org>,
Jan Kara <jack@suse.cz>,
linux-fsdevel <linux-fsdevel@vger.kernel.org>
Subject: Re: Kernel Benchmarking
Date: Sat, 12 Sep 2020 23:32:07 +0100 [thread overview]
Message-ID: <20200912223207.GD6583@casper.infradead.org> (raw)
In-Reply-To: <CAHk-=whjhYa3ig0U_mtpoU5Zok_2Y5zTCw8f-THkf1vHRBDNuA@mail.gmail.com>
On Sat, Sep 12, 2020 at 10:59:40AM -0700, Linus Torvalds wrote:
> Anyway, I don't have a great solution. I have a few options (roughly
> ordered by "simplest to most complex"):
>
> (a) just revert
> (b) add some busy-spinning
> (c) reader-writer page lock
> (d) try to de-emphasize the page lock
>
> Option (d) is "we already have a locking in many filesystems that give
> us exclusion between faulting in a page, and the truncate/hole punch,
> so we shouldn't use the page lock at all".
>
> I do think that the locking that filesystems do is in many ways
> inferior - it's done on a per-inode basis rather than on a per-page
> basis. But if the filesystems end up doing that *anyway*, what's the
> advantage of the finer granularity one? And *because* the common case
> is all about the reading case, the bigger granularity tends to work
> very well in practice, and basically never sees contention.
I guess this is option (e). Completely untested; not even compiled,
but it might be a design that means filesystems don't need to take
per-inode locks. I probably screwed up the drop-mmap-lock-for-io
parts of filemap_fault. I definitely didn't update DAX for the
new parameter for finish_fault(), and now I think about it, I didn't
update the header file either, so it definitely won't compile.
diff --git a/mm/filemap.c b/mm/filemap.c
index 1aaea26556cc..3909613f1c9c 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2602,8 +2602,22 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
}
}
+ if (fpin)
+ goto out_retry;
+ if (likely(PageUptodate(page)))
+ goto uptodate;
+
if (!lock_page_maybe_drop_mmap(vmf, page, &fpin))
goto out_retry;
+ VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
+
+ /* Did somebody else update it for us? */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ if (fpin)
+ goto out_retry;
+ goto uptodate;
+ }
/* Did it get truncated? */
if (unlikely(compound_head(page)->mapping != mapping)) {
@@ -2611,14 +2625,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
put_page(page);
goto retry_find;
}
- VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
-
- /*
- * We have a locked page in the page cache, now we need to check
- * that it's up-to-date. If not, it is going to be due to an error.
- */
- if (unlikely(!PageUptodate(page)))
- goto page_not_uptodate;
/*
* We've made it this far and we had to drop our mmap_lock, now is the
@@ -2641,10 +2647,6 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
return VM_FAULT_SIGBUS;
}
- vmf->page = page;
- return ret | VM_FAULT_LOCKED;
-
-page_not_uptodate:
/*
* Umm, take care of errors if the page isn't up-to-date.
* Try to re-read it _once_. We do this synchronously,
@@ -2680,6 +2682,10 @@ vm_fault_t filemap_fault(struct vm_fault *vmf)
if (fpin)
fput(fpin);
return ret | VM_FAULT_RETRY;
+
+uptodate:
+ vmf->page = page;
+ return ret | VM_FAULT_UPTODATE;
}
EXPORT_SYMBOL(filemap_fault);
diff --git a/mm/memory.c b/mm/memory.c
index 469af373ae76..48fb04e75a3a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3460,6 +3460,8 @@ static vm_fault_t __do_fault(struct vm_fault *vmf)
return VM_FAULT_HWPOISON;
}
+ if (ret & VM_FAULT_UPTODATE)
+ return ret;
if (unlikely(!(ret & VM_FAULT_LOCKED)))
lock_page(vmf->page);
else
@@ -3684,7 +3686,7 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
*
* Return: %0 on success, %VM_FAULT_ code in case of error.
*/
-vm_fault_t finish_fault(struct vm_fault *vmf)
+vm_fault_t finish_fault(struct vm_fault *vmf, vm_fault_t ret2)
{
struct page *page;
vm_fault_t ret = 0;
@@ -3704,9 +3706,17 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
ret = check_stable_address_space(vmf->vma->vm_mm);
if (!ret)
ret = alloc_set_pte(vmf, page);
+ if (ret2 & VM_FAULT_UPTODATE) {
+ if (!PageUptodate(page)) {
+ /* probably other things to do here */
+ page_remove_rmap(page);
+ pte_clear(vmf->vma->vm_mm, vmf->address, vmf->pte);
+ put_page(page);
+ }
+ }
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
- return ret;
+ return ret | ret2;
}
static unsigned long fault_around_bytes __read_mostly =
@@ -3844,8 +3854,9 @@ static vm_fault_t do_read_fault(struct vm_fault *vmf)
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
- ret |= finish_fault(vmf);
- unlock_page(vmf->page);
+ ret = finish_fault(vmf, ret);
+ if (!(ret & VM_FAULT_UPTODATE))
+ unlock_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
put_page(vmf->page);
return ret;
@@ -3878,8 +3889,9 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma);
__SetPageUptodate(vmf->cow_page);
- ret |= finish_fault(vmf);
- unlock_page(vmf->page);
+ ret = finish_fault(vmf, ret);
+ if (!(ret & VM_FAULT_UPTODATE))
+ unlock_page(vmf->page);
put_page(vmf->page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
@@ -3912,10 +3924,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
}
}
- ret |= finish_fault(vmf);
+ ret = finish_fault(vmf, ret);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
VM_FAULT_RETRY))) {
- unlock_page(vmf->page);
+ if (!(ret & VM_FAULT_UPTODATE))
+ unlock_page(vmf->page);
put_page(vmf->page);
return ret;
}
diff --git a/mm/truncate.c b/mm/truncate.c
index dd9ebc1da356..649381703f31 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -176,6 +176,7 @@ void do_invalidatepage(struct page *page, unsigned int offset,
static void
truncate_cleanup_page(struct address_space *mapping, struct page *page)
{
+ ClearPageUptodate(page);
if (page_mapped(page)) {
pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1;
unmap_mapping_pages(mapping, page->index, nr, false);
@@ -738,7 +739,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
1, false);
}
}
- BUG_ON(page_mapped(page));
ret2 = do_launder_page(mapping, page);
if (ret2 == 0) {
if (!invalidate_complete_page2(mapping, page))
next prev parent reply other threads:[~2020-09-12 22:32 UTC|newest]
Thread overview: 65+ messages / expand[flat|nested] mbox.gz Atom feed top
[not found] <CAHk-=wiZnE409WkTOG6fbF_eV1LgrHBvMtyKkpTqM9zT5hpf9A@mail.gmail.com>
[not found] ` <4ced9401-de3d-b7c9-9976-2739e837fafc@MichaelLarabel.com>
[not found] ` <CAHk-=wj+Qj=wXByMrAx3T8jmw=soUetioRrbz6dQaECx+zjMtg@mail.gmail.com>
[not found] ` <CAHk-=wgOPjbJsj-LeLc-JMx9Sz9DjGF66Q+jQFJROt9X9utdBg@mail.gmail.com>
[not found] ` <CAHk-=wjjK7PTnDZNi039yBxSHtAqusFoRrZzgMNTiYkJYdNopw@mail.gmail.com>
[not found] ` <aa90f272-1186-f9e1-8fdb-eefd332fdae8@MichaelLarabel.com>
[not found] ` <CAHk-=wh_31_XBNHbdF7EUJceLpEpwRxVF+_1TONzyBUym6Pw4w@mail.gmail.com>
[not found] ` <e24ef34d-7b1d-dd99-082d-28ca285a79ff@MichaelLarabel.com>
[not found] ` <CAHk-=wgEE4GuNjcRaaAvaS97tW+239-+tjcPjTq2FGhEuM8HYg@mail.gmail.com>
[not found] ` <6e1d8740-2594-c58b-ff02-a04df453d53c@MichaelLarabel.com>
[not found] ` <CAHk-=wgJ3-cEkU-5zXFPvRCHKkCCuKxVauYWGphjePEhJJgtgQ@mail.gmail.com>
[not found] ` <d2023f4c-ef14-b877-b5bb-e4f8af332abc@MichaelLarabel.com>
[not found] ` <CAHk-=wiz=J=8mJ=zRG93nuJ9GtQAm5bSRAbWJbWZuN4Br38+EQ@mail.gmail.com>
2020-09-11 0:05 ` Kernel Benchmarking Linus Torvalds
2020-09-11 0:49 ` Michael Larabel
2020-09-11 2:20 ` Linus Torvalds
[not found] ` <0cbc959e-1b8d-8d7e-1dc6-672cf5b3899a@MichaelLarabel.com>
2020-09-11 16:19 ` Linus Torvalds
2020-09-11 22:07 ` Linus Torvalds
2020-09-11 22:37 ` Michael Larabel
2020-09-12 7:28 ` Amir Goldstein
2020-09-12 10:32 ` Michael Larabel
2020-09-12 14:37 ` Matthew Wilcox
2020-09-12 14:44 ` Michael Larabel
2020-09-15 3:32 ` Matthew Wilcox
2020-09-15 10:39 ` Jan Kara
2020-09-15 13:52 ` Matthew Wilcox
[not found] ` <658ae026-32d9-0a25-5a59-9c510d6898d5@MichaelLarabel.com>
2020-09-14 17:47 ` Linus Torvalds
2020-09-14 20:21 ` Matthieu Baerts
2020-09-14 20:53 ` Linus Torvalds
2020-09-15 0:42 ` Linus Torvalds
2020-09-15 15:34 ` Matthieu Baerts
2020-09-15 18:27 ` Linus Torvalds
2020-09-15 18:47 ` Linus Torvalds
2020-09-15 19:26 ` Matthieu Baerts
2020-09-15 19:32 ` Linus Torvalds
2020-09-15 19:56 ` Matthieu Baerts
2020-09-15 23:35 ` Linus Torvalds
2020-09-16 10:34 ` Jan Kara
2020-09-16 18:47 ` Linus Torvalds
[not found] ` <9a92bf16-02c5-ba38-33c7-f350588ac874@tessares.net>
2020-09-15 19:24 ` Linus Torvalds
2020-09-15 19:38 ` Matthieu Baerts
2020-09-15 18:31 ` Linus Torvalds
2020-09-15 14:21 ` Michael Larabel
2020-09-15 17:52 ` Linus Torvalds
2020-09-17 17:51 ` Linus Torvalds
2020-09-17 18:23 ` Matthew Wilcox
2020-09-17 18:30 ` Linus Torvalds
2020-09-17 18:50 ` Matthew Wilcox
2020-09-17 19:00 ` Linus Torvalds
2020-09-17 19:27 ` Matthew Wilcox
2020-09-17 19:47 ` Linus Torvalds
2020-09-18 0:39 ` Sedat Dilek
2020-09-18 0:40 ` Sedat Dilek
2020-09-18 20:25 ` Sedat Dilek
2020-09-20 17:06 ` Linus Torvalds
2020-09-20 17:14 ` Sedat Dilek
2020-09-20 17:40 ` Linus Torvalds
2020-09-20 18:00 ` Sedat Dilek
2020-09-20 23:23 ` Dave Chinner
2020-09-20 23:31 ` Linus Torvalds
2020-09-20 23:40 ` Linus Torvalds
2020-09-21 1:20 ` Dave Chinner
2020-09-12 15:53 ` Matthew Wilcox
2020-09-12 17:59 ` Linus Torvalds
2020-09-12 20:32 ` Rogério Brito
2020-09-14 9:33 ` Jan Kara
2020-09-12 20:58 ` Josh Triplett
2020-09-12 20:59 ` James Bottomley
2020-09-12 21:15 ` Linus Torvalds
2020-09-12 22:32 ` Matthew Wilcox [this message]
2020-09-13 0:40 ` Dave Chinner
2020-09-13 2:39 ` Linus Torvalds
2020-09-13 3:40 ` Matthew Wilcox
2020-09-13 23:45 ` Dave Chinner
2020-09-14 3:31 ` Matthew Wilcox
2020-09-15 14:28 ` Chris Mason
2020-09-15 9:27 ` Jan Kara
2020-09-13 3:18 ` Matthew Wilcox
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200912223207.GD6583@casper.infradead.org \
--to=willy@infradead.org \
--cc=Michael@michaellarabel.com \
--cc=adilger.kernel@dilger.ca \
--cc=amir73il@gmail.com \
--cc=hughd@google.com \
--cc=jack@suse.cz \
--cc=linux-ext4@vger.kernel.org \
--cc=linux-fsdevel@vger.kernel.org \
--cc=torvalds@linux-foundation.org \
--cc=tytso@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.