From: Peter Zijlstra <peterz@infradead.org>
To: Mel Gorman <mgorman@techsingularity.net>
Cc: Linus Torvalds <torvalds@linux-foundation.org>,
Andrew Morton <akpm@linux-foundation.org>,
"Kirill A. Shutemov" <kirill.shutemov@linux.intel.com>,
Johannes Weiner <hannes@cmpxchg.org>, Jan Kara <jack@suse.cz>,
Rik van Riel <riel@redhat.com>, linux-mm <linux-mm@kvack.org>
Subject: Re: page_waitqueue() considered harmful
Date: Wed, 28 Sep 2016 13:11:15 +0200 [thread overview]
Message-ID: <20160928111115.GS5016@twins.programming.kicks-ass.net> (raw)
In-Reply-To: <20160928104500.GC3903@techsingularity.net>
On Wed, Sep 28, 2016 at 11:45:00AM +0100, Mel Gorman wrote:
> tldr: Other than 32-bit vs 64-bit, I could not find anything obviously wrong.
Thanks, meanwhile I redid the patch based on the feedback from Nick and
Linus.
I know I ignored the 32bit issue, I figured we'd first see if it helps
at all before mucking about with that.
In any case, I don't mind doing a PG_waiters and also using it for the
PG_writeback thing. Just wasn't what I was thinking of when doing that
patch.
Re the naming of __unlock_page(), its the slow path continuation of
unlock_page(). But I'm not too bothered if people want to change the
name.
So the below boots and builds a kernel and must be equally perfect, then
again x86 has no-op smp_mb__{before,after}_atomic() so the lack of
barriers will not affect anything much. PowerPC, ARM and MIPS (among
others) will want testing.
---
include/linux/page-flags.h | 2 ++
include/linux/pagemap.h | 25 +++++++++----
include/trace/events/mmflags.h | 1 +
mm/filemap.c | 80 +++++++++++++++++++++++++++++++++++++-----
4 files changed, 93 insertions(+), 15 deletions(-)
diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h
index 74e4dda..0ed3900 100644
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -73,6 +73,7 @@
*/
enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
+ PG_contended, /* Page lock is contended. */
PG_error,
PG_referenced,
PG_uptodate,
@@ -253,6 +254,7 @@ static inline int TestClearPage##uname(struct page *page) { return 0; }
TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
__PAGEFLAG(Locked, locked, PF_NO_TAIL)
+PAGEFLAG(Contended, contended, PF_NO_TAIL)
PAGEFLAG(Error, error, PF_NO_COMPOUND) TESTCLEARFLAG(Error, error, PF_NO_COMPOUND)
PAGEFLAG(Referenced, referenced, PF_HEAD)
TESTCLEARFLAG(Referenced, referenced, PF_HEAD)
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 66a1260..c8b8651 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -417,7 +417,7 @@ extern void __lock_page(struct page *page);
extern int __lock_page_killable(struct page *page);
extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags);
-extern void unlock_page(struct page *page);
+extern void __unlock_page(struct page *page);
static inline int trylock_page(struct page *page)
{
@@ -448,6 +448,20 @@ static inline int lock_page_killable(struct page *page)
return 0;
}
+static inline void unlock_page(struct page *page)
+{
+ page = compound_head(page);
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+ clear_bit_unlock(PG_locked, &page->flags);
+ /*
+ * Since PG_locked and PG_contended are in the same word, Program-Order
+ * ensures the load of PG_contended must not observe a value earlier
+ * than our clear_bit() store. See lock_page_wait().
+ */
+ if (PageContended(page))
+ __unlock_page(page);
+}
+
/*
* lock_page_or_retry - Lock the page, unless this would block and the
* caller indicated that it can handle a retry.
@@ -472,11 +486,11 @@ extern int wait_on_page_bit_killable(struct page *page, int bit_nr);
extern int wait_on_page_bit_killable_timeout(struct page *page,
int bit_nr, unsigned long timeout);
+extern int wait_on_page_lock(struct page *page, int mode);
+
static inline int wait_on_page_locked_killable(struct page *page)
{
- if (!PageLocked(page))
- return 0;
- return wait_on_page_bit_killable(compound_head(page), PG_locked);
+ return wait_on_page_lock(page, TASK_KILLABLE);
}
extern wait_queue_head_t *page_waitqueue(struct page *page);
@@ -494,8 +508,7 @@ static inline void wake_up_page(struct page *page, int bit)
*/
static inline void wait_on_page_locked(struct page *page)
{
- if (PageLocked(page))
- wait_on_page_bit(compound_head(page), PG_locked);
+ wait_on_page_lock(page, TASK_UNINTERRUPTIBLE);
}
/*
diff --git a/include/trace/events/mmflags.h b/include/trace/events/mmflags.h
index 5a81ab4..18b8398 100644
--- a/include/trace/events/mmflags.h
+++ b/include/trace/events/mmflags.h
@@ -81,6 +81,7 @@
#define __def_pageflag_names \
{1UL << PG_locked, "locked" }, \
+ {1UL << PG_contended, "contended" }, \
{1UL << PG_error, "error" }, \
{1UL << PG_referenced, "referenced" }, \
{1UL << PG_uptodate, "uptodate" }, \
diff --git a/mm/filemap.c b/mm/filemap.c
index 8a287df..79aab60 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -847,15 +847,17 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
* The mb is necessary to enforce ordering between the clear_bit and the read
* of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()).
*/
-void unlock_page(struct page *page)
+void __unlock_page(struct page *page)
{
- page = compound_head(page);
- VM_BUG_ON_PAGE(!PageLocked(page), page);
- clear_bit_unlock(PG_locked, &page->flags);
- smp_mb__after_atomic();
- wake_up_page(page, PG_locked);
+ struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(&page->flags, PG_locked);
+ wait_queue_head_t *wq = page_waitqueue(page);
+
+ if (waitqueue_active(wq))
+ __wake_up(wq, TASK_NORMAL, 1, &key);
+ else
+ ClearPageContended(page);
}
-EXPORT_SYMBOL(unlock_page);
+EXPORT_SYMBOL(__unlock_page);
/**
* end_page_writeback - end writeback against a page
@@ -908,6 +910,54 @@ void page_endio(struct page *page, bool is_write, int err)
}
EXPORT_SYMBOL_GPL(page_endio);
+static int lock_page_wait(struct wait_bit_key *word, int mode)
+{
+ struct page *page = container_of(word->flags, struct page, flags);
+
+ /*
+ * We cannot go sleep without having PG_contended set. This would mean
+ * nobody would issue a wakeup and we'd be stuck.
+ */
+ if (!PageContended(page)) {
+ SetPageContended(page);
+
+ /*
+ * There are two orderings of importance:
+ *
+ * 1)
+ *
+ * [unlock] [wait]
+ *
+ * clear PG_locked set PG_contended
+ * test PG_contended test (and-set) PG_locked
+ *
+ * Since these are on the same word, and the clear/set
+ * operation are atomic, they are ordered against one another.
+ * Program-Order further constraints a CPU from speculating the
+ * later load to not be earlier than the RmW. So this doesn't
+ * need an explicit barrier. Also see unlock_page().
+ *
+ * 2)
+ *
+ * [unlock] [wait]
+ *
+ * test PG_contended set PG_contended
+ * wakeup/remove add
+ * clear PG_contended trylock
+ * sleep
+ *
+ * In this case however, we must ensure PG_contended is set
+ * before we add ourselves to the list, such that unlock() must
+ * either see PG_contended or we see its clear.
+ */
+ smp_mb__after_atomic();
+
+ return 0;
+ }
+
+ return bit_wait_io(word, mode);
+}
+
/**
* __lock_page - get a lock on the page, assuming we need to sleep to get it
* @page: the page to lock
@@ -917,7 +967,7 @@ void __lock_page(struct page *page)
struct page *page_head = compound_head(page);
DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
- __wait_on_bit_lock(page_waitqueue(page_head), &wait, bit_wait_io,
+ __wait_on_bit_lock(page_waitqueue(page_head), &wait, lock_page_wait,
TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_page);
@@ -928,10 +978,22 @@ int __lock_page_killable(struct page *page)
DEFINE_WAIT_BIT(wait, &page_head->flags, PG_locked);
return __wait_on_bit_lock(page_waitqueue(page_head), &wait,
- bit_wait_io, TASK_KILLABLE);
+ lock_page_wait, TASK_KILLABLE);
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
+int wait_on_page_lock(struct page *page, int mode)
+{
+ struct page __always_unused *__page = (page = compound_head(page));
+ DEFINE_WAIT_BIT(wait, &page->flags, PG_locked);
+
+ if (!PageLocked(page))
+ return 0;
+
+ return __wait_on_bit(page_waitqueue(page), &wait, lock_page_wait, mode);
+}
+EXPORT_SYMBOL(wait_on_page_lock);
+
/*
* Return values:
* 1 - page is locked; mmap_sem is still held.
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2016-09-28 11:11 UTC|newest]
Thread overview: 42+ messages / expand[flat|nested] mbox.gz Atom feed top
2016-09-26 20:58 page_waitqueue() considered harmful Linus Torvalds
2016-09-26 21:23 ` Rik van Riel
2016-09-26 21:30 ` Linus Torvalds
2016-09-26 23:11 ` Kirill A. Shutemov
2016-09-27 1:01 ` Rik van Riel
2016-09-27 7:30 ` Peter Zijlstra
2016-09-27 8:54 ` Mel Gorman
2016-09-27 9:11 ` Kirill A. Shutemov
2016-09-27 9:42 ` Mel Gorman
2016-09-27 9:52 ` Minchan Kim
2016-09-27 12:11 ` Kirill A. Shutemov
2016-09-29 8:01 ` Peter Zijlstra
2016-09-29 12:55 ` Nicholas Piggin
2016-09-29 13:16 ` Peter Zijlstra
2016-09-29 13:54 ` Nicholas Piggin
2016-09-29 15:05 ` Rik van Riel
2016-09-27 8:03 ` Jan Kara
2016-09-27 8:31 ` Mel Gorman
2016-09-27 14:34 ` Peter Zijlstra
2016-09-27 15:08 ` Nicholas Piggin
2016-09-27 16:31 ` Linus Torvalds
2016-09-27 16:49 ` Peter Zijlstra
2016-09-28 10:45 ` Mel Gorman
2016-09-28 11:11 ` Peter Zijlstra [this message]
2016-09-28 16:10 ` Linus Torvalds
2016-09-29 13:08 ` Peter Zijlstra
2016-10-03 10:47 ` Mel Gorman
2016-09-27 14:53 ` Nicholas Piggin
2016-09-27 15:17 ` Nicholas Piggin
2016-09-27 16:52 ` Peter Zijlstra
2016-09-27 17:06 ` Nicholas Piggin
2016-09-28 7:05 ` Peter Zijlstra
2016-09-28 11:05 ` Paul E. McKenney
2016-09-28 11:16 ` Peter Zijlstra
2016-09-28 12:58 ` Paul E. McKenney
2016-09-29 1:31 ` Nicholas Piggin
2016-09-29 2:12 ` Paul E. McKenney
2016-09-29 6:21 ` Peter Zijlstra
2016-09-29 6:42 ` Nicholas Piggin
2016-09-29 7:14 ` Peter Zijlstra
2016-09-29 7:43 ` Peter Zijlstra
2016-09-28 7:40 ` Mel Gorman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20160928111115.GS5016@twins.programming.kicks-ass.net \
--to=peterz@infradead.org \
--cc=akpm@linux-foundation.org \
--cc=hannes@cmpxchg.org \
--cc=jack@suse.cz \
--cc=kirill.shutemov@linux.intel.com \
--cc=linux-mm@kvack.org \
--cc=mgorman@techsingularity.net \
--cc=riel@redhat.com \
--cc=torvalds@linux-foundation.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.