* [PATCH] futex: Fix ZERO_PAGE cause infinite loop
@ 2009-12-24 9:29 KOSAKI Motohiro
2009-12-24 9:39 ` Peter Zijlstra
0 siblings, 1 reply; 15+ messages in thread
From: KOSAKI Motohiro @ 2009-12-24 9:29 UTC (permalink / raw)
To: Hugh Dickins, KAMEZAWA Hiroyuki, Nick Piggin, Peter Zijlstra,
Ingo Molnar, LKML
Cc: kosaki.motohiro
This patch need both futex and zero-page developer's ack.
==========================================================
commit a13ea5b7 (mm: reinstate ZERO_PAGE) made the unfortunate regression.
following test program never finish and waste 100% cpu time.
At the making commit 38d47c1b7 (rely on get_user_pages() for shared
futexes). There isn't zero page in linux kernel. then, futex developers
thought gup retry is safe. but we reinstated zero page later...
This patch fixes it.
futex-zero.c
---------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <linux/futex.h>
#include <pthread.h>
int main(int argc, char **argv)
{
long page_size;
int ret;
void *buf;
page_size = sysconf(_SC_PAGESIZE);
buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
if (buf == (void *)-1) {
perror("mmap error.\n");
exit(1);
}
fprintf(stderr, "futex wait\n");
ret = syscall( SYS_futex, buf, FUTEX_WAIT, 1, NULL, NULL, NULL);
if (ret != 0 && errno != EWOULDBLOCK) {
perror("futex error.\n");
exit(1);
}
fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
return 0;
}
---------------------------------------------------------------------
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
---
include/linux/mm.h | 16 ++++++++++++++++
kernel/futex.c | 3 +++
mm/memory.c | 14 --------------
3 files changed, 19 insertions(+), 14 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2265f28..dd755ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -751,6 +751,22 @@ struct zap_details {
unsigned long truncate_count; /* Compare vm_truncate_count */
};
+#ifndef is_zero_pfn
+extern unsigned long zero_pfn;
+static inline int is_zero_pfn(unsigned long pfn)
+{
+ return pfn == zero_pfn;
+}
+#endif
+
+#ifndef my_zero_pfn
+extern unsigned long zero_pfn;
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+ return zero_pfn;
+}
+#endif
+
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
diff --git a/kernel/futex.c b/kernel/futex.c
index 8e3c3ff..79b89cc 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -254,6 +254,8 @@ again:
page = compound_head(page);
lock_page(page);
+ if (is_zero_pfn(page_to_pfn(page)))
+ goto anon_key;
if (!page->mapping) {
unlock_page(page);
put_page(page);
@@ -268,6 +270,7 @@ again:
* the object not the particular process.
*/
if (PageAnon(page)) {
+ anon_key:
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1b..3743fb5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -457,20 +457,6 @@ static inline int is_cow_mapping(unsigned int flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
-#ifndef is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
- return pfn == zero_pfn;
-}
-#endif
-
-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
- return zero_pfn;
-}
-#endif
-
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*
--
1.6.5.2
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH] futex: Fix ZERO_PAGE cause infinite loop
2009-12-24 9:29 [PATCH] futex: Fix ZERO_PAGE cause infinite loop KOSAKI Motohiro
@ 2009-12-24 9:39 ` Peter Zijlstra
2009-12-24 17:15 ` Ulrich Drepper
2009-12-25 1:51 ` KOSAKI Motohiro
0 siblings, 2 replies; 15+ messages in thread
From: Peter Zijlstra @ 2009-12-24 9:39 UTC (permalink / raw)
To: KOSAKI Motohiro
Cc: Hugh Dickins, KAMEZAWA Hiroyuki, Nick Piggin, Ingo Molnar, LKML,
Thomas Gleixner, Darren Hart
Added tglx and dvhart to the CC.
On Thu, 2009-12-24 at 18:29 +0900, KOSAKI Motohiro wrote:
> This patch need both futex and zero-page developer's ack.
> ==========================================================
>
> commit a13ea5b7 (mm: reinstate ZERO_PAGE) made the unfortunate regression.
> following test program never finish and waste 100% cpu time.
>
> At the making commit 38d47c1b7 (rely on get_user_pages() for shared
> futexes). There isn't zero page in linux kernel. then, futex developers
> thought gup retry is safe. but we reinstated zero page later...
>
> This patch fixes it.
>
> futex-zero.c
> ---------------------------------------------------------------------
> #include <stdio.h>
> #include <stdlib.h>
> #include <sys/mman.h>
> #include <syscall.h>
> #include <sys/time.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <errno.h>
> #include <linux/futex.h>
> #include <pthread.h>
>
> int main(int argc, char **argv)
> {
> long page_size;
> int ret;
> void *buf;
>
> page_size = sysconf(_SC_PAGESIZE);
>
> buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> if (buf == (void *)-1) {
> perror("mmap error.\n");
> exit(1);
> }
>
> fprintf(stderr, "futex wait\n");
> ret = syscall( SYS_futex, buf, FUTEX_WAIT, 1, NULL, NULL, NULL);
> if (ret != 0 && errno != EWOULDBLOCK) {
> perror("futex error.\n");
> exit(1);
> }
> fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
>
> return 0;
> }
> ---------------------------------------------------------------------
>
> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
> Cc: Nick Piggin <npiggin@suse.de>
> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
> Cc: Ingo Molnar <mingo@elte.hu>
> ---
> include/linux/mm.h | 16 ++++++++++++++++
> kernel/futex.c | 3 +++
> mm/memory.c | 14 --------------
> 3 files changed, 19 insertions(+), 14 deletions(-)
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 8e3c3ff..79b89cc 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -254,6 +254,8 @@ again:
>
> page = compound_head(page);
> lock_page(page);
> + if (is_zero_pfn(page_to_pfn(page)))
> + goto anon_key;
> if (!page->mapping) {
> unlock_page(page);
> put_page(page);
> @@ -268,6 +270,7 @@ again:
> * the object not the particular process.
> */
> if (PageAnon(page)) {
> + anon_key:
> key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
> key->private.mm = mm;
> key->private.address = address;
Doesn't it make more sense to simply fail the futex op?
What I mean is, all (?) futex ops assume userspace actually wrote
something to the address in question to begin with, therefore it can
never be the zero page.
So it being the zero page means someone goofed up, and we should bail,
no?
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] futex: Fix ZERO_PAGE cause infinite loop
2009-12-24 9:39 ` Peter Zijlstra
@ 2009-12-24 17:15 ` Ulrich Drepper
2009-12-25 1:51 ` KOSAKI Motohiro
1 sibling, 0 replies; 15+ messages in thread
From: Ulrich Drepper @ 2009-12-24 17:15 UTC (permalink / raw)
To: Peter Zijlstra
Cc: KOSAKI Motohiro, Hugh Dickins, KAMEZAWA Hiroyuki, Nick Piggin,
Ingo Molnar, LKML, Thomas Gleixner, Darren Hart
On Thu, Dec 24, 2009 at 01:39, Peter Zijlstra <peterz@infradead.org> wrote:
> What I mean is, all (?) futex ops assume userspace actually wrote
> something to the address in question to begin with, therefore it can
> never be the zero page.
In practice today this is likely always the case. But it doesn't have
to be like that. In theory a zero-initialized futex is a reasonable
object on which to wait. You should not preclude this from happening
just because it's not done like this today.
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] futex: Fix ZERO_PAGE cause infinite loop
2009-12-24 9:39 ` Peter Zijlstra
2009-12-24 17:15 ` Ulrich Drepper
@ 2009-12-25 1:51 ` KOSAKI Motohiro
2009-12-30 16:03 ` Hugh Dickins
2010-01-05 18:04 ` [PATCH] futex: Fix ZERO_PAGE cause infinite loop Darren Hart
1 sibling, 2 replies; 15+ messages in thread
From: KOSAKI Motohiro @ 2009-12-25 1:51 UTC (permalink / raw)
To: Peter Zijlstra
Cc: kosaki.motohiro, Hugh Dickins, KAMEZAWA Hiroyuki, Nick Piggin,
Ingo Molnar, LKML, Thomas Gleixner, Darren Hart, Ulrich Drepper
> > diff --git a/kernel/futex.c b/kernel/futex.c
> > index 8e3c3ff..79b89cc 100644
> > --- a/kernel/futex.c
> > +++ b/kernel/futex.c
> > @@ -254,6 +254,8 @@ again:
> >
> > page = compound_head(page);
> > lock_page(page);
> > + if (is_zero_pfn(page_to_pfn(page)))
> > + goto anon_key;
> > if (!page->mapping) {
> > unlock_page(page);
> > put_page(page);
> > @@ -268,6 +270,7 @@ again:
> > * the object not the particular process.
> > */
> > if (PageAnon(page)) {
> > + anon_key:
> > key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
> > key->private.mm = mm;
> > key->private.address = address;
>
> Doesn't it make more sense to simply fail the futex op?
>
> What I mean is, all (?) futex ops assume userspace actually wrote
> something to the address in question to begin with, therefore it can
> never be the zero page.
>
> So it being the zero page means someone goofed up, and we should bail,
> no?
Hm. probably we need to discuss more.
Firstly, if we assume current glibc implimentation, you are right,
we can assume userland always initialize the page explicitly before using
futex. then we never seen zero page in futex.
but, I don't think futex itself assume it now. at least man page
doesn't describe such limilation. so, if you prefer bail and man fix,
I'm acceptable maybe.
I don't know any library except glibc use futex directly, or not.
but we don't have any reason to assume futex is used only glibc.
(plus, glibc might change its implementation in future release, perhaps)
following testcase is more practice than last mail's one.
if we add zero page bail logic, this testcase mihgt be fail.
----------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <linux/futex.h>
#include <pthread.h>
void * futex_wait(void *arg)
{
void *addr = arg;
int ret;
fprintf(stderr, "futex wait\n");
ret = syscall( SYS_futex, addr, FUTEX_WAIT, 0, NULL, NULL, NULL);
if (ret != 0 && errno != EWOULDBLOCK) {
perror("futex error.\n");
exit(1);
}
fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
return NULL;
}
int main(int argc, char **argv)
{
long page_size;
pthread_t thr;
int ret;
void *buf;
page_size = sysconf(_SC_PAGESIZE);
buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
if (buf == (void *)-1) {
perror("mmap error.\n");
exit(1);
}
ret = pthread_create(&thr, NULL, futex_wait, buf);
if (ret < 0) {
fprintf(stderr, "pthread_create error\n");
exit(1);
}
sleep(10);
fprintf(stderr, "futex wake\n");
*((int*)buf) = 1;
ret = syscall( SYS_futex, buf, FUTEX_WAKE, 1, NULL, NULL, NULL);
fprintf(stderr, "futex_wake: ret = %d, errno = %d\n", ret, errno);
fprintf(stderr, "join\n");
pthread_join(thr, NULL);
return 0;
}
--------------------------------------------------------------------------
man page fix has another difficulty. in past days, zero pages was perfectly
transparent from userland. then any man page don't describe "what's zero page".
then some administrator don't know about zero page at all.
So, if your main disliked point is goto statement, following patch is ok
to me.
Finally, I agree this is really corner case issue. I can agree man page fix
too. but I hope to know which point you dislike in my patch.
Thanks.
>From 09b0a7426c707e2fab5ac8e1ef8feec14415ee62 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Thu, 24 Dec 2009 18:07:42 +0900
Subject: [PATCH] futex: Fix ZERO_PAGE cause infinite loop
commit a13ea5b7 (mm: reinstate ZERO_PAGE) made the unfortunate regression.
following test program never finish and waste 100% cpu time.
At the making commit 38d47c1b7 (rely on get_user_pages() for shared
futexes). There isn't zero page in linux kernel. then, futex developers
thought gup retry is safe. but we reinstated zero page later...
This patch fixes it.
futex-zero.c
---------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <linux/futex.h>
#include <pthread.h>
int main(int argc, char **argv)
{
long page_size;
int ret;
void *buf;
page_size = sysconf(_SC_PAGESIZE);
buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
if (buf == (void *)-1) {
perror("mmap error.\n");
exit(1);
}
fprintf(stderr, "futex wait\n");
ret = syscall( SYS_futex, buf, FUTEX_WAIT, 1, NULL, NULL, NULL);
if (ret != 0 && errno != EWOULDBLOCK) {
perror("futex error.\n");
exit(1);
}
fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
return 0;
}
---------------------------------------------------------------------
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Ingo Molnar <mingo@elte.hu>
---
include/linux/mm.h | 16 ++++++++++++++++
kernel/futex.c | 6 ++++--
mm/memory.c | 14 --------------
3 files changed, 20 insertions(+), 16 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 2265f28..dd755ea 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -751,6 +751,22 @@ struct zap_details {
unsigned long truncate_count; /* Compare vm_truncate_count */
};
+#ifndef is_zero_pfn
+extern unsigned long zero_pfn;
+static inline int is_zero_pfn(unsigned long pfn)
+{
+ return pfn == zero_pfn;
+}
+#endif
+
+#ifndef my_zero_pfn
+extern unsigned long zero_pfn;
+static inline unsigned long my_zero_pfn(unsigned long addr)
+{
+ return zero_pfn;
+}
+#endif
+
struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
diff --git a/kernel/futex.c b/kernel/futex.c
index 8e3c3ff..ad72989 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -222,6 +222,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
struct mm_struct *mm = current->mm;
struct page *page;
int err;
+ int is_zero_page;
/*
* The futex address must be "naturally" aligned.
@@ -253,8 +254,9 @@ again:
return err;
page = compound_head(page);
+ is_zero_page = is_zero_pfn(page_to_pfn(page));
lock_page(page);
- if (!page->mapping) {
+ if (!is_zero_page && !page->mapping) {
unlock_page(page);
put_page(page);
goto again;
@@ -267,7 +269,7 @@ again:
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page)) {
+ if (is_zero_page || PageAnon(page)) {
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
diff --git a/mm/memory.c b/mm/memory.c
index 09e4b1b..3743fb5 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -457,20 +457,6 @@ static inline int is_cow_mapping(unsigned int flags)
return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
}
-#ifndef is_zero_pfn
-static inline int is_zero_pfn(unsigned long pfn)
-{
- return pfn == zero_pfn;
-}
-#endif
-
-#ifndef my_zero_pfn
-static inline unsigned long my_zero_pfn(unsigned long addr)
-{
- return zero_pfn;
-}
-#endif
-
/*
* vm_normal_page -- This function gets the "struct page" associated with a pte.
*
--
1.6.5.2
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH] futex: Fix ZERO_PAGE cause infinite loop
2009-12-25 1:51 ` KOSAKI Motohiro
@ 2009-12-30 16:03 ` Hugh Dickins
2010-01-05 7:32 ` [PATCH v2] futex: remove rw parameter from get_futex_key() KOSAKI Motohiro
2010-01-07 6:32 ` [PATCH] mips,mm: Reinstate move_pte optimization KOSAKI Motohiro
2010-01-05 18:04 ` [PATCH] futex: Fix ZERO_PAGE cause infinite loop Darren Hart
1 sibling, 2 replies; 15+ messages in thread
From: Hugh Dickins @ 2009-12-30 16:03 UTC (permalink / raw)
To: KOSAKI Motohiro
Cc: Peter Zijlstra, KAMEZAWA Hiroyuki, Nick Piggin, Ingo Molnar, LKML,
Thomas Gleixner, Darren Hart, Ulrich Drepper
This was a good find, thanks a lot for discovering it.
I'm afraid I'd quite forgotten all the mm-implementation-dependence
that has accumulated (for good performance reasons) in kernel/futex.c.
(Luckily, given that KSM pages are also PageAnon, it will happen to
be working correctly on them: that's not your concern, but something
your report has reminded me to consider.)
On Fri, 25 Dec 2009, KOSAKI Motohiro wrote:
> > > diff --git a/kernel/futex.c b/kernel/futex.c
> > > index 8e3c3ff..79b89cc 100644
> > > --- a/kernel/futex.c
> > > +++ b/kernel/futex.c
> > > @@ -254,6 +254,8 @@ again:
> > >
> > > page = compound_head(page);
> > > lock_page(page);
> > > + if (is_zero_pfn(page_to_pfn(page)))
> > > + goto anon_key;
If something like this or your replacment does go forward,
then I think that test is better inside the "if (!page->mapping)"
below. Admittedly that adds even more mm-dependence here (relying
on a zero page to have NULL page->mapping); but isn't page_to_pfn()
one of those functions which is trivial on many configs but expensive
on some? Better call it only in the rare case that it's needed.
Though wouldn't it be even better not to use is_zero_pfn() at all?
That was convenient in mm/memory.c because it had the pfn or pte right
at hand, but here a traditional (page == ZERO_PAGE(address)) would be
more efficient.
Which would save having to move is_zero_pfn() from mm/memory.c
to include/linux/mm.h - I'd prefer to keep it private if we can.
But for completeness, this would involve resurrecting the 2.6.19
MIPS move_pte(), which makes sure mremap() move doesn't interfere
with our assumptions. Something like
#define __HAVE_ARCH_MOVE_PTE
pte_t move_pte(pte_t pte, pgprot_t prot, unsigned long old_addr,
unsigned long new_addr)
{
if (pte_present(pte) && is_zero_pfn(pte_pfn(pte)))
pte = mk_pte(ZERO_PAGE(new_addr), prot);
return pte;
}
in arch/mips/include/asm/pgtable.h.
> > > if (!page->mapping) {
> > > unlock_page(page);
> > > put_page(page);
> > > @@ -268,6 +270,7 @@ again:
> > > * the object not the particular process.
> > > */
> > > if (PageAnon(page)) {
> > > + anon_key:
> > > key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
> > > key->private.mm = mm;
> > > key->private.address = address;
> >
> > Doesn't it make more sense to simply fail the futex op?
If we were to decide to fail the futex op on an unmodified ZERO_PAGE,
couldn't we simply delete the "again" label and just say
if (!page->mapping) {
/* Truncated file page, or a ZERO_PAGE */
unlock_page(page);
put_page(page);
return -EFAULT;
}
I don't know what the retry on that case was expected to achieve.
> >
> > What I mean is, all (?) futex ops assume userspace actually wrote
> > something to the address in question to begin with, therefore it can
> > never be the zero page.
> >
> > So it being the zero page means someone goofed up, and we should bail,
> > no?
I share Peter's wish to avoid cluttering this up with more special
case testing, but also your and Ulrich's wish to keep generality.
>
> Hm. probably we need to discuss more.
>
> Firstly, if we assume current glibc implimentation, you are right,
> we can assume userland always initialize the page explicitly before using
> futex. then we never seen zero page in futex.
>
> but, I don't think futex itself assume it now. at least man page
> doesn't describe such limilation. so, if you prefer bail and man fix,
> I'm acceptable maybe.
Here's another worry with the current futex implementation,
which might help me to decide which way to jump on the ZERO_PAGE.
Originally, a futex on a MAP_PRIVATE (!VM_MAYSHARE) area was necessarily
FUT_OFF_MMSHARED. Nowadays, with the get_user_pages_fast implementation,
we have no idea whether the vma is VM_MAYSHARE or not. So if a futex is
placed in a private area backed by a file, then it could be regarded as
FUT_OFF_INODE at futex_wait() time, but FUT_OFF_MMSHARED at futex_wake()
time.
Perhaps that's no problem at all, it's a long time since I was involved
with futexes, I think you and Peter will grasp the consequences quicker
than I shall.
But it seems no more improbable than the ZERO_PAGE case: some app
might place its futexes in the .data section of the executable,
which is a private mapping of the executable file.
If this case is also an issue, then perhaps we just need to update
the man page to say that whatever is responsible for initializing a
futex does need to write to it (or the page it's in) before it's used,
otherwise behaviour is undefined. (But we should then use the -EFAULT
patch above, we'd all prefer an error to busylooping.)
>
> I don't know any library except glibc use futex directly, or not.
> but we don't have any reason to assume futex is used only glibc.
> (plus, glibc might change its implementation in future release, perhaps)
I expect there are other implementations; but apparently none
that have yet reported this ZERO_PAGE behaviour as a problem.
...
>
> man page fix has another difficulty. in past days, zero pages was perfectly
> transparent from userland. then any man page don't describe "what's zero page".
> then some administrator don't know about zero page at all.
>
> So, if your main disliked point is goto statement, following patch is ok
> to me.
Although I'm as indecisive as Hamlet,
"goto or no goto, that is NOT the question":
I don't care, I think the goto in your original was fine.
>
> Finally, I agree this is really corner case issue. I can agree man page fix
> too. but I hope to know which point you dislike in my patch.
I just prefer not to have to add extra tests for the ZERO_PAGE
if we can get away without them.
Hugh
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH v2] futex: remove rw parameter from get_futex_key()
2009-12-30 16:03 ` Hugh Dickins
@ 2010-01-05 7:32 ` KOSAKI Motohiro
2010-01-05 11:21 ` Hugh Dickins
` (2 more replies)
2010-01-07 6:32 ` [PATCH] mips,mm: Reinstate move_pte optimization KOSAKI Motohiro
1 sibling, 3 replies; 15+ messages in thread
From: KOSAKI Motohiro @ 2010-01-05 7:32 UTC (permalink / raw)
To: Hugh Dickins
Cc: kosaki.motohiro, Peter Zijlstra, KAMEZAWA Hiroyuki, Nick Piggin,
Ingo Molnar, LKML, Thomas Gleixner, Darren Hart, Ulrich Drepper
Hi
sorry very delayed responce, I've review futex code again.
> > Hm. probably we need to discuss more.
> >
> > Firstly, if we assume current glibc implimentation, you are right,
> > we can assume userland always initialize the page explicitly before using
> > futex. then we never seen zero page in futex.
> >
> > but, I don't think futex itself assume it now. at least man page
> > doesn't describe such limilation. so, if you prefer bail and man fix,
> > I'm acceptable maybe.
>
> Here's another worry with the current futex implementation,
> which might help me to decide which way to jump on the ZERO_PAGE.
>
> Originally, a futex on a MAP_PRIVATE (!VM_MAYSHARE) area was necessarily
> FUT_OFF_MMSHARED. Nowadays, with the get_user_pages_fast implementation,
> we have no idea whether the vma is VM_MAYSHARE or not. So if a futex is
> placed in a private area backed by a file, then it could be regarded as
> FUT_OFF_INODE at futex_wait() time, but FUT_OFF_MMSHARED at futex_wake()
> time.
very true!
> Perhaps that's no problem at all, it's a long time since I was involved
> with futexes, I think you and Peter will grasp the consequences quicker
> than I shall.
>
> But it seems no more improbable than the ZERO_PAGE case: some app
> might place its futexes in the .data section of the executable,
> which is a private mapping of the executable file.
>
> If this case is also an issue, then perhaps we just need to update
> the man page to say that whatever is responsible for initializing a
> futex does need to write to it (or the page it's in) before it's used,
> otherwise behaviour is undefined. (But we should then use the -EFAULT
> patch above, we'd all prefer an error to busylooping.)
I have a question. Why can't we use write mode get_user_pages_fast()?
I mean glibc always mekes write access before calling futex. it mean
write mode get_user_pages() doesn't mekes cow on practical usage.
Following patch is implemented such policy. What do you think?
>From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 5 Jan 2010 11:33:00 +0900
Subject: [PATCH] futex: remove rw parameter from get_futex_key()
Currently, futex have two problem.
A) current futex doesn't handle private file mappings properly.
get_futex_key() use PageAnon() to distinguish file and anon. it can
makes following bad scenario.
1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
sleep on file mapping object.
2) thread-B write a variable and it makes cow.
3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
sleeped thread on the anonymous page. (but it's nothing)
following testcase reproduce this issue.
actual result)
FUTEX_WAIT thread never wake up.
expect result)
FUTEX_WAIT thread wake up by FUTEX_WAKE.
--------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <syscall.h>
#include <unistd.h>
#include <errno.h>
#include <linux/futex.h>
#include <pthread.h>
char pad[4096] = {1};
int val = 1;
char pad2[4096] = {1};
void * futex_wait(void *arg)
{
int ret;
fprintf(stderr, "futex wait\n");
ret = syscall( SYS_futex, &val, FUTEX_WAIT, 1, NULL, NULL, NULL);
if (ret != 0 && errno != EWOULDBLOCK) {
perror("futex error.\n");
exit(1);
}
fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
return NULL;
}
int main(int argc, char **argv)
{
pthread_t thr;
int ret;
ret = pthread_create(&thr, NULL, futex_wait, NULL);
if (ret < 0) {
fprintf(stderr, "pthread_create error\n");
exit(1);
}
sleep(10);
fprintf(stderr, "futex wake\n");
val = 2;
ret = syscall( SYS_futex, &val, FUTEX_WAKE, 1, NULL, NULL, NULL);
fprintf(stderr, "futex_wake: ret = %d, errno = %d\n", ret, errno);
fprintf(stderr, "join\n");
pthread_join(thr, NULL);
return 0;
}
--------------------------------------------------------------------
B) current futex doesn't handle zero page properly.
read mode get_user_pages() can return zero page. but current futex code doesn't
handle it at all. Then, zero page makes infinite loop internally.
following testcase can reproduce this issue.
actual result)
FUTEX_WAIT never return and waste cpu time 100%.
expected result)
FUTEX_WAIT return immediately.
------------------------------------------------------------------
#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <syscall.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <errno.h>
#include <linux/futex.h>
#include <pthread.h>
int main(int argc, char **argv)
{
long page_size;
int ret;
void *buf;
page_size = sysconf(_SC_PAGESIZE);
buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
if (buf == (void *)-1) {
perror("mmap error.\n");
exit(1);
}
fprintf(stderr, "futex wait\n");
ret = syscall( SYS_futex, buf, FUTEX_WAIT, 1, NULL, NULL, NULL);
if (ret != 0 && errno != EWOULDBLOCK) {
perror("futex error.\n");
exit(1);
}
fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
return 0;
}
------------------------------------------------------------------
The solution is to use write mode get_user_page() always for page lookup.
It prevent to lookup both file page of private mappings and zero page.
performance concern:
Probaly very little. because glibc always initialize variables for futex
before to call futex(). It mean glibc user never seen the overhead of this
patch.
compatibility concern:
This patch have few compatibility break. After this patch, FUTEX_WAIT require
writable access to futex variables (read-only mappings makes EFAULT).
But practically it's no problem. again glibc always initalize variables for futex
explicitly. nobody use read-only mappings.
Reported-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
kernel/futex.c | 27 ++++++++++++---------------
1 files changed, 12 insertions(+), 15 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 8e3c3ff..d9b3a22 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
- * @rw: mapping needs to be read/write (values: VERIFY_READ,
- * VERIFY_WRITE)
*
* Returns a negative error code or 0
* The key words are stored in *key on success.
@@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
* lock_page() might sleep, the caller should not hold a spinlock.
*/
static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+ if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
return -EFAULT;
key->private.mm = mm;
key->private.address = address;
@@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
}
again:
- err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
+ err = get_user_pages_fast(address, 1, 1, &page);
if (err < 0)
return err;
@@ -867,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -913,10 +911,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
int ret, op_ret;
retry:
- ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1175,11 +1173,10 @@ retry:
pi_state = NULL;
}
- ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2,
- requeue_pi ? VERIFY_WRITE : VERIFY_READ);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1738,7 +1735,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
*/
retry:
q->key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+ ret = get_futex_key(uaddr, fshared, &q->key);
if (unlikely(ret != 0))
return ret;
@@ -1904,7 +1901,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
q.requeue_pi_key = NULL;
retry:
q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out;
@@ -2023,7 +2020,7 @@ retry:
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
- ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -2215,7 +2212,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
rt_waiter.task = NULL;
key2 = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
--
1.6.5.2
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2] futex: remove rw parameter from get_futex_key()
2010-01-05 7:32 ` [PATCH v2] futex: remove rw parameter from get_futex_key() KOSAKI Motohiro
@ 2010-01-05 11:21 ` Hugh Dickins
2010-01-05 20:41 ` Darren Hart
2010-01-06 23:29 ` Darren Hart
2010-01-13 10:30 ` [tip:core/urgent] futexes: Remove " tip-bot for KOSAKI Motohiro
2 siblings, 1 reply; 15+ messages in thread
From: Hugh Dickins @ 2010-01-05 11:21 UTC (permalink / raw)
To: KOSAKI Motohiro
Cc: Peter Zijlstra, KAMEZAWA Hiroyuki, Nick Piggin, Ingo Molnar, LKML,
Thomas Gleixner, Darren Hart, Ulrich Drepper
On Tue, 5 Jan 2010, KOSAKI Motohiro wrote:
> > > Hm. probably we need to discuss more.
> > >
> > > Firstly, if we assume current glibc implimentation, you are right,
> > > we can assume userland always initialize the page explicitly before using
> > > futex. then we never seen zero page in futex.
> > >
> > > but, I don't think futex itself assume it now. at least man page
> > > doesn't describe such limilation. so, if you prefer bail and man fix,
> > > I'm acceptable maybe.
> >
> > Here's another worry with the current futex implementation,
> > which might help me to decide which way to jump on the ZERO_PAGE.
> >
> > Originally, a futex on a MAP_PRIVATE (!VM_MAYSHARE) area was necessarily
> > FUT_OFF_MMSHARED. Nowadays, with the get_user_pages_fast implementation,
> > we have no idea whether the vma is VM_MAYSHARE or not. So if a futex is
> > placed in a private area backed by a file, then it could be regarded as
> > FUT_OFF_INODE at futex_wait() time, but FUT_OFF_MMSHARED at futex_wake()
> > time.
>
> very true!
>
>
> > Perhaps that's no problem at all, it's a long time since I was involved
> > with futexes, I think you and Peter will grasp the consequences quicker
> > than I shall.
> >
> > But it seems no more improbable than the ZERO_PAGE case: some app
> > might place its futexes in the .data section of the executable,
> > which is a private mapping of the executable file.
> >
> > If this case is also an issue, then perhaps we just need to update
> > the man page to say that whatever is responsible for initializing a
> > futex does need to write to it (or the page it's in) before it's used,
> > otherwise behaviour is undefined. (But we should then use the -EFAULT
> > patch above, we'd all prefer an error to busylooping.)
>
> I have a question. Why can't we use write mode get_user_pages_fast()?
> I mean glibc always mekes write access before calling futex. it mean
> write mode get_user_pages() doesn't mekes cow on practical usage.
>
> Following patch is implemented such policy. What do you think?
It crossed my mind to do it that way too - honest!
I pushed it away thinking that some app may sometimes be using
mprotect PROT_READ or PROT_NONE over these areas, for one reason
or another - perhaps debugging or self-monitoring.
But I've no experience of futex use at all: if futexperts think it's
reasonable always to get_futex_key() for writing, then that is much
the neatest way to deal with both zero page and private file cases.
Over to the experts...
Hugh
>
> From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
> From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> Date: Tue, 5 Jan 2010 11:33:00 +0900
> Subject: [PATCH] futex: remove rw parameter from get_futex_key()
>
> Currently, futex have two problem.
>
> A) current futex doesn't handle private file mappings properly.
>
> get_futex_key() use PageAnon() to distinguish file and anon. it can
> makes following bad scenario.
>
> 1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
> sleep on file mapping object.
> 2) thread-B write a variable and it makes cow.
> 3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
> sleeped thread on the anonymous page. (but it's nothing)
>
> following testcase reproduce this issue.
>
> actual result)
> FUTEX_WAIT thread never wake up.
>
> expect result)
> FUTEX_WAIT thread wake up by FUTEX_WAKE.
>
> --------------------------------------------------------------------
> #include <stdio.h>
> #include <stdlib.h>
> #include <syscall.h>
> #include <unistd.h>
> #include <errno.h>
> #include <linux/futex.h>
> #include <pthread.h>
>
> char pad[4096] = {1};
> int val = 1;
> char pad2[4096] = {1};
>
> void * futex_wait(void *arg)
> {
> int ret;
>
> fprintf(stderr, "futex wait\n");
> ret = syscall( SYS_futex, &val, FUTEX_WAIT, 1, NULL, NULL, NULL);
> if (ret != 0 && errno != EWOULDBLOCK) {
> perror("futex error.\n");
> exit(1);
> }
> fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
>
> return NULL;
> }
>
> int main(int argc, char **argv)
> {
> pthread_t thr;
> int ret;
>
> ret = pthread_create(&thr, NULL, futex_wait, NULL);
> if (ret < 0) {
> fprintf(stderr, "pthread_create error\n");
> exit(1);
> }
>
> sleep(10);
>
> fprintf(stderr, "futex wake\n");
> val = 2;
> ret = syscall( SYS_futex, &val, FUTEX_WAKE, 1, NULL, NULL, NULL);
> fprintf(stderr, "futex_wake: ret = %d, errno = %d\n", ret, errno);
>
> fprintf(stderr, "join\n");
> pthread_join(thr, NULL);
>
> return 0;
> }
> --------------------------------------------------------------------
>
> B) current futex doesn't handle zero page properly.
>
> read mode get_user_pages() can return zero page. but current futex code doesn't
> handle it at all. Then, zero page makes infinite loop internally.
>
> following testcase can reproduce this issue.
>
> actual result)
> FUTEX_WAIT never return and waste cpu time 100%.
>
> expected result)
> FUTEX_WAIT return immediately.
>
> ------------------------------------------------------------------
> #include <stdio.h>
> #include <stdlib.h>
> #include <sys/mman.h>
> #include <syscall.h>
> #include <sys/time.h>
> #include <sys/types.h>
> #include <sys/stat.h>
> #include <fcntl.h>
> #include <unistd.h>
> #include <errno.h>
> #include <linux/futex.h>
> #include <pthread.h>
>
> int main(int argc, char **argv)
> {
> long page_size;
> int ret;
> void *buf;
>
> page_size = sysconf(_SC_PAGESIZE);
>
> buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
> if (buf == (void *)-1) {
> perror("mmap error.\n");
> exit(1);
> }
>
> fprintf(stderr, "futex wait\n");
> ret = syscall( SYS_futex, buf, FUTEX_WAIT, 1, NULL, NULL, NULL);
> if (ret != 0 && errno != EWOULDBLOCK) {
> perror("futex error.\n");
> exit(1);
> }
> fprintf(stderr, "futex_wait: ret = %d, errno = %d\n", ret, errno);
>
> return 0;
> }
> ------------------------------------------------------------------
>
> The solution is to use write mode get_user_page() always for page lookup.
> It prevent to lookup both file page of private mappings and zero page.
>
> performance concern:
>
> Probaly very little. because glibc always initialize variables for futex
> before to call futex(). It mean glibc user never seen the overhead of this
> patch.
>
> compatibility concern:
>
> This patch have few compatibility break. After this patch, FUTEX_WAIT require
> writable access to futex variables (read-only mappings makes EFAULT).
> But practically it's no problem. again glibc always initalize variables for futex
> explicitly. nobody use read-only mappings.
>
> Reported-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> ---
> kernel/futex.c | 27 ++++++++++++---------------
> 1 files changed, 12 insertions(+), 15 deletions(-)
>
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 8e3c3ff..d9b3a22 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
> * @uaddr: virtual address of the futex
> * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
> * @key: address where result is stored.
> - * @rw: mapping needs to be read/write (values: VERIFY_READ,
> - * VERIFY_WRITE)
> *
> * Returns a negative error code or 0
> * The key words are stored in *key on success.
> @@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
> * lock_page() might sleep, the caller should not hold a spinlock.
> */
> static int
> -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
> {
> unsigned long address = (unsigned long)uaddr;
> struct mm_struct *mm = current->mm;
> @@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> * but access_ok() should be faster than find_vma()
> */
> if (!fshared) {
> - if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
> + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
> return -EFAULT;
> key->private.mm = mm;
> key->private.address = address;
> @@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> }
>
> again:
> - err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
> + err = get_user_pages_fast(address, 1, 1, &page);
> if (err < 0)
> return err;
>
> @@ -867,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
> if (!bitset)
> return -EINVAL;
>
> - ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
> + ret = get_futex_key(uaddr, fshared, &key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -913,10 +911,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
> int ret, op_ret;
>
> retry:
> - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
> + ret = get_futex_key(uaddr1, fshared, &key1);
> if (unlikely(ret != 0))
> goto out;
> - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out_put_key1;
>
> @@ -1175,11 +1173,10 @@ retry:
> pi_state = NULL;
> }
>
> - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
> + ret = get_futex_key(uaddr1, fshared, &key1);
> if (unlikely(ret != 0))
> goto out;
> - ret = get_futex_key(uaddr2, fshared, &key2,
> - requeue_pi ? VERIFY_WRITE : VERIFY_READ);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out_put_key1;
>
> @@ -1738,7 +1735,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
> */
> retry:
> q->key = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
> + ret = get_futex_key(uaddr, fshared, &q->key);
> if (unlikely(ret != 0))
> return ret;
>
> @@ -1904,7 +1901,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
> q.requeue_pi_key = NULL;
> retry:
> q.key = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
> + ret = get_futex_key(uaddr, fshared, &q.key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -2023,7 +2020,7 @@ retry:
> if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
> return -EPERM;
>
> - ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
> + ret = get_futex_key(uaddr, fshared, &key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -2215,7 +2212,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
> rt_waiter.task = NULL;
>
> key2 = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out;
>
> --
> 1.6.5.2
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH] futex: Fix ZERO_PAGE cause infinite loop
2009-12-25 1:51 ` KOSAKI Motohiro
2009-12-30 16:03 ` Hugh Dickins
@ 2010-01-05 18:04 ` Darren Hart
1 sibling, 0 replies; 15+ messages in thread
From: Darren Hart @ 2010-01-05 18:04 UTC (permalink / raw)
To: KOSAKI Motohiro
Cc: Peter Zijlstra, Hugh Dickins, KAMEZAWA Hiroyuki, Nick Piggin,
Ingo Molnar, LKML, Thomas Gleixner, Ulrich Drepper
KOSAKI Motohiro wrote:
> I don't know any library except glibc use futex directly, or not.
> but we don't have any reason to assume futex is used only glibc.
> (plus, glibc might change its implementation in future release, perhaps)
I only know of 3 off-hand. The checkpoint restart guys have test suites
that use the futex syscalls. The futextest test suite obviously does.
There is also the userspace RCU library that Mathieu Desnoyers has been
workingon. None of these make use off the zero page as far as I know.
However, once we sort this out, I shall have to add such a test to
futextest, perhaps something similar to the test in this thread.
--
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2] futex: remove rw parameter from get_futex_key()
2010-01-05 11:21 ` Hugh Dickins
@ 2010-01-05 20:41 ` Darren Hart
2010-01-06 2:27 ` KOSAKI Motohiro
0 siblings, 1 reply; 15+ messages in thread
From: Darren Hart @ 2010-01-05 20:41 UTC (permalink / raw)
To: Hugh Dickins
Cc: KOSAKI Motohiro, Peter Zijlstra, KAMEZAWA Hiroyuki, Nick Piggin,
Ingo Molnar, LKML, Thomas Gleixner, Ulrich Drepper, Hansen, Dave
Hugh Dickins wrote:
> On Tue, 5 Jan 2010, KOSAKI Motohiro wrote:
>> From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
>> From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
>> Date: Tue, 5 Jan 2010 11:33:00 +0900
>> Subject: [PATCH] futex: remove rw parameter from get_futex_key()
>>
>> Currently, futex have two problem.
>>
>> A) current futex doesn't handle private file mappings properly.
>>
>> get_futex_key() use PageAnon() to distinguish file and anon. it can
>> makes following bad scenario.
>>
>> 1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
>> sleep on file mapping object.
>> 2) thread-B write a variable and it makes cow.
>> 3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
>> sleeped thread on the anonymous page. (but it's nothing)
>>
Excellent test case, thank you! Would you consider preparing a patch to
futextest?
http://git.kernel.org/?p=linux/kernel/git/dvhart/futextest.git;a=summary
I did some experimentation here and found that:
o The test works if the *_PRIVATE op codes are used.
This is because the futex keys are generated using only the virtual
address of the page, which doesn't change on a COW.
o If the waiter writes to the val first, it works.
This forces the COW before the waiter generates it's futex key.
So the waiter's key is generated based on the page cache page address
for shared futexes when the value hasn't been written to prior to wait.
The only scenario where I could think of wanting this behavior is if
another process were to try and wake the waiter via the same file backed
page. However, as I understand it, the re-use of the same page for
unwritten-to private pages is an optimization and can't be relied upon.
So this scenario is out. Another would be to use the futex as a very
simple wait queue where the value is never changed. In this case
however, the implementation is racy as the value check is effectively
negated, so this use case is also out.
As such, I see no reason not to always use VERIFY_WRITE and force a COW
prior to generating the futex_key for shared futexes. It is not
necessary for private futexes however as they use only the virtual address.
I am not sure on whether or not it makes sense to avoid the VERIFY_WRITE
on the private futexes. Could be it is just more code for negligible
benefit. Thoughts?
--
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2] futex: remove rw parameter from get_futex_key()
2010-01-05 20:41 ` Darren Hart
@ 2010-01-06 2:27 ` KOSAKI Motohiro
2010-01-06 23:14 ` Darren Hart
0 siblings, 1 reply; 15+ messages in thread
From: KOSAKI Motohiro @ 2010-01-06 2:27 UTC (permalink / raw)
To: Darren Hart
Cc: kosaki.motohiro, Hugh Dickins, Peter Zijlstra, KAMEZAWA Hiroyuki,
Nick Piggin, Ingo Molnar, LKML, Thomas Gleixner, Ulrich Drepper,
Hansen, Dave
[-- Attachment #1: Type: text/plain, Size: 3026 bytes --]
> Hugh Dickins wrote:
> > On Tue, 5 Jan 2010, KOSAKI Motohiro wrote:
>
> >> From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
> >> From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> >> Date: Tue, 5 Jan 2010 11:33:00 +0900
> >> Subject: [PATCH] futex: remove rw parameter from get_futex_key()
> >>
> >> Currently, futex have two problem.
> >>
> >> A) current futex doesn't handle private file mappings properly.
> >>
> >> get_futex_key() use PageAnon() to distinguish file and anon. it can
> >> makes following bad scenario.
> >>
> >> 1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
> >> sleep on file mapping object.
> >> 2) thread-B write a variable and it makes cow.
> >> 3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
> >> sleeped thread on the anonymous page. (but it's nothing)
> >>
>
> Excellent test case, thank you! Would you consider preparing a patch to
> futextest?
>
> http://git.kernel.org/?p=linux/kernel/git/dvhart/futextest.git;a=summary
Patch attached. you can feel free any modify such file. thanks.
> I did some experimentation here and found that:
>
> o The test works if the *_PRIVATE op codes are used.
> This is because the futex keys are generated using only the virtual
> address of the page, which doesn't change on a COW.
> o If the waiter writes to the val first, it works.
> This forces the COW before the waiter generates it's futex key.
True.
> So the waiter's key is generated based on the page cache page address
> for shared futexes when the value hasn't been written to prior to wait.
>
> The only scenario where I could think of wanting this behavior is if
> another process were to try and wake the waiter via the same file backed
> page. However, as I understand it, the re-use of the same page for
> unwritten-to private pages is an optimization and can't be relied upon.
> So this scenario is out. Another would be to use the futex as a very
> simple wait queue where the value is never changed. In this case
> however, the implementation is racy as the value check is effectively
> negated, so this use case is also out.
>
> As such, I see no reason not to always use VERIFY_WRITE and force a COW
> prior to generating the futex_key for shared futexes. It is not
> necessary for private futexes however as they use only the virtual address.
>
> I am not sure on whether or not it makes sense to avoid the VERIFY_WRITE
> on the private futexes. Could be it is just more code for negligible
> benefit. Thoughts?
I think it's no problem. because,
as performance view:
access_ok() of almost arch (included x86) ignore VERIFY_WRITE.
then, this change doesn't cause performance loss of course.
(current futex mainly handle ro-mapping problem by fault_in_user_writeable)
as consistency view:
This patch have better consistency without FUTEX_PRIVATE_FLAG case.
as usability view:
Nobody want to use ro-mappings for private futex. it's obviously meaningless
and useless.
[-- Attachment #2: 0001-futextest-Add-two-testcase.patch --]
[-- Type: application/octet-stream, Size: 7607 bytes --]
From 36b04190325209aea5d02ab1d5d5572cd6c8280a Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Wed, 6 Jan 2010 11:07:08 +0900
Subject: [PATCH] futextest: Add two testcase
Recently, we have discussed zero-page and private file mappings bug.
thus such test case will be added.
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
---
functional/Makefile | 4 +-
functional/futex_wait_private_mapped_file.c | 112 +++++++++++++++++++++++++++
functional/futex_wait_uninitialized_heap.c | 85 ++++++++++++++++++++
functional/run.sh | 7 ++
4 files changed, 207 insertions(+), 1 deletions(-)
create mode 100644 functional/futex_wait_private_mapped_file.c
create mode 100644 functional/futex_wait_uninitialized_heap.c
diff --git a/functional/Makefile b/functional/Makefile
index 69356fa..6ecb42c 100644
--- a/functional/Makefile
+++ b/functional/Makefile
@@ -8,7 +8,9 @@ TARGETS := \
futex_wait_wouldblock \
futex_requeue_pi \
futex_requeue_pi_signal_restart \
- futex_requeue_pi_mismatched_ops
+ futex_requeue_pi_mismatched_ops \
+ futex_wait_uninitialized_heap \
+ futex_wait_private_mapped_file
.PHONY: all clean
all: $(TARGETS)
diff --git a/functional/futex_wait_private_mapped_file.c b/functional/futex_wait_private_mapped_file.c
new file mode 100644
index 0000000..fb6467a
--- /dev/null
+++ b/functional/futex_wait_private_mapped_file.c
@@ -0,0 +1,112 @@
+/******************************************************************************
+ *
+ * Copyright FUJITSU LIMITED 2010
+ * Copyright KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * NAME
+ * futex_wait_private_mapped_file.c
+ *
+ * DESCRIPTION
+ * Internally, Futex has two handling mode, anon and file. The private file
+ * mapping is special. At first it behave as file, but after write anything
+ * it behave as anon. This test is intent to test such case.
+ *
+ * AUTHOR
+ * KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * HISTORY
+ * 2010-Jan-6: Initial version by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <syscall.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/futex.h>
+#include <pthread.h>
+#include <libgen.h>
+
+#include "logging.h"
+#include "futextest.h"
+
+#define PAGE_SZ 4096
+
+char pad[PAGE_SZ] = {1};
+futex_t val = 1;
+char pad2[PAGE_SZ] = {1};
+
+void* thr_futex_wait(void *arg)
+{
+ int ret;
+
+ info("futex wait\n");
+ ret = futex_wait(&val, 1, NULL, 0);
+ if (ret != 0 && errno != EWOULDBLOCK) {
+ perror("futex error.\n");
+ print_result(RET_ERROR);
+ exit(RET_ERROR);
+ }
+ info("futex_wait: ret = %d, errno = %d\n", ret, errno);
+
+ return NULL;
+}
+
+int main(int argc, char **argv)
+{
+ pthread_t thr;
+ int ret = RET_PASS;
+ int res;
+ int c;
+
+ while ((c = getopt(argc, argv, "c")) != -1) {
+ switch(c) {
+ case 'c':
+ log_color(1);
+ break;
+ default:
+ exit(1);
+ }
+ }
+
+ printf("%s: Test the futex value of private file mappings in FUTEX_WAIT\n",
+ basename(argv[0]));
+
+ ret = pthread_create(&thr, NULL, thr_futex_wait, NULL);
+ if (ret < 0) {
+ fprintf(stderr, "pthread_create error\n");
+ goto error;
+ }
+
+ info("wait a while");
+ sleep(3);
+ val = 2;
+ res = futex_wake(&val, 1, 0);
+ info("futex_wake %d", res);
+
+ info("join");
+ pthread_join(thr, NULL);
+
+ print_result(ret);
+ return ret;
+
+error:
+ ret = RET_ERROR;
+ print_result(ret);
+ return ret;
+}
diff --git a/functional/futex_wait_uninitialized_heap.c b/functional/futex_wait_uninitialized_heap.c
new file mode 100644
index 0000000..cd7e5f6
--- /dev/null
+++ b/functional/futex_wait_uninitialized_heap.c
@@ -0,0 +1,85 @@
+/******************************************************************************
+ *
+ * Copyright FUJITSU LIMITED 2010
+ * Copyright KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * NAME
+ * futex_wait_uninitialized_heap.c
+ *
+ * DESCRIPTION
+ * Wait on uninitialized heap. It shold be zero and FUTEX_WAIT should return
+ * immediately. This test is intent to test zero page handling in futex.
+ *
+ * AUTHOR
+ * KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ * HISTORY
+ * 2010-Jan-6: Initial version by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
+ *
+ *****************************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/mman.h>
+#include <syscall.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include <errno.h>
+#include <linux/futex.h>
+#include <libgen.h>
+
+#include "logging.h"
+#include "futextest.h"
+
+int main(int argc, char **argv)
+{
+ long page_size;
+ int res;
+ void *buf;
+ int ret = RET_PASS;
+ int c;
+
+ while ((c = getopt(argc, argv, "c")) != -1) {
+ switch(c) {
+ case 'c':
+ log_color(1);
+ break;
+ default:
+ exit(1);
+ }
+ }
+
+ page_size = sysconf(_SC_PAGESIZE);
+
+ buf = mmap(NULL, page_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+ if (buf == (void *)-1) {
+ perror("mmap error.\n");
+ exit(1);
+ }
+
+ printf("%s: Test the uninitialized futex value in FUTEX_WAIT\n",
+ basename(argv[0]));
+
+ res = futex_wait(buf, 1, NULL, 0);
+ if (res != 0 && errno != EWOULDBLOCK) {
+ ret = RET_FAIL;
+ }
+ print_result(ret);
+
+ return ret;
+}
diff --git a/functional/run.sh b/functional/run.sh
index 959b389..411c20f 100755
--- a/functional/run.sh
+++ b/functional/run.sh
@@ -29,6 +29,8 @@
#
# HISTORY
# 2009-Nov-9: Initial version by Darren Hart <dvhltc@us.ibm.com>
+# 2010-Jan-6: Add futex_wait_uninitialized_heap and futex_wait_private_mapped_file
+# by KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
#
###############################################################################
@@ -83,3 +85,8 @@ echo
echo
./futex_wait_wouldblock $COLOR
+
+echo
+./futex_wait_uninitialized_heap
+./futex_wait_private_mapped_file
+
--
1.6.5.2
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH v2] futex: remove rw parameter from get_futex_key()
2010-01-06 2:27 ` KOSAKI Motohiro
@ 2010-01-06 23:14 ` Darren Hart
0 siblings, 0 replies; 15+ messages in thread
From: Darren Hart @ 2010-01-06 23:14 UTC (permalink / raw)
To: KOSAKI Motohiro
Cc: Hugh Dickins, Peter Zijlstra, KAMEZAWA Hiroyuki, Nick Piggin,
Ingo Molnar, LKML, Thomas Gleixner, Ulrich Drepper, Hansen, Dave
KOSAKI Motohiro wrote:
>> Hugh Dickins wrote:
>>> On Tue, 5 Jan 2010, KOSAKI Motohiro wrote:
>>>> From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
>>>> From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
>>>> Date: Tue, 5 Jan 2010 11:33:00 +0900
>>>> Subject: [PATCH] futex: remove rw parameter from get_futex_key()
>>>>
>>>> Currently, futex have two problem.
>>>>
>>>> A) current futex doesn't handle private file mappings properly.
>>>>
>>>> get_futex_key() use PageAnon() to distinguish file and anon. it can
>>>> makes following bad scenario.
>>>>
>>>> 1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
>>>> sleep on file mapping object.
>>>> 2) thread-B write a variable and it makes cow.
>>>> 3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
>>>> sleeped thread on the anonymous page. (but it's nothing)
>>>>
>> Excellent test case, thank you! Would you consider preparing a patch to
>> futextest?
>>
>> http://git.kernel.org/?p=linux/kernel/git/dvhart/futextest.git;a=summary
>
> Patch attached. you can feel free any modify such file. thanks.
Kosaki,
Thank you for the patch! I've merged your patch into futextest and added
some logic to detect failure and not hang the box (so the test can be
run unattended).
Thanks,
--
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
^ permalink raw reply [flat|nested] 15+ messages in thread
* Re: [PATCH v2] futex: remove rw parameter from get_futex_key()
2010-01-05 7:32 ` [PATCH v2] futex: remove rw parameter from get_futex_key() KOSAKI Motohiro
2010-01-05 11:21 ` Hugh Dickins
@ 2010-01-06 23:29 ` Darren Hart
2010-01-13 10:30 ` [tip:core/urgent] futexes: Remove " tip-bot for KOSAKI Motohiro
2 siblings, 0 replies; 15+ messages in thread
From: Darren Hart @ 2010-01-06 23:29 UTC (permalink / raw)
To: KOSAKI Motohiro
Cc: Hugh Dickins, Peter Zijlstra, KAMEZAWA Hiroyuki, Nick Piggin,
Ingo Molnar, LKML, Thomas Gleixner, Ulrich Drepper
KOSAKI Motohiro wrote:
Hi Kosaki,
Some test results below. Things look good, you have my Ack.
> From c3e2dfdff84b9b720e646fd6dd3c38fff293c7e6 Mon Sep 17 00:00:00 2001
> From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
> Date: Tue, 5 Jan 2010 11:33:00 +0900
> Subject: [PATCH] futex: remove rw parameter from get_futex_key()
>
> Currently, futex have two problem.
>
> A) current futex doesn't handle private file mappings properly.
>
> get_futex_key() use PageAnon() to distinguish file and anon. it can
> makes following bad scenario.
>
> 1) thread-A call futex(private-mapping, FUTEX_WAIT). it makes to
> sleep on file mapping object.
> 2) thread-B write a variable and it makes cow.
> 3) thread-B call futex(private-mapping, FUTEX_WAKE). it wake up
> sleeped thread on the anonymous page. (but it's nothing)
>
> following testcase reproduce this issue.
>
> actual result)
> FUTEX_WAIT thread never wake up.
>
> expect result)
> FUTEX_WAIT thread wake up by FUTEX_WAKE.
>
<snipped test source>
>
> B) current futex doesn't handle zero page properly.
>
> read mode get_user_pages() can return zero page. but current futex code doesn't
> handle it at all. Then, zero page makes infinite loop internally.
>
> following testcase can reproduce this issue.
>
> actual result)
> FUTEX_WAIT never return and waste cpu time 100%.
>
> expected result)
> FUTEX_WAIT return immediately.
>
<snipped test source>
> The solution is to use write mode get_user_page() always for page lookup.
> It prevent to lookup both file page of private mappings and zero page.
>
> performance concern:
>
> Probaly very little. because glibc always initialize variables for futex
> before to call futex(). It mean glibc user never seen the overhead of this
> patch.
>
> compatibility concern:
>
> This patch have few compatibility break. After this patch, FUTEX_WAIT require
> writable access to futex variables (read-only mappings makes EFAULT).
> But practically it's no problem. again glibc always initalize variables for futex
> explicitly. nobody use read-only mappings.
>
> Reported-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
The tests have been added to futextest. In addition to confirming these
tests, I compared the performance/futex_wait test results before and
after this patch and found no significant delta.
Threads 2.6.33-rc3 2.6.33-rc3-patched
------------------------------------------
1 42373 42373
2 24938 24450
3 5931 7163
4 4715 4480
5 4593 4227
6 4993 4608
8 4448 5294
10 4778 5149
12 4836 5079
16 4476 3828
24 4188 4314
32 4380 4384
64 4598 4764
128 4429 5102
256 5081 4462
521 4854 4444
1024 4933 4272
------------------------------------------
System: 4xAMD Opteron 270 @ 2GHz
Units: Thousands of iterations per second
Ingo and Thomas, please consider applying this patch. Should this also
be sent to Stable?
Acked-by: Darren Hart <dvhltc@us.ibm.com>
> ---
> kernel/futex.c | 27 ++++++++++++---------------
> 1 files changed, 12 insertions(+), 15 deletions(-)
>
> diff --git a/kernel/futex.c b/kernel/futex.c
> index 8e3c3ff..d9b3a22 100644
> --- a/kernel/futex.c
> +++ b/kernel/futex.c
> @@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
> * @uaddr: virtual address of the futex
> * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
> * @key: address where result is stored.
> - * @rw: mapping needs to be read/write (values: VERIFY_READ,
> - * VERIFY_WRITE)
> *
> * Returns a negative error code or 0
> * The key words are stored in *key on success.
> @@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
> * lock_page() might sleep, the caller should not hold a spinlock.
> */
> static int
> -get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> +get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
> {
> unsigned long address = (unsigned long)uaddr;
> struct mm_struct *mm = current->mm;
> @@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> * but access_ok() should be faster than find_vma()
> */
> if (!fshared) {
> - if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
> + if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
> return -EFAULT;
> key->private.mm = mm;
> key->private.address = address;
> @@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
> }
>
> again:
> - err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
> + err = get_user_pages_fast(address, 1, 1, &page);
> if (err < 0)
> return err;
>
> @@ -867,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
> if (!bitset)
> return -EINVAL;
>
> - ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
> + ret = get_futex_key(uaddr, fshared, &key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -913,10 +911,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
> int ret, op_ret;
>
> retry:
> - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
> + ret = get_futex_key(uaddr1, fshared, &key1);
> if (unlikely(ret != 0))
> goto out;
> - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out_put_key1;
>
> @@ -1175,11 +1173,10 @@ retry:
> pi_state = NULL;
> }
>
> - ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
> + ret = get_futex_key(uaddr1, fshared, &key1);
> if (unlikely(ret != 0))
> goto out;
> - ret = get_futex_key(uaddr2, fshared, &key2,
> - requeue_pi ? VERIFY_WRITE : VERIFY_READ);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out_put_key1;
>
> @@ -1738,7 +1735,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
> */
> retry:
> q->key = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
> + ret = get_futex_key(uaddr, fshared, &q->key);
> if (unlikely(ret != 0))
> return ret;
>
> @@ -1904,7 +1901,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
> q.requeue_pi_key = NULL;
> retry:
> q.key = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
> + ret = get_futex_key(uaddr, fshared, &q.key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -2023,7 +2020,7 @@ retry:
> if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
> return -EPERM;
>
> - ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
> + ret = get_futex_key(uaddr, fshared, &key);
> if (unlikely(ret != 0))
> goto out;
>
> @@ -2215,7 +2212,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
> rt_waiter.task = NULL;
>
> key2 = FUTEX_KEY_INIT;
> - ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
> + ret = get_futex_key(uaddr2, fshared, &key2);
> if (unlikely(ret != 0))
> goto out;
>
--
Darren Hart
IBM Linux Technology Center
Real-Time Linux Team
^ permalink raw reply [flat|nested] 15+ messages in thread
* [PATCH] mips,mm: Reinstate move_pte optimization
2009-12-30 16:03 ` Hugh Dickins
2010-01-05 7:32 ` [PATCH v2] futex: remove rw parameter from get_futex_key() KOSAKI Motohiro
@ 2010-01-07 6:32 ` KOSAKI Motohiro
2010-01-11 10:15 ` Ralf Baechle
1 sibling, 1 reply; 15+ messages in thread
From: KOSAKI Motohiro @ 2010-01-07 6:32 UTC (permalink / raw)
To: Hugh Dickins, Ralf Baechle, linux-mips, Carsten Otte
Cc: kosaki.motohiro, Peter Zijlstra, KAMEZAWA Hiroyuki, Nick Piggin,
Ingo Molnar, LKML, Thomas Gleixner, Darren Hart, Ulrich Drepper
CC to mips folks.
> If something like this or your replacment does go forward,
> then I think that test is better inside the "if (!page->mapping)"
> below. Admittedly that adds even more mm-dependence here (relying
> on a zero page to have NULL page->mapping); but isn't page_to_pfn()
> one of those functions which is trivial on many configs but expensive
> on some? Better call it only in the rare case that it's needed.
>
> Though wouldn't it be even better not to use is_zero_pfn() at all?
> That was convenient in mm/memory.c because it had the pfn or pte right
> at hand, but here a traditional (page == ZERO_PAGE(address)) would be
> more efficient.
>
> Which would save having to move is_zero_pfn() from mm/memory.c
> to include/linux/mm.h - I'd prefer to keep it private if we can.
> But for completeness, this would involve resurrecting the 2.6.19
> MIPS move_pte(), which makes sure mremap() move doesn't interfere
> with our assumptions. Something like
>
> #define __HAVE_ARCH_MOVE_PTE
> pte_t move_pte(pte_t pte, pgprot_t prot, unsigned long old_addr,
> unsigned long new_addr)
> {
> if (pte_present(pte) && is_zero_pfn(pte_pfn(pte)))
> pte = mk_pte(ZERO_PAGE(new_addr), prot);
> return pte;
> }
>
> in arch/mips/include/asm/pgtable.h.
I agree with resurrecting mips move_pte. At least your patch
passed my cross compile test :)
Ralf, can you please review following patch?
======================================================
Subject: [PATCH] mips,mm: Reinstate move_pte optimization
From: Hugh Dickins <hugh.dickins@tiscali.co.uk>
About three years ago, we removed mips specific move_pte by commit
701dfbc1cb (mm: mremap correct rmap accounting). because it is only
small optimization and it has bug.
However new zero-page thing doesn't have such problem and behavior
consistency of mremap have worth a bit.
This patch reinstate it.
Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: linux-mips@linux-mips.org
Cc: Nick Piggin <npiggin@suse.de>
Cc: Carsten Otte <cotte@de.ibm.com>
---
arch/mips/include/asm/pgtable.h | 8 ++++++++
1 files changed, 8 insertions(+), 0 deletions(-)
diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h
index 1854336..6ad2f73 100644
--- a/arch/mips/include/asm/pgtable.h
+++ b/arch/mips/include/asm/pgtable.h
@@ -387,6 +387,14 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma,
remap_pfn_range(vma, vaddr, pfn, size, prot)
#endif
+#define __HAVE_ARCH_MOVE_PTE
+pte_t move_pte(pte_t pte, pgprot_t prot, unsigned long old_addr, unsigned long new_addr)
+{
+ if (pte_present(pte) && is_zero_pfn(pte_pfn(pte)))
+ pte = mk_pte(ZERO_PAGE(new_addr), prot);
+ return pte;
+}
+
#include <asm-generic/pgtable.h>
/*
--
1.6.5.2
^ permalink raw reply related [flat|nested] 15+ messages in thread
* Re: [PATCH] mips,mm: Reinstate move_pte optimization
2010-01-07 6:32 ` [PATCH] mips,mm: Reinstate move_pte optimization KOSAKI Motohiro
@ 2010-01-11 10:15 ` Ralf Baechle
0 siblings, 0 replies; 15+ messages in thread
From: Ralf Baechle @ 2010-01-11 10:15 UTC (permalink / raw)
To: KOSAKI Motohiro
Cc: Hugh Dickins, linux-mips, Carsten Otte, Peter Zijlstra,
KAMEZAWA Hiroyuki, Nick Piggin, Ingo Molnar, LKML,
Thomas Gleixner, Darren Hart, Ulrich Drepper
On Thu, Jan 07, 2010 at 03:32:57PM +0900, KOSAKI Motohiro wrote:
> > to include/linux/mm.h - I'd prefer to keep it private if we can.
> > But for completeness, this would involve resurrecting the 2.6.19
> > MIPS move_pte(), which makes sure mremap() move doesn't interfere
> > with our assumptions. Something like
> >
> > #define __HAVE_ARCH_MOVE_PTE
> > pte_t move_pte(pte_t pte, pgprot_t prot, unsigned long old_addr,
> > unsigned long new_addr)
> > {
> > if (pte_present(pte) && is_zero_pfn(pte_pfn(pte)))
> > pte = mk_pte(ZERO_PAGE(new_addr), prot);
> > return pte;
> > }
> >
> > in arch/mips/include/asm/pgtable.h.
>
> I agree with resurrecting mips move_pte. At least your patch
> passed my cross compile test :)
>
> Ralf, can you please review following patch?
Looks good.
Acked-by: Ralf Baechle <ralf@linux-mips.org>
Thanks,
Ralf
^ permalink raw reply [flat|nested] 15+ messages in thread
* [tip:core/urgent] futexes: Remove rw parameter from get_futex_key()
2010-01-05 7:32 ` [PATCH v2] futex: remove rw parameter from get_futex_key() KOSAKI Motohiro
2010-01-05 11:21 ` Hugh Dickins
2010-01-06 23:29 ` Darren Hart
@ 2010-01-13 10:30 ` tip-bot for KOSAKI Motohiro
2 siblings, 0 replies; 15+ messages in thread
From: tip-bot for KOSAKI Motohiro @ 2010-01-13 10:30 UTC (permalink / raw)
To: linux-tip-commits
Cc: linux-kernel, dvhltc, hpa, mingo, torvalds, peterz, npiggin,
drepper, stable, tglx, hugh.dickins, kamezawa.hiroyu,
kosaki.motohiro, mingo
Commit-ID: 7485d0d3758e8e6491a5c9468114e74dc050785d
Gitweb: http://git.kernel.org/tip/7485d0d3758e8e6491a5c9468114e74dc050785d
Author: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
AuthorDate: Tue, 5 Jan 2010 16:32:43 +0900
Committer: Ingo Molnar <mingo@elte.hu>
CommitDate: Wed, 13 Jan 2010 09:17:36 +0100
futexes: Remove rw parameter from get_futex_key()
Currently, futexes have two problem:
A) The current futex code doesn't handle private file mappings properly.
get_futex_key() uses PageAnon() to distinguish file and
anon, which can cause the following bad scenario:
1) thread-A call futex(private-mapping, FUTEX_WAIT), it
sleeps on file mapping object.
2) thread-B writes a variable and it makes it cow.
3) thread-B calls futex(private-mapping, FUTEX_WAKE), it
wakes up blocked thread on the anonymous page. (but it's nothing)
B) Current futex code doesn't handle zero page properly.
Read mode get_user_pages() can return zero page, but current
futex code doesn't handle it at all. Then, zero page makes
infinite loop internally.
The solution is to use write mode get_user_page() always for
page lookup. It prevents the lookup of both file page of private
mappings and zero page.
Performance concerns:
Probaly very little, because glibc always initialize variables
for futex before to call futex(). It means glibc users never see
the overhead of this patch.
Compatibility concerns:
This patch has few compatibility issues. After this patch,
FUTEX_WAIT require writable access to futex variables (read-only
mappings makes EFAULT). But practically it's not a problem,
glibc always initalizes variables for futexes explicitly - nobody
uses read-only mappings.
Reported-by: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Darren Hart <dvhltc@us.ibm.com>
Cc: <stable@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Ulrich Drepper <drepper@gmail.com>
LKML-Reference: <20100105162633.45A2.A69D9226@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
kernel/futex.c | 27 ++++++++++++---------------
1 files changed, 12 insertions(+), 15 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 8e3c3ff..d9b3a22 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -203,8 +203,6 @@ static void drop_futex_key_refs(union futex_key *key)
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
- * @rw: mapping needs to be read/write (values: VERIFY_READ,
- * VERIFY_WRITE)
*
* Returns a negative error code or 0
* The key words are stored in *key on success.
@@ -216,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
* lock_page() might sleep, the caller should not hold a spinlock.
*/
static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
@@ -239,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+ if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
return -EFAULT;
key->private.mm = mm;
key->private.address = address;
@@ -248,7 +246,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
}
again:
- err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
+ err = get_user_pages_fast(address, 1, 1, &page);
if (err < 0)
return err;
@@ -867,7 +865,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -913,10 +911,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
int ret, op_ret;
retry:
- ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1175,11 +1173,10 @@ retry:
pi_state = NULL;
}
- ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+ ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2,
- requeue_pi ? VERIFY_WRITE : VERIFY_READ);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out_put_key1;
@@ -1738,7 +1735,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
*/
retry:
q->key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+ ret = get_futex_key(uaddr, fshared, &q->key);
if (unlikely(ret != 0))
return ret;
@@ -1904,7 +1901,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
q.requeue_pi_key = NULL;
retry:
q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
+ ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out;
@@ -2023,7 +2020,7 @@ retry:
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
- ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
+ ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
@@ -2215,7 +2212,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
rt_waiter.task = NULL;
key2 = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+ ret = get_futex_key(uaddr2, fshared, &key2);
if (unlikely(ret != 0))
goto out;
^ permalink raw reply related [flat|nested] 15+ messages in thread
end of thread, other threads:[~2010-01-13 10:33 UTC | newest]
Thread overview: 15+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2009-12-24 9:29 [PATCH] futex: Fix ZERO_PAGE cause infinite loop KOSAKI Motohiro
2009-12-24 9:39 ` Peter Zijlstra
2009-12-24 17:15 ` Ulrich Drepper
2009-12-25 1:51 ` KOSAKI Motohiro
2009-12-30 16:03 ` Hugh Dickins
2010-01-05 7:32 ` [PATCH v2] futex: remove rw parameter from get_futex_key() KOSAKI Motohiro
2010-01-05 11:21 ` Hugh Dickins
2010-01-05 20:41 ` Darren Hart
2010-01-06 2:27 ` KOSAKI Motohiro
2010-01-06 23:14 ` Darren Hart
2010-01-06 23:29 ` Darren Hart
2010-01-13 10:30 ` [tip:core/urgent] futexes: Remove " tip-bot for KOSAKI Motohiro
2010-01-07 6:32 ` [PATCH] mips,mm: Reinstate move_pte optimization KOSAKI Motohiro
2010-01-11 10:15 ` Ralf Baechle
2010-01-05 18:04 ` [PATCH] futex: Fix ZERO_PAGE cause infinite loop Darren Hart
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox