* [PATCH] improve __GFP_COLD/__GFP_ZERO interaction
@ 2014-06-02 14:46 Jan Beulich
2014-06-02 15:16 ` Kirill A. Shutemov
2014-06-02 15:54 ` David Vrabel
0 siblings, 2 replies; 6+ messages in thread
From: Jan Beulich @ 2014-06-02 14:46 UTC (permalink / raw)
To: linux-mm
Cc: David Vrabel, mingo, tglx, Boris Ostrovsky, Konrad Rzeszutek Wilk,
hpa
For cold page allocations, using the normal clear_highpage() mechanism
may be inefficient on certain architectures, namely due to needlessly
replacing a good part of the data cache contents. Introduce an arch-
overridable clear_cold_highpage() (using streaming non-temporal stores
on x86, where an override gets implemented right away) for use in
this specific case.
Leverage the improvement in the Xen balloon driver, eliminating the
explicit scrub_page() function.
Signed-off-by: Jan Beulich <jbeulich@suse.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
---
arch/x86/include/asm/page_types.h | 3 ++
arch/x86/lib/Makefile | 4 +--
arch/x86/lib/clear_page_32.S | 42 ++++++++++++++++++++++++++++++++++++++
arch/x86/lib/clear_page_64.S | 28 ++++++++++++++++++++++++-
drivers/xen/balloon.c | 14 ++++++------
include/linux/highmem.h | 13 +++++++++++
mm/page_alloc.c | 8 +++++--
7 files changed, 100 insertions(+), 12 deletions(-)
--- 3.15-rc8/arch/x86/include/asm/page_types.h
+++ 3.15-rc8-clear-cold-highpage/arch/x86/include/asm/page_types.h
@@ -46,6 +46,9 @@
#ifndef __ASSEMBLY__
+void clear_cold_page(void *);
+#define clear_cold_page clear_cold_page
+
extern int devmem_is_allowed(unsigned long pagenr);
extern unsigned long max_low_pfn_mapped;
--- 3.15-rc8/arch/x86/lib/Makefile
+++ 3.15-rc8-clear-cold-highpage/arch/x86/lib/Makefile
@@ -19,7 +19,7 @@ obj-$(CONFIG_SMP) += msr-smp.o cache-smp
lib-y := delay.o misc.o
lib-y += thunk_$(BITS).o
lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
-lib-y += memcpy_$(BITS).o
+lib-y += memcpy_$(BITS).o clear_page_$(BITS).o
lib-$(CONFIG_SMP) += rwlock.o
lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
@@ -39,7 +39,7 @@ endif
else
obj-y += iomap_copy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
- lib-y += thunk_64.o clear_page_64.o copy_page_64.o
+ lib-y += thunk_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
lib-y += copy_user_64.o copy_user_nocache_64.o
lib-y += cmpxchg16b_emu.o
--- /home/jbeulich/tmp/linux-3.15-rc8/arch/x86/lib/clear_page_32.S 1970-01-01 01:00:00.000000000 +0100
+++ 3.15-rc8-clear-cold-highpage/arch/x86/lib/clear_page_32.S
@@ -0,0 +1,42 @@
+#include <linux/linkage.h>
+#include <asm/alternative-asm.h>
+#include <asm/cpufeature.h>
+#include <asm/dwarf2.h>
+#include <asm/page_types.h>
+
+ENTRY(clear_cold_page)
+ CFI_STARTPROC
+ xorl %edx,%edx
+#ifdef CONFIG_X86_USE_3DNOW
+ jmp mmx_clear_page
+#else
+ movl $PAGE_SIZE,%ecx
+ jmp memset
+#endif
+ .p2align 4
+.Lcold_loop:
+ decl %ecx
+#define PUT(x) movntil %edx,x*4(%eax)
+ movntil %edx,(%eax)
+ PUT(1)
+ PUT(2)
+ PUT(3)
+ PUT(4)
+ PUT(5)
+ PUT(6)
+ PUT(7)
+ leal 8*4(%eax),%eax
+ jnz .Lcold_loop
+ sfence
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_cold_page)
+
+ .section .altinstr_replacement,"ax"
+1: movl $PAGE_SIZE/(8*4),%ecx
+2:
+ .previous
+ .section .altinstructions,"a"
+ altinstruction_entry clear_cold_page, 1b, X86_FEATURE_XMM2, \
+ .Lcold_loop-clear_cold_page, 2b-1b
+ .previous
--- 3.15-rc8/arch/x86/lib/clear_page_64.S
+++ 3.15-rc8-clear-cold-highpage/arch/x86/lib/clear_page_64.S
@@ -1,6 +1,7 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>
+#include <asm/page_types.h>
/*
* Zero a page.
@@ -27,7 +28,7 @@ ENDPROC(clear_page_c_e)
ENTRY(clear_page)
CFI_STARTPROC
xorl %eax,%eax
- movl $4096/64,%ecx
+ movl $PAGE_SIZE/64,%ecx
.p2align 4
.Lloop:
decl %ecx
@@ -40,6 +41,7 @@ ENTRY(clear_page)
PUT(5)
PUT(6)
PUT(7)
+#undef PUT
leaq 64(%rdi),%rdi
jnz .Lloop
nop
@@ -48,6 +50,30 @@ ENTRY(clear_page)
.Lclear_page_end:
ENDPROC(clear_page)
+ENTRY(clear_cold_page)
+ CFI_STARTPROC
+ xorl %eax,%eax
+ movl $PAGE_SIZE/(8*8),%ecx
+ .p2align 4
+.Lcold_loop:
+ decl %ecx
+#define PUT(x) movntiq %rax,x*8(%rdi)
+ movntiq %rax,(%rdi)
+ PUT(1)
+ PUT(2)
+ PUT(3)
+ PUT(4)
+ PUT(5)
+ PUT(6)
+ PUT(7)
+#undef PUT
+ leaq 8*8(%rdi),%rdi
+ jnz .Lcold_loop
+ sfence
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_cold_page)
+
/*
* Some CPUs support enhanced REP MOVSB/STOSB instructions.
* It is recommended to use this when possible.
--- 3.15-rc8/drivers/xen/balloon.c
+++ 3.15-rc8-clear-cold-highpage/drivers/xen/balloon.c
@@ -107,12 +107,11 @@ static DECLARE_DELAYED_WORK(balloon_work
#define GFP_BALLOON \
(GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC)
-static void scrub_page(struct page *page)
-{
#ifdef CONFIG_XEN_SCRUB_PAGES
- clear_highpage(page);
+#define __GFP_SCRUB __GFP_ZERO
+#else
+#define __GFP_SCRUB 0
#endif
-}
/* balloon_append: add the given page to the balloon. */
static void __balloon_append(struct page *page)
@@ -360,7 +359,9 @@ static enum bp_state increase_reservatio
#endif
/* Relinquish the page back to the allocator. */
- __free_reserved_page(page);
+ ClearPageReserved(page);
+ init_page_count(page);
+ free_hot_cold_page(page, 1);
}
balloon_stats.current_pages += rc;
@@ -392,6 +393,7 @@ static enum bp_state decrease_reservatio
if (nr_pages > ARRAY_SIZE(frame_list))
nr_pages = ARRAY_SIZE(frame_list);
+ gfp |= __GFP_NOTRACK | __GFP_COLD | __GFP_SCRUB;
for (i = 0; i < nr_pages; i++) {
page = alloc_page(gfp);
if (page == NULL) {
@@ -399,8 +401,6 @@ static enum bp_state decrease_reservatio
state = BP_EAGAIN;
break;
}
- scrub_page(page);
-
frame_list[i] = page_to_pfn(page);
}
--- 3.15-rc8/include/linux/highmem.h
+++ 3.15-rc8-clear-cold-highpage/include/linux/highmem.h
@@ -189,6 +189,19 @@ static inline void clear_highpage(struct
kunmap_atomic(kaddr);
}
+#ifndef __HAVE_ARCH_CLEAR_COLD_HIGHPAGE
+#ifdef clear_cold_page
+static inline void clear_cold_highpage(struct page *page)
+{
+ void *kaddr = kmap_atomic(page);
+ clear_cold_page(kaddr);
+ kunmap_atomic(kaddr);
+}
+#else
+#define clear_cold_highpage clear_highpage
+#endif
+#endif
+
static inline void zero_user_segments(struct page *page,
unsigned start1, unsigned end1,
unsigned start2, unsigned end2)
--- 3.15-rc8/mm/page_alloc.c
+++ 3.15-rc8-clear-cold-highpage/mm/page_alloc.c
@@ -417,8 +417,12 @@ static inline void prep_zero_page(struct
* and __GFP_HIGHMEM from hard or soft interrupt context.
*/
VM_BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt());
- for (i = 0; i < (1 << order); i++)
- clear_highpage(page + i);
+ for (i = 0; i < (1 << order); i++) {
+ if (unlikely(gfp_flags & __GFP_COLD))
+ clear_cold_highpage(page + i);
+ else
+ clear_highpage(page + i);
+ }
}
#ifdef CONFIG_DEBUG_PAGEALLOC
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] improve __GFP_COLD/__GFP_ZERO interaction
2014-06-02 14:46 [PATCH] improve __GFP_COLD/__GFP_ZERO interaction Jan Beulich
@ 2014-06-02 15:16 ` Kirill A. Shutemov
2014-06-02 15:27 ` Jan Beulich
2014-06-02 15:54 ` David Vrabel
1 sibling, 1 reply; 6+ messages in thread
From: Kirill A. Shutemov @ 2014-06-02 15:16 UTC (permalink / raw)
To: Jan Beulich
Cc: linux-mm, David Vrabel, mingo, tglx, Boris Ostrovsky,
Konrad Rzeszutek Wilk, hpa
On Mon, Jun 02, 2014 at 03:46:10PM +0100, Jan Beulich wrote:
> For cold page allocations using the normal clear_highpage() mechanism
> may be inefficient on certain architectures, namely due to needlessly
> replacing a good part of the data cache contents. Introduce an arch-
> overridable clear_cold_highpage() (using streaming non-temporal stores
> on x86, where an override gets implemented right away) to make use of
> in this specific case.
>
> Leverage the impovement in the Xen balloon driver, eliminating the
> explicit scrub_page() function.
Any benchmark data?
I've tried non-temporal stores to clear huge pages, but it didn't help
much. I believe it can vary between micro-architectures, but we need
numbers. I was experimenting with Westmere at that time.
--
Kirill A. Shutemov
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] improve __GFP_COLD/__GFP_ZERO interaction
2014-06-02 15:16 ` Kirill A. Shutemov
@ 2014-06-02 15:27 ` Jan Beulich
2014-06-02 15:49 ` Kirill A. Shutemov
0 siblings, 1 reply; 6+ messages in thread
From: Jan Beulich @ 2014-06-02 15:27 UTC (permalink / raw)
To: Kirill A. Shutemov
Cc: David Vrabel, mingo, linux-mm, tglx, Boris Ostrovsky,
Konrad Rzeszutek Wilk, hpa
>>> On 02.06.14 at 17:16, <kirill@shutemov.name> wrote:
> On Mon, Jun 02, 2014 at 03:46:10PM +0100, Jan Beulich wrote:
>> For cold page allocations using the normal clear_highpage() mechanism
>> may be inefficient on certain architectures, namely due to needlessly
>> replacing a good part of the data cache contents. Introduce an arch-
>> overridable clear_cold_highpage() (using streaming non-temporal stores
>> on x86, where an override gets implemented right away) to make use of
>> in this specific case.
>>
>> Leverage the impovement in the Xen balloon driver, eliminating the
>> explicit scrub_page() function.
>
> Any benchmark data?
>
> I've tried non-temporal stores to clear huge pages, but it didn't helped
> much. I believe it can vary between micro-architectures, but we need
> numbers. I've played with Westmere that time.
It's not at all clear to me what to measure here - after all, this isn't
about improving the page clearing latency or throughput, but about
avoiding disturbing other operations.
Jan
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] improve __GFP_COLD/__GFP_ZERO interaction
2014-06-02 15:27 ` Jan Beulich
@ 2014-06-02 15:49 ` Kirill A. Shutemov
0 siblings, 0 replies; 6+ messages in thread
From: Kirill A. Shutemov @ 2014-06-02 15:49 UTC (permalink / raw)
To: Jan Beulich
Cc: David Vrabel, mingo, linux-mm, tglx, Boris Ostrovsky,
Konrad Rzeszutek Wilk, hpa
On Mon, Jun 02, 2014 at 04:27:52PM +0100, Jan Beulich wrote:
> >>> On 02.06.14 at 17:16, <kirill@shutemov.name> wrote:
> > On Mon, Jun 02, 2014 at 03:46:10PM +0100, Jan Beulich wrote:
> >> For cold page allocations using the normal clear_highpage() mechanism
> >> may be inefficient on certain architectures, namely due to needlessly
> >> replacing a good part of the data cache contents. Introduce an arch-
> >> overridable clear_cold_highpage() (using streaming non-temporal stores
> >> on x86, where an override gets implemented right away) to make use of
> >> in this specific case.
> >>
> >> Leverage the impovement in the Xen balloon driver, eliminating the
> >> explicit scrub_page() function.
> >
> > Any benchmark data?
> >
> > I've tried non-temporal stores to clear huge pages, but it didn't helped
> > much. I believe it can vary between micro-architectures, but we need
> > numbers. I've played with Westmere that time.
>
> It's not at all clear to me what to measure here - after all this isn't
> about improving the page clearing latency or throughput, but about
> avoiding to disturb other operations.
It would be nice to find a workload which benefits from the page allocator
not trashing the cache.
--
Kirill A. Shutemov
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] improve __GFP_COLD/__GFP_ZERO interaction
2014-06-02 14:46 [PATCH] improve __GFP_COLD/__GFP_ZERO interaction Jan Beulich
2014-06-02 15:16 ` Kirill A. Shutemov
@ 2014-06-02 15:54 ` David Vrabel
2014-06-04 16:03 ` Jan Beulich
1 sibling, 1 reply; 6+ messages in thread
From: David Vrabel @ 2014-06-02 15:54 UTC (permalink / raw)
To: Jan Beulich, linux-mm
Cc: mingo, tglx, Boris Ostrovsky, Konrad Rzeszutek Wilk, hpa
On 02/06/14 15:46, Jan Beulich wrote:
>
> --- 3.15-rc8/drivers/xen/balloon.c
> +++ 3.15-rc8-clear-cold-highpage/drivers/xen/balloon.c
Please split the Xen part out into a separate patch since this is a
useful cleanup either way.
David
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
* Re: [PATCH] improve __GFP_COLD/__GFP_ZERO interaction
2014-06-02 15:54 ` David Vrabel
@ 2014-06-04 16:03 ` Jan Beulich
0 siblings, 0 replies; 6+ messages in thread
From: Jan Beulich @ 2014-06-04 16:03 UTC (permalink / raw)
To: David Vrabel
Cc: mingo, linux-mm, tglx, Boris Ostrovsky, Konrad Rzeszutek Wilk,
hpa
>>> On 02.06.14 at 17:54, <david.vrabel@citrix.com> wrote:
> On 02/06/14 15:46, Jan Beulich wrote:
>>
>> --- 3.15-rc8/drivers/xen/balloon.c
>> +++ 3.15-rc8-clear-cold-highpage/drivers/xen/balloon.c
>
> Please split the Xen part out into a separate patch since this is a
> useful cleanup either way.
Actually I'm not convinced the Xen part alone is a good change: By
switching to __GFP_COLD allocations without using suitable special
cased memory scrubbing you'd blow good parts of your data cache
for no good reason, i.e. this quite likely would introduce a
performance regression. Which I wouldn't want to put my name
under.
Jan
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
^ permalink raw reply [flat|nested] 6+ messages in thread
end of thread, other threads:[~2014-06-04 16:03 UTC | newest]
Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2014-06-02 14:46 [PATCH] improve __GFP_COLD/__GFP_ZERO interaction Jan Beulich
2014-06-02 15:16 ` Kirill A. Shutemov
2014-06-02 15:27 ` Jan Beulich
2014-06-02 15:49 ` Kirill A. Shutemov
2014-06-02 15:54 ` David Vrabel
2014-06-04 16:03 ` Jan Beulich
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).