From: Mel Gorman <mgorman@suse.de>
To: Dave Hansen <dave@sr71.net>
Cc: x86@kernel.org, linux-kernel@vger.kernel.org, linux-mm@kvack.org,
akpm@linux-foundation.org, kirill.shutemov@linux.intel.com,
ak@linux.intel.com, riel@redhat.com, alex.shi@linaro.org,
dave.hansen@linux.intel.com
Subject: Re: [PATCH 4/6] x86: mm: trace tlb flushes
Date: Thu, 24 Apr 2014 11:14:20 +0100 [thread overview]
Message-ID: <20140424101419.GS23991@suse.de> (raw)
In-Reply-To: <20140421182425.93E696A3@viggo.jf.intel.com>
On Mon, Apr 21, 2014 at 11:24:25AM -0700, Dave Hansen wrote:
>
> From: Dave Hansen <dave.hansen@linux.intel.com>
>
> We don't have any good way to figure out what kinds of flushes
> are being attempted. Right now, we can try to use the vm
> counters, but those only tell us what we actually did with the
> hardware (one-by-one vs full) and don't tell us what was actually
> _requested_.
>
And when enabled they are a penalty even for those that don't care.
> This allows us to select out "interesting" TLB flushes that we
> might want to optimize (like the ranged ones) and ignore the ones
> that we have very little control over (the ones at context
> switch).
>
> Also, since we have a pair of tracepoint calls in
> flush_tlb_mm_range(), we can time the deltas between them to make
> sure that we got the "invlpg vs. global flush" balance correct in
> practice.
>
> Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
> ---
>
> b/arch/x86/include/asm/mmu_context.h | 6 +++++
> b/arch/x86/mm/tlb.c | 12 +++++++++--
> b/include/linux/mm_types.h | 10 +++++++++
> b/include/trace/events/tlb.h | 37 +++++++++++++++++++++++++++++++++++
> b/mm/Makefile | 2 -
> b/mm/trace_tlb.c | 12 +++++++++++
> 6 files changed, 76 insertions(+), 3 deletions(-)
>
> diff -puN arch/x86/include/asm/mmu_context.h~tlb-trace-flushes arch/x86/include/asm/mmu_context.h
> --- a/arch/x86/include/asm/mmu_context.h~tlb-trace-flushes 2014-04-21 11:10:35.519867746 -0700
> +++ b/arch/x86/include/asm/mmu_context.h 2014-04-21 11:10:35.527868108 -0700
> @@ -3,6 +3,10 @@
>
> #include <asm/desc.h>
> #include <linux/atomic.h>
> +#include <linux/mm_types.h>
> +
> +#include <trace/events/tlb.h>
> +
> #include <asm/pgalloc.h>
> #include <asm/tlbflush.h>
> #include <asm/paravirt.h>
> @@ -44,6 +48,7 @@ static inline void switch_mm(struct mm_s
>
> /* Re-load page tables */
> load_cr3(next->pgd);
> + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
>
> /* Stop flush ipis for the previous mm */
> cpumask_clear_cpu(cpu, mm_cpumask(prev));
> @@ -71,6 +76,7 @@ static inline void switch_mm(struct mm_s
> * to make sure to use no freed page tables.
> */
> load_cr3(next->pgd);
> + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
> load_LDT_nolock(&next->context);
> }
> }
> diff -puN arch/x86/mm/tlb.c~tlb-trace-flushes arch/x86/mm/tlb.c
> --- a/arch/x86/mm/tlb.c~tlb-trace-flushes 2014-04-21 11:10:35.520867791 -0700
> +++ b/arch/x86/mm/tlb.c 2014-04-21 11:10:35.528868153 -0700
> @@ -14,6 +14,8 @@
> #include <asm/uv/uv.h>
> #include <linux/debugfs.h>
>
> +#include <trace/events/tlb.h>
> +
> DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate)
> = { &init_mm, 0, };
>
> @@ -49,6 +51,7 @@ void leave_mm(int cpu)
> if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) {
> cpumask_clear_cpu(cpu, mm_cpumask(active_mm));
> load_cr3(swapper_pg_dir);
> + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
> }
> }
> EXPORT_SYMBOL_GPL(leave_mm);
> @@ -105,9 +108,10 @@ static void flush_tlb_func(void *info)
>
> count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
> if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) {
> - if (f->flush_end == TLB_FLUSH_ALL)
> + if (f->flush_end == TLB_FLUSH_ALL) {
> local_flush_tlb();
> - else if (!f->flush_end)
> + trace_tlb_flush(TLB_REMOTE_SHOOTDOWN, TLB_FLUSH_ALL);
> + } else if (!f->flush_end)
> __flush_tlb_single(f->flush_start);
> else {
> unsigned long addr;
Why is only the TLB_FLUSH_ALL case traced here and not the single flush
or range of flushes? __native_flush_tlb_single() doesn't have a trace
point, so I worry we are missing visibility on this part in particular.
while (addr < f->flush_end) {
__flush_tlb_single(addr);
addr += PAGE_SIZE;
}
> @@ -152,7 +156,9 @@ void flush_tlb_current_task(void)
> preempt_disable();
>
> count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
> + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
> local_flush_tlb();
> + trace_tlb_flush(TLB_LOCAL_SHOOTDOWN_DONE, TLB_FLUSH_ALL);
> if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
> flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
> preempt_enable();
Are the two tracepoints really useful? Are they fine-grained enough to
measure the cost of the TLB flush? It misses the refill obviously, but
there is not much we can do there.
> @@ -188,6 +194,7 @@ void flush_tlb_mm_range(struct mm_struct
> if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
> base_pages_to_flush = (end - start) >> PAGE_SHIFT;
>
> + trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN, base_pages_to_flush);
> if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
> base_pages_to_flush = TLB_FLUSH_ALL;
> count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
> @@ -199,6 +206,7 @@ void flush_tlb_mm_range(struct mm_struct
> __flush_tlb_single(addr);
> }
> }
> + trace_tlb_flush(TLB_LOCAL_MM_SHOOTDOWN_DONE, base_pages_to_flush);
> out:
> if (base_pages_to_flush == TLB_FLUSH_ALL) {
> start = 0UL;
> diff -puN include/linux/mm_types.h~tlb-trace-flushes include/linux/mm_types.h
> --- a/include/linux/mm_types.h~tlb-trace-flushes 2014-04-21 11:10:35.522867881 -0700
> +++ b/include/linux/mm_types.h 2014-04-21 11:10:35.529868198 -0700
> @@ -510,4 +510,14 @@ static inline void clear_tlb_flush_pendi
> }
> #endif
>
> +enum tlb_flush_reason {
> + TLB_FLUSH_ON_TASK_SWITCH,
> + TLB_REMOTE_SHOOTDOWN,
> + TLB_LOCAL_SHOOTDOWN,
> + TLB_LOCAL_SHOOTDOWN_DONE,
> + TLB_LOCAL_MM_SHOOTDOWN,
> + TLB_LOCAL_MM_SHOOTDOWN_DONE,
> + NR_TLB_FLUSH_REASONS,
> +};
> +
Bonus points if you use the string formatting similar to the reason field
in events/writeback.h. You do something like that already but there are
already helpers for use with __print_symbolic so you do not need to roll
your own version.
It should reduce the need to add trace_tlb.c if you include the header in
something like memory.c instead.
> #endif /* _LINUX_MM_TYPES_H */
> diff -puN /dev/null include/trace/events/tlb.h
> --- /dev/null 2014-04-10 11:28:14.066815724 -0700
> +++ b/include/trace/events/tlb.h 2014-04-21 11:10:35.529868198 -0700
> @@ -0,0 +1,37 @@
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM tlb
> +
> +#if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_TLB_H
> +
> +#include <linux/mm_types.h>
> +#include <linux/tracepoint.h>
> +
> +extern const char * const tlb_flush_reason_desc[];
> +
> +TRACE_EVENT(tlb_flush,
> +
> + TP_PROTO(int reason, unsigned long pages),
> + TP_ARGS(reason, pages),
> +
> + TP_STRUCT__entry(
> + __field( int, reason)
> + __field(unsigned long, pages)
> + ),
> +
> + TP_fast_assign(
> + __entry->reason = reason;
> + __entry->pages = pages;
> + ),
> +
> + TP_printk("pages: %ld reason: %d (%s)",
> + __entry->pages,
> + __entry->reason,
> + tlb_flush_reason_desc[__entry->reason])
> +);
> +
I would also suggest you match the output formatting with writeback.h
which would look like
pages:%lu reason:%s
The raw format should still have the integer while the string formatting
would have something human readable instead.
> +#endif /* _TRACE_TLB_H */
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> +
> diff -puN mm/Makefile~tlb-trace-flushes mm/Makefile
> --- a/mm/Makefile~tlb-trace-flushes 2014-04-21 11:10:35.524867971 -0700
> +++ b/mm/Makefile 2014-04-21 11:10:35.530868243 -0700
> @@ -5,7 +5,7 @@
> mmu-y := nommu.o
> mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
> mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
> - vmalloc.o pagewalk.o pgtable-generic.o
> + vmalloc.o pagewalk.o pgtable-generic.o trace_tlb.o
>
> ifdef CONFIG_CROSS_MEMORY_ATTACH
> mmu-$(CONFIG_MMU) += process_vm_access.o
> diff -puN /dev/null mm/trace_tlb.c
> --- /dev/null 2014-04-10 11:28:14.066815724 -0700
> +++ b/mm/trace_tlb.c 2014-04-21 11:10:35.530868243 -0700
> @@ -0,0 +1,12 @@
> +#define CREATE_TRACE_POINTS
> +#include <trace/events/tlb.h>
> +
> +const char * const tlb_flush_reason_desc[] = {
> + __stringify(TLB_FLUSH_ON_TASK_SWITCH),
> + __stringify(TLB_REMOTE_SHOOTDOWN),
> + __stringify(TLB_LOCAL_SHOOTDOWN),
> + __stringify(TLB_LOCAL_SHOOTDOWN_DONE),
> + __stringify(TLB_LOCAL_MM_SHOOTDOWN),
> + __stringify(TLB_LOCAL_MM_SHOOTDOWN_DONE),
> +};
> +
> _
--
Mel Gorman
SUSE Labs
--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org. For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>
next prev parent reply other threads:[~2014-04-24 10:14 UTC|newest]
Thread overview: 28+ messages / expand[flat|nested] mbox.gz Atom feed top
2014-04-21 18:24 [PATCH 0/6] x86: rework tlb range flushing code Dave Hansen
2014-04-21 18:24 ` [PATCH 1/6] x86: mm: clean up tlb " Dave Hansen
2014-04-22 16:53 ` Rik van Riel
2014-04-24 8:33 ` Mel Gorman
2014-04-21 18:24 ` [PATCH 2/6] x86: mm: rip out complicated, out-of-date, buggy TLB flushing Dave Hansen
2014-04-22 16:54 ` Rik van Riel
2014-04-24 8:45 ` Mel Gorman
2014-04-24 16:58 ` Dave Hansen
2014-04-24 18:00 ` Mel Gorman
2014-04-25 21:39 ` Dave Hansen
2014-04-21 18:24 ` [PATCH 3/6] x86: mm: fix missed global TLB flush stat Dave Hansen
2014-04-22 17:15 ` Rik van Riel
2014-04-24 8:49 ` Mel Gorman
2014-04-21 18:24 ` [PATCH 4/6] x86: mm: trace tlb flushes Dave Hansen
2014-04-22 21:19 ` Rik van Riel
2014-04-24 10:14 ` Mel Gorman [this message]
2014-04-24 20:42 ` Dave Hansen
2014-04-21 18:24 ` [PATCH 5/6] x86: mm: new tunable for single vs full TLB flush Dave Hansen
2014-04-22 21:31 ` Rik van Riel
2014-04-24 10:37 ` Mel Gorman
2014-04-24 17:25 ` Dave Hansen
2014-04-24 17:53 ` Rik van Riel
2014-04-24 22:03 ` Dave Hansen
2014-07-07 17:43 ` Dave Hansen
2014-07-08 0:43 ` Alex Shi
2014-04-21 18:24 ` [PATCH 6/6] x86: mm: set TLB flush tunable to sane value (33) Dave Hansen
2014-04-22 21:33 ` Rik van Riel
2014-04-24 10:46 ` Mel Gorman
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20140424101419.GS23991@suse.de \
--to=mgorman@suse.de \
--cc=ak@linux.intel.com \
--cc=akpm@linux-foundation.org \
--cc=alex.shi@linaro.org \
--cc=dave.hansen@linux.intel.com \
--cc=dave@sr71.net \
--cc=kirill.shutemov@linux.intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=linux-mm@kvack.org \
--cc=riel@redhat.com \
--cc=x86@kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).