From: Kan Liang <kan.liang@linux.intel.com>
To: peterz@infradead.org, acme@kernel.org, mingo@redhat.com,
linux-kernel@vger.kernel.org
Cc: mark.rutland@arm.com, alexander.shishkin@linux.intel.com,
jolsa@redhat.com, eranian@google.com, ak@linux.intel.com,
dave.hansen@intel.com, kirill.shutemov@linux.intel.com,
Kan Liang <kan.liang@linux.intel.com>
Subject: [PATCH V6 01/16] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE
Date: Mon, 10 Aug 2020 14:24:21 -0700 [thread overview]
Message-ID: <20200810212436.8026-2-kan.liang@linux.intel.com> (raw)
In-Reply-To: <20200810212436.8026-1-kan.liang@linux.intel.com>
Current perf can report both virtual addresses and physical addresses,
but not the page size. Without the page size information of the utilized
page, users cannot decide whether to promote/demote large pages to
optimize memory usage.
Add a new sample type for the data page size.
Current perf already has a facility to collect data virtual addresses.
A page walker is required to walk the pages tables and calculate the
page size from a given virtual address.
On some platforms, e.g., X86, the page walker is invoked in an NMI
handler. So the page walker must be IRQ-safe and low overhead. Besides,
the page walker should work for both user and kernel virtual address.
The existing generic page walker, e.g., walk_page_range_novma(), is a
little bit complex and doesn't guarantee the IRQ-safe. The follow_page()
is only for user-virtual address.
Add a new function perf_get_page_size() to walk the page tables and
calculate the page size. In the function:
- Interrupts have to be disabled to prevent any teardown of the page
tables.
- The size of a normal page is from the pre-defined page size macros.
- The size of a compound page is retrieved from the helper function,
page_size().
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Kan Liang <kan.liang@linux.intel.com>
---
include/linux/perf_event.h | 1 +
include/uapi/linux/perf_event.h | 4 +-
kernel/events/core.c | 121 ++++++++++++++++++++++++++++++++
3 files changed, 125 insertions(+), 1 deletion(-)
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3737e653f47e..5de95f36d7a8 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -1035,6 +1035,7 @@ struct perf_sample_data {
u64 phys_addr;
u64 cgroup;
+ u64 data_page_size;
} ____cacheline_aligned;
/* default value for data source */
diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index 52ca2093831c..32484accc7a3 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -143,8 +143,9 @@ enum perf_event_sample_format {
PERF_SAMPLE_PHYS_ADDR = 1U << 19,
PERF_SAMPLE_AUX = 1U << 20,
PERF_SAMPLE_CGROUP = 1U << 21,
+ PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22,
- PERF_SAMPLE_MAX = 1U << 22, /* non-ABI */
+ PERF_SAMPLE_MAX = 1U << 23, /* non-ABI */
__PERF_SAMPLE_CALLCHAIN_EARLY = 1ULL << 63, /* non-ABI; internal use */
};
@@ -879,6 +880,7 @@ enum perf_event_type {
* { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR
* { u64 size;
* char data[size]; } && PERF_SAMPLE_AUX
+ * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE
* };
*/
PERF_RECORD_SAMPLE = 9,
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 0fbf17ff7cc5..00becacfd15e 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -51,6 +51,7 @@
#include <linux/proc_ns.h>
#include <linux/mount.h>
#include <linux/min_heap.h>
+#include <linux/highmem.h>
#include "internal.h"
@@ -1895,6 +1896,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
if (sample_type & PERF_SAMPLE_CGROUP)
size += sizeof(data->cgroup);
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ size += sizeof(data->data_page_size);
+
event->header_size = size;
}
@@ -6939,6 +6943,9 @@ void perf_output_sample(struct perf_output_handle *handle,
if (sample_type & PERF_SAMPLE_CGROUP)
perf_output_put(handle, data->cgroup);
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ perf_output_put(handle, data->data_page_size);
+
if (sample_type & PERF_SAMPLE_AUX) {
perf_output_put(handle, data->aux_size);
@@ -6996,6 +7003,117 @@ static u64 perf_virt_to_phys(u64 virt)
return phys_addr;
}
+#ifdef CONFIG_MMU
+
+static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+{
+ struct page *page;
+ pgd_t *pgd;
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ pgd = pgd_offset(mm, addr);
+ if (pgd_none(*pgd))
+ return 0;
+
+ p4d = p4d_offset(pgd, addr);
+ if (!p4d_present(*p4d))
+ return 0;
+
+#if (defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE))
+ if (p4d_leaf(*p4d)) {
+ page = p4d_page(*p4d);
+
+ if (PageCompound(page))
+ return page_size(compound_head(page));
+
+ return P4D_SIZE;
+ }
+#endif
+
+ pud = pud_offset(p4d, addr);
+ if (!pud_present(*pud))
+ return 0;
+
+#if (defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE))
+ if (pud_leaf(*pud)) {
+ page = pud_page(*pud);
+
+ if (PageCompound(page))
+ return page_size(compound_head(page));
+
+ return PUD_SIZE;
+ }
+#endif
+
+ pmd = pmd_offset(pud, addr);
+ if (!pmd_present(*pmd))
+ return 0;
+
+#if (defined(CONFIG_HUGETLB_PAGE) || defined(CONFIG_TRANSPARENT_HUGEPAGE))
+ if (pmd_leaf(*pmd)) {
+ page = pmd_page(*pmd);
+
+ if (PageCompound(page))
+ return page_size(compound_head(page));
+
+ return PMD_SIZE;
+ }
+#endif
+
+ pte = pte_offset_map(pmd, addr);
+ if (!pte_present(*pte)) {
+ pte_unmap(pte);
+ return 0;
+ }
+
+ pte_unmap(pte);
+ return PAGE_SIZE;
+}
+
+#else
+
+static u64 __perf_get_page_size(struct mm_struct *mm, unsigned long addr)
+{
+ return 0;
+}
+
+#endif
+
+/* Return the page size of a given virtual address. */
+static u64 perf_get_page_size(unsigned long addr)
+{
+ struct mm_struct *mm;
+ unsigned long flags;
+ u64 size;
+
+ if (!addr)
+ return 0;
+
+ /*
+ * Software page-table walkers must disable IRQs,
+ * which prevents any tear down of the page tables.
+ */
+ local_irq_save(flags);
+
+ mm = current->mm;
+ if (!mm) {
+ /*
+ * For kernel threads and the like, use init_mm so that
+ * we can find kernel memory.
+ */
+ mm = &init_mm;
+ }
+
+ size = __perf_get_page_size(mm, addr);
+
+ local_irq_restore(flags);
+
+ return size;
+}
+
static struct perf_callchain_entry __empty_callchain = { .nr = 0, };
struct perf_callchain_entry *
@@ -7151,6 +7269,9 @@ void perf_prepare_sample(struct perf_event_header *header,
}
#endif
+ if (sample_type & PERF_SAMPLE_DATA_PAGE_SIZE)
+ data->data_page_size = perf_get_page_size(data->addr);
+
if (sample_type & PERF_SAMPLE_AUX) {
u64 size;
--
2.17.1
next prev parent reply other threads:[~2020-08-10 21:27 UTC|newest]
Thread overview: 32+ messages / expand[flat|nested] mbox.gz Atom feed top
2020-08-10 21:24 [PATCH V6 00/16] Add the page size in the perf record Kan Liang
2020-08-10 21:24 ` Kan Liang [this message]
2020-08-10 21:35 ` [PATCH V6 01/16] perf/core: Add PERF_SAMPLE_DATA_PAGE_SIZE Peter Zijlstra
2020-08-10 21:39 ` Peter Zijlstra
2020-08-10 22:36 ` Liang, Kan
2020-08-10 21:47 ` Dave Hansen
2020-08-10 22:38 ` Liang, Kan
2020-08-10 22:47 ` Peter Zijlstra
2020-08-12 13:39 ` Liang, Kan
2020-08-12 13:53 ` Dave Hansen
2020-08-10 21:24 ` [PATCH V6 02/16] perf/x86/intel: Support PERF_SAMPLE_DATA_PAGE_SIZE Kan Liang
2020-08-10 21:40 ` Peter Zijlstra
2020-08-10 22:36 ` Liang, Kan
2020-08-10 21:24 ` [PATCH V6 03/16] perf/core: Add support for PERF_SAMPLE_CODE_PAGE_SIZE Kan Liang
2020-08-10 21:41 ` Peter Zijlstra
2020-08-10 22:37 ` Liang, Kan
2020-08-10 22:44 ` Peter Zijlstra
2020-08-10 21:24 ` [PATCH V6 04/16] tools headers UAPI: Update tools's copy of linux/perf_event.h Kan Liang
2020-08-10 21:24 ` [PATCH V6 05/16] perf record: Support new sample type for data page size Kan Liang
2020-08-10 21:24 ` [PATCH V6 06/16] perf script: Use ULL for enum perf_output_field Kan Liang
2020-08-12 12:21 ` Arnaldo Carvalho de Melo
2020-08-12 13:42 ` Liang, Kan
2020-08-10 21:24 ` [PATCH V6 07/16] perf script: Support data page size Kan Liang
2020-08-10 21:24 ` [PATCH V6 08/16] perf sort: Add sort option for " Kan Liang
2020-08-10 21:24 ` [PATCH V6 09/16] perf mem: Factor out a function to generate sort order Kan Liang
2020-08-10 21:24 ` [PATCH V6 10/16] perf mem: Clean up output format Kan Liang
2020-08-10 21:24 ` [PATCH V6 11/16] perf mem: Support data page size Kan Liang
2020-08-10 21:24 ` [PATCH V6 12/16] perf test: Add test case for PERF_SAMPLE_DATA_PAGE_SIZE Kan Liang
2020-08-10 21:24 ` [PATCH V6 13/16] perf tools: Add support for PERF_SAMPLE_CODE_PAGE_SIZE Kan Liang
2020-08-10 21:24 ` [PATCH V6 14/16] perf script: " Kan Liang
2020-08-10 21:24 ` [PATCH V6 15/16] perf report: " Kan Liang
2020-08-10 21:24 ` [PATCH V6 16/16] perf test: Add test case " Kan Liang
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20200810212436.8026-2-kan.liang@linux.intel.com \
--to=kan.liang@linux.intel.com \
--cc=acme@kernel.org \
--cc=ak@linux.intel.com \
--cc=alexander.shishkin@linux.intel.com \
--cc=dave.hansen@intel.com \
--cc=eranian@google.com \
--cc=jolsa@redhat.com \
--cc=kirill.shutemov@linux.intel.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mark.rutland@arm.com \
--cc=mingo@redhat.com \
--cc=peterz@infradead.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox