All of lore.kernel.org
 help / color / mirror / Atom feed
From: Saravanan D <saravanand@fb.com>
To: <x86@kernel.org>, <dave.hansen@linux.intel.com>,
	<luto@kernel.org>, <peterz@infradead.org>
Cc: <linux-kernel@vger.kernel.org>, <kernel-team@fb.com>,
	Saravanan D <saravanand@fb.com>
Subject: [PATCH V2] x86/mm: Tracking linear mapping split events
Date: Wed, 27 Jan 2021 09:51:24 -0800	[thread overview]
Message-ID: <20210127175124.3289879-1-saravanand@fb.com> (raw)
In-Reply-To: <bd157a11-8e6b-5f44-4d91-d99adb9f8686@intel.com>

A large number of hugepage splits in the linear mapping gives admins
a signal that helps narrow down sluggishness caused by TLB
misses/reloads.

To help with debugging, introduce monotonic lifetime hugepage split
event counts, accumulated since SYSTEM_RUNNING, to be displayed as part
of /proc/vmstat on x86 servers.

The lifetime split event information is displayed at the bottom of
/proc/vmstat:
....
swap_ra 0
swap_ra_hit 0
direct_map_2M_splits 139
direct_map_4M_splits 0
direct_map_1G_splits 7
nr_unstable 0
....

Ancillary debugfs split event counts are exported to userspace via
read-write endpoints: /sys/kernel/debug/x86/direct_map_[2M|4M|1G]_split

dmesg log when a user resets the debugfs split event count for
debugging:
....
[  232.470531] debugfs 2M Pages split event count(128) reset to 0
....

Tracing is one of the many lasting sources of huge page splits (lasting
because we don't coalesce split pages back): granular page
attribute/permission changes force the kernel to split code segments
mapped with huge pages into smaller ones, thereby increasing the
probability of TLB misses/reloads even after tracing has been stopped.

Signed-off-by: Saravanan D <saravanand@fb.com>
---
 arch/x86/mm/pat/set_memory.c  | 117 ++++++++++++++++++++++++++++++++++
 include/linux/vm_event_item.h |   8 +++
 mm/vmstat.c                   |   8 +++
 3 files changed, 133 insertions(+)

diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c
index 16f878c26667..97b6ef8dbd12 100644
--- a/arch/x86/mm/pat/set_memory.c
+++ b/arch/x86/mm/pat/set_memory.c
@@ -16,6 +16,8 @@
 #include <linux/pci.h>
 #include <linux/vmalloc.h>
 #include <linux/libnvdimm.h>
+#include <linux/vmstat.h>
+#include <linux/kernel.h>
 
 #include <asm/e820/api.h>
 #include <asm/processor.h>
@@ -76,6 +78,104 @@ static inline pgprot_t cachemode2pgprot(enum page_cache_mode pcm)
 
 #ifdef CONFIG_PROC_FS
 static unsigned long direct_pages_count[PG_LEVEL_NUM];
+static unsigned long split_page_event_count[PG_LEVEL_NUM];
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+/*
+ * debugfs direct_map_2M_split: a read returns the 2M split event count; a
+ * write of 0 logs the old count and resets it, other values get -EINVAL.
+ */
+static int direct_map_2M_split_set(void *data, u64 val)
+{
+	if (val != 0)
+		return -EINVAL;
+
+	pr_info("debugfs 2M Pages split event count(%lu) reset to 0\n",
+		split_page_event_count[PG_LEVEL_2M]);
+	split_page_event_count[PG_LEVEL_2M] = 0;
+
+	return 0;
+}
+
+static int direct_map_2M_split_get(void *data, u64 *val)
+{
+	*val = split_page_event_count[PG_LEVEL_2M];
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_2M_split, direct_map_2M_split_get,
+			 direct_map_2M_split_set, "%llu\n");
+#else
+/*
+ * debugfs direct_map_4M_split: 4M splits are counted in the PG_LEVEL_2M
+ * slot. A write of 0 logs and resets the count, other values get -EINVAL.
+ */
+static int direct_map_4M_split_set(void *data, u64 val)
+{
+	if (val != 0)
+		return -EINVAL;
+
+	pr_info("debugfs 4M Pages split event count(%lu) reset to 0\n",
+		split_page_event_count[PG_LEVEL_2M]);
+	split_page_event_count[PG_LEVEL_2M] = 0;
+
+	return 0;
+}
+
+static int direct_map_4M_split_get(void *data, u64 *val)
+{
+	*val = split_page_event_count[PG_LEVEL_2M];
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_4M_split, direct_map_4M_split_get,
+			 direct_map_4M_split_set, "%llu\n");
+#endif
+
+/*
+ * debugfs direct_map_1G_split: a read returns the 1G split event count; a
+ * write of 0 logs the old count and resets it, other values get -EINVAL.
+ */
+static int direct_map_1G_split_set(void *data, u64 val)
+{
+	if (val != 0)
+		return -EINVAL;
+
+	pr_info("debugfs 1G Pages split event count(%lu) reset to 0\n",
+		split_page_event_count[PG_LEVEL_1G]);
+	split_page_event_count[PG_LEVEL_1G] = 0;
+
+	return 0;
+}
+
+static int direct_map_1G_split_get(void *data, u64 *val)
+{
+	*val = split_page_event_count[PG_LEVEL_1G];
+	return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(fops_direct_map_1G_split, direct_map_1G_split_get,
+			 direct_map_1G_split_set, "%llu\n");
+
+/* Create the debugfs files that expose the direct map split counters. */
+static __init int direct_map_split_debugfs_init(void)
+{
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+	debugfs_create_file("direct_map_2M_split", 0600, arch_debugfs_dir,
+			    NULL, &fops_direct_map_2M_split);
+#else
+	debugfs_create_file("direct_map_4M_split", 0600, arch_debugfs_dir,
+			    NULL, &fops_direct_map_4M_split);
+#endif
+	/* The 1G file is only created when direct_gbpages is set. */
+	if (direct_gbpages)
+		debugfs_create_file("direct_map_1G_split", 0600,
+				    arch_debugfs_dir, NULL,
+				    &fops_direct_map_1G_split);
+	return 0;
+}
+
+late_initcall(direct_map_split_debugfs_init);
 
 void update_page_count(int level, unsigned long pages)
 {
@@ -85,12 +185,29 @@ void update_page_count(int level, unsigned long pages)
 	spin_unlock(&pgd_lock);
 }
 
+/* Count one split at @level; events before SYSTEM_RUNNING are ignored. */
+static void update_split_page_event_count(int level)
+{
+	if (system_state != SYSTEM_RUNNING)
+		return;
+	split_page_event_count[level]++;
+	if (level == PG_LEVEL_1G)
+		count_vm_event(DIRECT_MAP_1G_SPLIT);
+	else if (level == PG_LEVEL_2M)
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+		count_vm_event(DIRECT_MAP_2M_SPLIT);
+#else
+		count_vm_event(DIRECT_MAP_4M_SPLIT);
+#endif
+}
+
 static void split_page_count(int level)
 {
 	if (direct_pages_count[level] == 0)
 		return;
 
 	direct_pages_count[level]--;
+	update_split_page_event_count(level);
 	direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
 
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 18e75974d4e3..439742d2435e 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -120,6 +120,14 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 #ifdef CONFIG_SWAP
 		SWAP_RA,
 		SWAP_RA_HIT,
+#endif
+#if defined(__x86_64__)
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+		DIRECT_MAP_2M_SPLIT,
+#else
+		DIRECT_MAP_4M_SPLIT,
+#endif
+		DIRECT_MAP_1G_SPLIT,
 #endif
 		NR_VM_EVENT_ITEMS
 };
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f8942160fc95..beaa2bb4f9dc 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1350,6 +1350,14 @@ const char * const vmstat_text[] = {
 	"swap_ra",
 	"swap_ra_hit",
 #endif
+#if defined(__x86_64__)
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+	"direct_map_2M_splits",
+#else
+	"direct_map_4M_splits",
+#endif
+	"direct_map_1G_splits",
+#endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
 };
 #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA || CONFIG_MEMCG */
-- 
2.24.1


  parent reply	other threads:[~2021-01-27 17:53 UTC|newest]

Thread overview: 32+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
     [not found] <BYAPR01MB40856478D5BE74CB6A7D5578CFBD9@BYAPR01MB4085.prod.exchangelabs.com>
2021-01-25 20:15 ` [PATCH] x86/mm: Tracking linear mapping split events since boot Dave Hansen
2021-01-25 20:32   ` Tejun Heo
2021-01-26  0:47     ` Dave Hansen
2021-01-26  0:53       ` Tejun Heo
2021-01-26  1:04         ` Dave Hansen
2021-01-26  1:17           ` Tejun Heo
2021-01-27 17:51           ` Saravanan D [this message]
2021-01-27 21:03             ` [PATCH V2] x86/mm: Tracking linear mapping split events Tejun Heo
2021-01-27 21:32               ` Dave Hansen
2021-01-27 21:36                 ` Tejun Heo
2021-01-27 21:42                   ` Saravanan D
2021-01-27 22:50                   ` [PATCH V3] " Saravanan D
2021-01-27 23:00                     ` Randy Dunlap
2021-01-27 23:56                       ` Saravanan D
2021-01-27 23:41                     ` Dave Hansen
2021-01-28  0:15                       ` Saravanan D
2021-01-28  4:35                       ` [PATCH V4] " Saravanan D
2021-01-28  4:51                         ` Matthew Wilcox
2021-01-28 10:49                           ` [PATCH V5] " Saravanan D
2021-01-28 15:04                             ` Matthew Wilcox
2021-01-28 19:49                               ` Saravanan D
2021-01-28 16:33                             ` Zi Yan
2021-01-28 16:41                               ` Dave Hansen
2021-01-28 16:56                                 ` Zi Yan
2021-01-28 16:59                               ` Song Liu
2021-01-28 19:17                             ` Dave Hansen
2021-01-28 21:20                               ` Saravanan D
2021-01-28 23:34                                 ` [PATCH V6] " Saravanan D
2021-01-28 23:41                                   ` Tejun Heo
2021-01-29 19:27                                   ` Johannes Weiner
2021-02-08 23:17                                     ` Saravanan D
2021-02-08 23:30                                   ` Dave Hansen

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20210127175124.3289879-1-saravanand@fb.com \
    --to=saravanand@fb.com \
    --cc=dave.hansen@linux.intel.com \
    --cc=kernel-team@fb.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=luto@kernel.org \
    --cc=peterz@infradead.org \
    --cc=x86@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.