[RFC PATCH 2/5] x86/ibs: Drive NUMA balancing via IBS access data

All of lore.kernel.org
 help / color / mirror / Atom feed

From: Bharata B Rao <bharata@amd.com>
To: <linux-kernel@vger.kernel.org>, <linux-mm@kvack.org>
Cc: <mgorman@suse.de>, <peterz@infradead.org>, <mingo@redhat.com>,
	<bp@alien8.de>, <dave.hansen@linux.intel.com>, <x86@kernel.org>,
	<akpm@linux-foundation.org>, <luto@kernel.org>,
	<tglx@linutronix.de>, <yue.li@memverge.com>,
	<Ravikumar.Bangoria@amd.com>, Bharata B Rao <bharata@amd.com>
Subject: [RFC PATCH 2/5] x86/ibs: Drive NUMA balancing via IBS access data
Date: Wed, 8 Feb 2023 13:05:30 +0530	[thread overview]
Message-ID: <20230208073533.715-3-bharata@amd.com> (raw)
In-Reply-To: <20230208073533.715-1-bharata@amd.com>

Feed the page access data obtained from IBS to NUMA balancing
as hint fault equivalents. The existing per-task and per-group
fault stats are now built from IBS-provided page access information.
With this, it will not be necessary to scan the address space to
introduce NUMA hinting faults.

Use the task_work framework to process the IBS-sampled data. The
actual programming of IBS to generate page access information isn't
done yet.

Signed-off-by: Bharata B Rao <bharata@amd.com>
---
 arch/x86/mm/ibs.c             | 38 ++++++++++++++-
 include/linux/migrate.h       |  1 +
 include/linux/sched.h         |  1 +
 include/linux/vm_event_item.h |  1 +
 kernel/sched/fair.c           | 10 ++++
 mm/memory.c                   | 92 +++++++++++++++++++++++++++++++++++
 mm/vmstat.c                   |  1 +
 7 files changed, 143 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/ibs.c b/arch/x86/mm/ibs.c
index 411dba2a88d1..adbc587b1767 100644
--- a/arch/x86/mm/ibs.c
+++ b/arch/x86/mm/ibs.c
@@ -1,6 +1,8 @@
 // SPDX-License-Identifier: GPL-2.0
 
 #include <linux/init.h>
+#include <linux/migrate.h>
+#include <linux/task_work.h>
 
 #include <asm/nmi.h>
 #include <asm/perf_event.h> /* TODO: Move defns like IBS_OP_ENABLE into non-perf header */
@@ -8,12 +10,30 @@
 
 static u64 ibs_config __read_mostly;
 
+struct ibs_access_work {
+	struct callback_head work;
+	u64 laddr, paddr;
+};
+
+void task_ibs_access_work(struct callback_head *work)
+{
+	struct ibs_access_work *iwork = container_of(work, struct ibs_access_work, work);
+	struct task_struct *p = current;
+
+	u64 laddr = iwork->laddr;
+	u64 paddr = iwork->paddr;
+
+	kfree(iwork);
+	do_numa_access(p, laddr, paddr);
+}
+
 static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
 {
 	u64 ops_ctl, ops_data3, ops_data2;
 	u64 remote_access;
 	u64 laddr = -1, paddr = -1;
 	struct mm_struct *mm = current->mm;
+	struct ibs_access_work *iwork;
 
 	rdmsrl(MSR_AMD64_IBSOPCTL, ops_ctl);
 
@@ -86,8 +106,24 @@ static int ibs_overflow_handler(unsigned int cmd, struct pt_regs *regs)
 	/* Is phys addr valid? */
 	if (ops_data3 & MSR_AMD64_IBSOPDATA3_PADDR_VALID)
 		rdmsrl(MSR_AMD64_IBSDCPHYSAD, paddr);
-	else
+	else {
 		count_vm_event(IBS_PADDR_INVALID);
+		goto handled;
+	}
+
+	/*
+	 * TODO: Revisit GFP_ATOMIC; this presumably runs in NMI context.
+	 */
+	iwork = kzalloc(sizeof(*iwork), GFP_ATOMIC);
+	if (!iwork)
+		goto handled;
+
+	count_vm_event(IBS_USEFUL_SAMPLES);
+
+	iwork->laddr = laddr;
+	iwork->paddr = paddr;
+	init_task_work(&iwork->work, task_ibs_access_work);
+	task_work_add(current, &iwork->work, TWA_RESUME);
 
 handled:
 	return NMI_HANDLED;
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index 3ef77f52a4f0..4dcce7885b0c 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -216,6 +216,7 @@ void migrate_device_pages(unsigned long *src_pfns, unsigned long *dst_pfns,
 			unsigned long npages);
 void migrate_device_finalize(unsigned long *src_pfns,
 			unsigned long *dst_pfns, unsigned long npages);
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr);
 
 #endif /* CONFIG_MIGRATION */
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 853d08f7562b..19dd4ee07436 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2420,4 +2420,5 @@ static inline void sched_core_fork(struct task_struct *p) { }
 
 extern void sched_set_stop_task(int cpu, struct task_struct *stop);
 
+DECLARE_STATIC_KEY_FALSE(hw_access_hints);
 #endif
diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h
index 1d55e347d16c..2ccc7dee3c13 100644
--- a/include/linux/vm_event_item.h
+++ b/include/linux/vm_event_item.h
@@ -159,6 +159,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
 		IBS_LADDR_INVALID,
 		IBS_KERNEL_ADDR,
 		IBS_PADDR_INVALID,
+		IBS_USEFUL_SAMPLES,
 #endif
 #endif
 		NR_VM_EVENT_ITEMS
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0f8736991427..c9b9e62da779 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -47,6 +47,7 @@
 #include <linux/psi.h>
 #include <linux/ratelimit.h>
 #include <linux/task_work.h>
+#include <linux/migrate.h>
 
 #include <asm/switch_to.h>
 
@@ -3125,6 +3126,8 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
 	}
 }
 
+DEFINE_STATIC_KEY_FALSE(hw_access_hints);
+
 /*
  * Drive the periodic memory faults..
  */
@@ -3133,6 +3136,13 @@ static void task_tick_numa(struct rq *rq, struct task_struct *curr)
 	struct callback_head *work = &curr->numa_work;
 	u64 period, now;
 
+	/*
+	 * If we are using access hints from hardware (like using
+	 * IBS), don't scan the address space.
+	 */
+	if (static_branch_unlikely(&hw_access_hints))
+		return;
+
 	/*
 	 * We don't care about NUMA placement if we don't have memory.
 	 */
diff --git a/mm/memory.c b/mm/memory.c
index aad226daf41b..79096aba197c 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4668,6 +4668,98 @@ int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 	return mpol_misplaced(page, vma, addr);
 }
 
+/*
+ * Called from task_work context to act upon the page access.
+ *
+ * Physical address (provided by IBS) is used directly instead
+ * of walking the page tables to get to the PTE/page. Hence we
+ * don't check if PTE is writable for the TNF_NO_GROUP
+ * optimization, which means RO pages are considered for grouping.
+ */
+void do_numa_access(struct task_struct *p, u64 laddr, u64 paddr)
+{
+	struct mm_struct *mm = p->mm;
+	struct vm_area_struct *vma;
+	struct page *page = NULL;
+	int page_nid = NUMA_NO_NODE;
+	int last_cpupid;
+	int target_nid;
+	int flags = 0;
+
+	if (!mm)
+		return;
+
+	if (!mmap_read_trylock(mm))
+		return;
+
+	vma = find_vma(mm, laddr);
+	if (!vma)
+		goto out_unlock;
+
+	if (!vma_migratable(vma) || !vma_policy_mof(vma) ||
+		is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP))
+		goto out_unlock;
+
+	if (!vma->vm_mm ||
+	    (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ)))
+		goto out_unlock;
+
+	if (!vma_is_accessible(vma))
+		goto out_unlock;
+
+	page = pfn_to_online_page(PHYS_PFN(paddr));
+	if (!page || is_zone_device_page(page))
+		goto out_unlock;
+
+	if (unlikely(!PageLRU(page)))
+		goto out_unlock;
+
+	/* TODO: handle PTE-mapped THP */
+	if (PageCompound(page))
+		goto out_unlock;
+
+	/*
+	 * Flag if the page is shared between multiple address spaces. This
+	 * is later used when determining whether to group tasks together
+	 */
+	if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
+		flags |= TNF_SHARED;
+
+	last_cpupid = page_cpupid_last(page);
+	page_nid = page_to_nid(page);
+
+	/*
+	 * For memory tiering mode, cpupid of slow memory page is used
+	 * to record page access time.  So use default value.
+	 */
+	if ((sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) &&
+	    !node_is_toptier(page_nid))
+		last_cpupid = (-1 & LAST_CPUPID_MASK);
+	else
+		last_cpupid = page_cpupid_last(page);
+
+	target_nid = numa_migrate_prep(page, vma, laddr, page_nid, &flags);
+	if (target_nid == NUMA_NO_NODE) {
+		put_page(page);
+		goto out;
+	}
+
+	/* Migrate to the requested node */
+	if (migrate_misplaced_page(page, vma, target_nid)) {
+		page_nid = target_nid;
+		flags |= TNF_MIGRATED;
+	} else {
+		flags |= TNF_MIGRATE_FAIL;
+	}
+
+out:
+	if (page_nid != NUMA_NO_NODE)
+		task_numa_fault(last_cpupid, page_nid, 1, flags);
+
+out_unlock:
+	mmap_read_unlock(mm);
+}
+
 static vm_fault_t do_numa_page(struct vm_fault *vmf)
 {
 	struct vm_area_struct *vma = vmf->vma;
diff --git a/mm/vmstat.c b/mm/vmstat.c
index c7a9d0d9ade8..33738426ae48 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1408,6 +1408,7 @@ const char * const vmstat_text[] = {
 	"ibs_invalid_laddr",
 	"ibs_kernel_addr",
 	"ibs_invalid_paddr",
+	"ibs_useful_samples",
 #endif
 #endif
 #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */
-- 
2.25.1

next prev parent reply	other threads:[~2023-02-08  7:36 UTC|newest]

Thread overview: 33+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2023-02-08  7:35 [RFC PATCH 0/5] Memory access profiler(IBS) driven NUMA balancing Bharata B Rao
2023-02-08  7:35 ` [RFC PATCH 1/5] x86/ibs: In-kernel IBS driver for page access profiling Bharata B Rao
2023-02-08  7:35 ` Bharata B Rao [this message]
2023-02-08  7:35 ` [RFC PATCH 3/5] x86/ibs: Enable per-process IBS from sched switch path Bharata B Rao
2023-02-08  7:35 ` [RFC PATCH 4/5] x86/ibs: Adjust access faults sampling period Bharata B Rao
2023-02-08  7:35 ` [RFC PATCH 5/5] x86/ibs: Delay the collection of HW-provided access info Bharata B Rao
2023-02-08 18:03 ` [RFC PATCH 0/5] Memory access profiler(IBS) driven NUMA balancing Peter Zijlstra
2023-02-08 18:12   ` Dave Hansen
2023-02-09  6:04     ` Bharata B Rao
2023-02-09 14:28       ` Dave Hansen
2023-02-10  4:28         ` Bharata B Rao
2023-02-10  4:40           ` Dave Hansen
2023-02-10 15:10             ` Bharata B Rao
2023-02-09  5:57   ` Bharata B Rao
2023-02-13  2:56     ` Huang, Ying
2023-02-13  3:23       ` Bharata B Rao
2023-02-13  3:34         ` Huang, Ying
2023-02-13  3:26 ` Huang, Ying
2023-02-13  5:52   ` Bharata B Rao
2023-02-13  6:30     ` Huang, Ying
2023-02-14  4:55       ` Bharata B Rao
2023-02-15  6:07         ` Huang, Ying
2023-02-24  3:28           ` Bharata B Rao
2023-02-16  8:41         ` Bharata B Rao
2023-02-17  6:03           ` Huang, Ying
2023-02-24  3:36             ` Bharata B Rao
2023-02-27  7:54               ` Huang, Ying
2023-03-01 11:21                 ` Bharata B Rao
2023-03-02  8:10                   ` Huang, Ying
2023-03-03  5:25                     ` Bharata B Rao
2023-03-03  5:53                       ` Huang, Ying
2023-03-06 15:30                         ` Bharata B Rao
2023-03-07  2:33                           ` Huang, Ying

find likely ancestor, descendant, or conflicting patches for this message:
( dfblob:411dba2a88d dfblob:adbc587b176 dfblob:3ef77f52a4f
dfblob:4dcce7885b0 dfblob:853d08f7562 dfblob:19dd4ee0743
dfblob:1d55e347d16 dfblob:2ccc7dee3c1 dfblob:0f873699142
dfblob:c9b9e62da77 dfblob:aad226daf41 dfblob:79096aba197
dfblob:c7a9d0d9ade dfblob:33738426ae4 )
 OR (
bs:"[RFC PATCH 2/5] x86/ibs: Drive NUMA balancing via IBS access data" )
	(help)

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20230208073533.715-3-bharata@amd.com \
    --to=bharata@amd.com \
    --cc=Ravikumar.Bangoria@amd.com \
    --cc=akpm@linux-foundation.org \
    --cc=bp@alien8.de \
    --cc=dave.hansen@linux.intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-mm@kvack.org \
    --cc=luto@kernel.org \
    --cc=mgorman@suse.de \
    --cc=mingo@redhat.com \
    --cc=peterz@infradead.org \
    --cc=tglx@linutronix.de \
    --cc=x86@kernel.org \
    --cc=yue.li@memverge.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.