From: Stefan Roesch <shr@devkernel.io>
To: kernel-team@fb.com
Cc: shr@devkernel.io, linux-mm@kvack.org, riel@surriel.com,
	mhocko@suse.com, david@redhat.com,
	linux-kselftest@vger.kernel.org, linux-doc@vger.kernel.org,
	akpm@linux-foundation.org, hannes@cmpxchg.org
Subject: [PATCH v3 1/3] mm: add new api to enable ksm per process
Date: Thu, 23 Feb 2023 20:39:58 -0800	[thread overview]
Message-ID: <20230224044000.3084046-2-shr@devkernel.io> (raw)
In-Reply-To: <20230224044000.3084046-1-shr@devkernel.io>

This adds a new prctl API to enable and disable KSM on a per-process
basis, instead of only per VMA (with madvise).
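
For context, this is how KSM is enabled today, one VMA at a time, via
madvise(2). A minimal userspace sketch, not part of this patch:

  #include <stdio.h>
  #include <sys/mman.h>

  int main(void)
  {
  	size_t len = 16 * 4096;
  	/* Anonymous mapping we would like ksmd to scan. */
  	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
  			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

  	if (buf == MAP_FAILED) {
  		perror("mmap");
  		return 1;
  	}

  	/* Existing interface: opt in a single VMA. */
  	if (madvise(buf, len, MADV_MERGEABLE))
  		perror("madvise(MADV_MERGEABLE)");

  	return 0;
  }

The new prctl removes the need for this per-mapping call when a whole
process should participate in KSM.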

1) Introduce new MMF_VM_MERGE_ANY flag

This introduces the new flag MMF_VM_MERGE_ANY. When this flag is set,
kernel samepage merging (KSM) gets enabled for all VMAs of a process.

2) add flag to __ksm_enter

This change adds a flag parameter to __ksm_enter(). This makes it
possible to distinguish whether KSM was enabled via prctl or madvise.

3) add flag to __ksm_exit call

This adds a flag parameter to the __ksm_exit() call. This makes it
possible to distinguish whether the call comes from a prctl or a
madvise invocation.

4) invoke madvise for all vmas in scan_get_next_rmap_item

If the new flag MMF_VM_MERGE_ANY has been set for a process, iterate
over all of its VMAs and enable KSM where possible. For each VMA that
can be KSM-enabled, this is done only once.

5) support disabling of ksm for a process

This adds the ability to disable KSM for a process after it has been
enabled for that process.

6) add new prctl option to get and set ksm for a process

This adds two new options to the prctl system call (usage sketched
below):
- enable KSM for all VMAs of a process (if the VMAs support it),
- query whether KSM has been enabled for a process.
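
A minimal userspace sketch of the new interface (the PR_* values below
are the ones added by this patch; the fallback defines are only needed
until the updated uapi header is installed):

  #include <stdio.h>
  #include <sys/prctl.h>

  #ifndef PR_SET_MEMORY_MERGE
  #define PR_SET_MEMORY_MERGE	67
  #define PR_GET_MEMORY_MERGE	68
  #endif

  int main(void)
  {
  	/* Opt the whole process into KSM; needs CAP_SYS_RESOURCE. */
  	if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
  		perror("PR_SET_MEMORY_MERGE");

  	/* Returns 1 if KSM is enabled for this process, 0 otherwise. */
  	printf("memory merge: %d\n", prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));

  	/* Opt out again; mergeable VMAs get unmerged. */
  	if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0))
  		perror("PR_SET_MEMORY_MERGE");

  	return 0;
  }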

Signed-off-by: Stefan Roesch <shr@devkernel.io>
---
 include/linux/ksm.h            | 14 ++++---
 include/linux/sched/coredump.h |  1 +
 include/uapi/linux/prctl.h     |  2 +
 kernel/sys.c                   | 29 +++++++++++++++
 mm/ksm.c                       | 67 ++++++++++++++++++++++++++++++----
 5 files changed, 101 insertions(+), 12 deletions(-)

diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7e232ba59b86..d38a05a36298 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -18,20 +18,24 @@
 #ifdef CONFIG_KSM
 int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		unsigned long end, int advice, unsigned long *vm_flags);
-int __ksm_enter(struct mm_struct *mm);
-void __ksm_exit(struct mm_struct *mm);
+int __ksm_enter(struct mm_struct *mm, int flag);
+void __ksm_exit(struct mm_struct *mm, int flag);
 
 static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
 {
+	if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags))
+		return __ksm_enter(mm, MMF_VM_MERGE_ANY);
 	if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
-		return __ksm_enter(mm);
+		return __ksm_enter(mm, MMF_VM_MERGEABLE);
 	return 0;
 }
 
 static inline void ksm_exit(struct mm_struct *mm)
 {
-	if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
-		__ksm_exit(mm);
+	if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+		__ksm_exit(mm, MMF_VM_MERGE_ANY);
+	else if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
+		__ksm_exit(mm, MMF_VM_MERGEABLE);
 }
 
 /*
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 0e17ae7fbfd3..0ee96ea7a0e9 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -90,4 +90,5 @@ static inline int get_dumpable(struct mm_struct *mm)
 #define MMF_INIT_MASK		(MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
 				 MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
 
+#define MMF_VM_MERGE_ANY	29
 #endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 1312a137f7fb..759b3f53e53f 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -290,4 +290,6 @@ struct prctl_mm_map {
 #define PR_SET_VMA		0x53564d41
 # define PR_SET_VMA_ANON_NAME		0
 
+#define PR_SET_MEMORY_MERGE		67
+#define PR_GET_MEMORY_MERGE		68
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index b3cab94545ed..495bab3ed2ad 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -15,6 +15,7 @@
 #include <linux/highuid.h>
 #include <linux/fs.h>
 #include <linux/kmod.h>
+#include <linux/ksm.h>
 #include <linux/perf_event.h>
 #include <linux/resource.h>
 #include <linux/kernel.h>
@@ -2659,6 +2660,34 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_SET_VMA:
 		error = prctl_set_vma(arg2, arg3, arg4, arg5);
 		break;
+#ifdef CONFIG_KSM
+	case PR_SET_MEMORY_MERGE:
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		if (arg2) {
+			if (mmap_write_lock_killable(me->mm))
+				return -EINTR;
+
+			if (test_bit(MMF_VM_MERGEABLE, &me->mm->flags))
+				error = -EINVAL;
+			else if (!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags))
+				error = __ksm_enter(me->mm, MMF_VM_MERGE_ANY);
+			mmap_write_unlock(me->mm);
+		} else {
+			__ksm_exit(me->mm, MMF_VM_MERGE_ANY);
+		}
+		break;
+	case PR_GET_MEMORY_MERGE:
+		if (!capable(CAP_SYS_RESOURCE))
+			return -EPERM;
+
+		if (arg2 || arg3 || arg4 || arg5)
+			return -EINVAL;
+
+		error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
+		break;
+#endif
 	default:
 		error = -EINVAL;
 		break;
diff --git a/mm/ksm.c b/mm/ksm.c
index 56808e3bfd19..23d6944f78ad 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1063,6 +1063,7 @@ static int unmerge_and_remove_all_rmap_items(void)
 
 			mm_slot_free(mm_slot_cache, mm_slot);
 			clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+			clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
 			mmdrop(mm);
 		} else
 			spin_unlock(&ksm_mmlist_lock);
@@ -2329,6 +2330,17 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
 	return rmap_item;
 }
 
+static bool vma_ksm_mergeable(struct vm_area_struct *vma)
+{
+	if (vma->vm_flags & VM_MERGEABLE)
+		return true;
+
+	if (test_bit(MMF_VM_MERGE_ANY, &vma->vm_mm->flags))
+		return true;
+
+	return false;
+}
+
 static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 {
 	struct mm_struct *mm;
@@ -2405,8 +2417,20 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 		goto no_vmas;
 
 	for_each_vma(vmi, vma) {
-		if (!(vma->vm_flags & VM_MERGEABLE))
+		if (!vma_ksm_mergeable(vma))
 			continue;
+		if (!(vma->vm_flags & VM_MERGEABLE)) {
+			unsigned long flags = vma->vm_flags;
+
+			/* madvise failed, use next vma */
+			if (ksm_madvise(vma, vma->vm_start, vma->vm_end, MADV_MERGEABLE, &flags))
+				continue;
+			/* vma not supported as being mergeable */
+			if (!(flags & VM_MERGEABLE))
+				continue;
+
+			vm_flags_set(vma, VM_MERGEABLE);
+		}
 		if (ksm_scan.address < vma->vm_start)
 			ksm_scan.address = vma->vm_start;
 		if (!vma->anon_vma)
@@ -2491,6 +2515,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
 
 		mm_slot_free(mm_slot_cache, mm_slot);
 		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
 		mmap_read_unlock(mm);
 		mmdrop(mm);
 	} else {
@@ -2595,8 +2620,9 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 			return 0;
 #endif
 
-		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
-			err = __ksm_enter(mm);
+		if (!test_bit(MMF_VM_MERGEABLE, &mm->flags) &&
+		    !test_bit(MMF_VM_MERGE_ANY, &mm->flags)) {
+			err = __ksm_enter(mm, MMF_VM_MERGEABLE);
 			if (err)
 				return err;
 		}
@@ -2622,7 +2648,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 }
 EXPORT_SYMBOL_GPL(ksm_madvise);
 
-int __ksm_enter(struct mm_struct *mm)
+int __ksm_enter(struct mm_struct *mm, int flag)
 {
 	struct ksm_mm_slot *mm_slot;
 	struct mm_slot *slot;
@@ -2655,7 +2681,7 @@ int __ksm_enter(struct mm_struct *mm)
 		list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
 	spin_unlock(&ksm_mmlist_lock);
 
-	set_bit(MMF_VM_MERGEABLE, &mm->flags);
+	set_bit(flag, &mm->flags);
 	mmgrab(mm);
 
 	if (needs_wakeup)
@@ -2664,12 +2690,39 @@ int __ksm_enter(struct mm_struct *mm)
 	return 0;
 }
 
-void __ksm_exit(struct mm_struct *mm)
+static void unmerge_vmas(struct mm_struct *mm)
+{
+	struct vm_area_struct *vma;
+	struct vma_iterator vmi;
+
+	vma_iter_init(&vmi, mm, 0);
+
+	mmap_read_lock(mm);
+	for_each_vma(vmi, vma) {
+		if (vma->vm_flags & VM_MERGEABLE) {
+			unsigned long flags = vma->vm_flags;
+
+			if (ksm_madvise(vma, vma->vm_start, vma->vm_end, MADV_UNMERGEABLE, &flags))
+				continue;
+
+			vm_flags_clear(vma, VM_MERGEABLE);
+		}
+	}
+	mmap_read_unlock(mm);
+}
+
+void __ksm_exit(struct mm_struct *mm, int flag)
 {
 	struct ksm_mm_slot *mm_slot;
 	struct mm_slot *slot;
 	int easy_to_free = 0;
 
+	if (!(current->flags & PF_EXITING) && flag == MMF_VM_MERGE_ANY &&
+		test_bit(MMF_VM_MERGE_ANY, &mm->flags)) {
+		clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+		unmerge_vmas(mm);
+	}
+
 	/*
 	 * This process is exiting: if it's straightforward (as is the
 	 * case when ksmd was never running), free mm_slot immediately.
@@ -2696,7 +2749,7 @@ void __ksm_exit(struct mm_struct *mm)
 
 	if (easy_to_free) {
 		mm_slot_free(mm_slot_cache, mm_slot);
-		clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+		clear_bit(flag, &mm->flags);
 		mmdrop(mm);
 	} else if (mm_slot) {
 		mmap_write_lock(mm);
-- 
2.30.2


