From: Stefan Roesch <shr@devkernel.io>
To: kernel-team@fb.com
Cc: shr@devkernel.io, linux-mm@kvack.org, riel@surriel.com,
mhocko@suse.com, david@redhat.com,
linux-kselftest@vger.kernel.org, linux-doc@vger.kernel.org,
akpm@linux-foundation.org, hannes@cmpxchg.org
Subject: [PATCH v3 1/3] mm: add new api to enable ksm per process
Date: Thu, 23 Feb 2023 20:39:58 -0800
Message-ID: <20230224044000.3084046-2-shr@devkernel.io>
In-Reply-To: <20230224044000.3084046-1-shr@devkernel.io>
This adds a new prctl API to enable and disable KSM on a per-process
basis, instead of only on a per-VMA basis (with madvise).
1) Introduce new MMF_VM_MERGE_ANY flag
This introduces the new flag MMF_VM_MERGE_ANY. When this flag is set,
kernel samepage merging (KSM) gets enabled for all VMAs of a process.
2) add flag to __ksm_enter
This change adds a flag parameter to __ksm_enter(). This allows
distinguishing whether KSM was enabled by prctl or madvise.
3) add flag to __ksm_exit call
This adds a flag parameter to the __ksm_exit() call. This allows
distinguishing whether the call is for a prctl or madvise invocation.
4) invoke madvise for all vmas in scan_get_next_rmap_item
If the new flag MMF_VM_MERGE_ANY has been set for a process, iterate
over all its VMAs and enable KSM where possible. For each VMA that can
be KSM-enabled, this is done only once.
5) support disabling of ksm for a process
This adds the ability to disable KSM for a process if KSM has
previously been enabled for it.
6) add new prctl option to get and set ksm for a process
This adds two new options to the prctl system call:
- enable KSM for all VMAs of a process (if the VMAs support it),
- query whether KSM has been enabled for a process.
A minimal userspace sketch of the interface is shown below.
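The sketch is for illustration only, not part of the patch itself. It
assumes the PR_SET_MEMORY_MERGE/PR_GET_MEMORY_MERGE values defined in
this patch and a caller with CAP_SYS_RESOURCE:

  #include <stdio.h>
  #include <sys/prctl.h>

  #ifndef PR_SET_MEMORY_MERGE
  #define PR_SET_MEMORY_MERGE 67
  #define PR_GET_MEMORY_MERGE 68
  #endif

  int main(void)
  {
          /* Enable KSM for all current and future mergeable VMAs of this process. */
          if (prctl(PR_SET_MEMORY_MERGE, 1, 0, 0, 0))
                  perror("PR_SET_MEMORY_MERGE");

          /* Query the per-process KSM state: 1 if enabled, 0 otherwise. */
          printf("KSM enabled: %d\n", prctl(PR_GET_MEMORY_MERGE, 0, 0, 0, 0));

          /* Disable again; merged VMAs are unmerged via MADV_UNMERGEABLE. */
          if (prctl(PR_SET_MEMORY_MERGE, 0, 0, 0, 0))
                  perror("PR_SET_MEMORY_MERGE");

          return 0;
  }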
Signed-off-by: Stefan Roesch <shr@devkernel.io>
---
include/linux/ksm.h | 14 ++++---
include/linux/sched/coredump.h | 1 +
include/uapi/linux/prctl.h | 2 +
kernel/sys.c | 29 +++++++++++++++
mm/ksm.c | 67 ++++++++++++++++++++++++++++++----
5 files changed, 101 insertions(+), 12 deletions(-)
diff --git a/include/linux/ksm.h b/include/linux/ksm.h
index 7e232ba59b86..d38a05a36298 100644
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -18,20 +18,24 @@
#ifdef CONFIG_KSM
int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
unsigned long end, int advice, unsigned long *vm_flags);
-int __ksm_enter(struct mm_struct *mm);
-void __ksm_exit(struct mm_struct *mm);
+int __ksm_enter(struct mm_struct *mm, int flag);
+void __ksm_exit(struct mm_struct *mm, int flag);
static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm)
{
+ if (test_bit(MMF_VM_MERGE_ANY, &oldmm->flags))
+ return __ksm_enter(mm, MMF_VM_MERGE_ANY);
if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags))
- return __ksm_enter(mm);
+ return __ksm_enter(mm, MMF_VM_MERGEABLE);
return 0;
}
static inline void ksm_exit(struct mm_struct *mm)
{
- if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
- __ksm_exit(mm);
+ if (test_bit(MMF_VM_MERGE_ANY, &mm->flags))
+ __ksm_exit(mm, MMF_VM_MERGE_ANY);
+ else if (test_bit(MMF_VM_MERGEABLE, &mm->flags))
+ __ksm_exit(mm, MMF_VM_MERGEABLE);
}
/*
diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h
index 0e17ae7fbfd3..0ee96ea7a0e9 100644
--- a/include/linux/sched/coredump.h
+++ b/include/linux/sched/coredump.h
@@ -90,4 +90,5 @@ static inline int get_dumpable(struct mm_struct *mm)
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\
MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK)
+#define MMF_VM_MERGE_ANY 29
#endif /* _LINUX_SCHED_COREDUMP_H */
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 1312a137f7fb..759b3f53e53f 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -290,4 +290,6 @@ struct prctl_mm_map {
#define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0
+#define PR_SET_MEMORY_MERGE 67
+#define PR_GET_MEMORY_MERGE 68
#endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index b3cab94545ed..495bab3ed2ad 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -15,6 +15,7 @@
#include <linux/highuid.h>
#include <linux/fs.h>
#include <linux/kmod.h>
+#include <linux/ksm.h>
#include <linux/perf_event.h>
#include <linux/resource.h>
#include <linux/kernel.h>
@@ -2659,6 +2660,34 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;
+#ifdef CONFIG_KSM
+ case PR_SET_MEMORY_MERGE:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (arg2) {
+ if (mmap_write_lock_killable(me->mm))
+ return -EINTR;
+
+ if (test_bit(MMF_VM_MERGEABLE, &me->mm->flags))
+ error = -EINVAL;
+ else if (!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags))
+ error = __ksm_enter(me->mm, MMF_VM_MERGE_ANY);
+ mmap_write_unlock(me->mm);
+ } else {
+ __ksm_exit(me->mm, MMF_VM_MERGE_ANY);
+ }
+ break;
+ case PR_GET_MEMORY_MERGE:
+ if (!capable(CAP_SYS_RESOURCE))
+ return -EPERM;
+
+ if (arg2 || arg3 || arg4 || arg5)
+ return -EINVAL;
+
+ error = !!test_bit(MMF_VM_MERGE_ANY, &me->mm->flags);
+ break;
+#endif
default:
error = -EINVAL;
break;
diff --git a/mm/ksm.c b/mm/ksm.c
index 56808e3bfd19..23d6944f78ad 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1063,6 +1063,7 @@ static int unmerge_and_remove_all_rmap_items(void)
mm_slot_free(mm_slot_cache, mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
mmdrop(mm);
} else
spin_unlock(&ksm_mmlist_lock);
@@ -2329,6 +2330,17 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot,
return rmap_item;
}
+static bool vma_ksm_mergeable(struct vm_area_struct *vma)
+{
+ if (vma->vm_flags & VM_MERGEABLE)
+ return true;
+
+ if (test_bit(MMF_VM_MERGE_ANY, &vma->vm_mm->flags))
+ return true;
+
+ return false;
+}
+
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
@@ -2405,8 +2417,20 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
goto no_vmas;
for_each_vma(vmi, vma) {
- if (!(vma->vm_flags & VM_MERGEABLE))
+ if (!vma_ksm_mergeable(vma))
continue;
+ if (!(vma->vm_flags & VM_MERGEABLE)) {
+ unsigned long flags = vma->vm_flags;
+
+ /* madvise failed, use next vma */
+ if (ksm_madvise(vma, vma->vm_start, vma->vm_end, MADV_MERGEABLE, &flags))
+ continue;
+ /* vma not supported as being mergeable */
+ if (!(flags & VM_MERGEABLE))
+ continue;
+
+ vm_flags_set(vma, VM_MERGEABLE);
+ }
if (ksm_scan.address < vma->vm_start)
ksm_scan.address = vma->vm_start;
if (!vma->anon_vma)
@@ -2491,6 +2515,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
mm_slot_free(mm_slot_cache, mm_slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
mmap_read_unlock(mm);
mmdrop(mm);
} else {
@@ -2595,8 +2620,9 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
return 0;
#endif
- if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) {
- err = __ksm_enter(mm);
+ if (!test_bit(MMF_VM_MERGEABLE, &mm->flags) &&
+ !test_bit(MMF_VM_MERGE_ANY, &mm->flags)) {
+ err = __ksm_enter(mm, MMF_VM_MERGEABLE);
if (err)
return err;
}
@@ -2622,7 +2648,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
}
EXPORT_SYMBOL_GPL(ksm_madvise);
-int __ksm_enter(struct mm_struct *mm)
+int __ksm_enter(struct mm_struct *mm, int flag)
{
struct ksm_mm_slot *mm_slot;
struct mm_slot *slot;
@@ -2655,7 +2681,7 @@ int __ksm_enter(struct mm_struct *mm)
list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node);
spin_unlock(&ksm_mmlist_lock);
- set_bit(MMF_VM_MERGEABLE, &mm->flags);
+ set_bit(flag, &mm->flags);
mmgrab(mm);
if (needs_wakeup)
@@ -2664,12 +2690,39 @@ int __ksm_enter(struct mm_struct *mm)
return 0;
}
-void __ksm_exit(struct mm_struct *mm)
+static void unmerge_vmas(struct mm_struct *mm)
+{
+ struct vm_area_struct *vma;
+ struct vma_iterator vmi;
+
+ vma_iter_init(&vmi, mm, 0);
+
+ mmap_read_lock(mm);
+ for_each_vma(vmi, vma) {
+ if (vma->vm_flags & VM_MERGEABLE) {
+ unsigned long flags = vma->vm_flags;
+
+ if (ksm_madvise(vma, vma->vm_start, vma->vm_end, MADV_UNMERGEABLE, &flags))
+ continue;
+
+ vm_flags_clear(vma, VM_MERGEABLE);
+ }
+ }
+ mmap_read_unlock(mm);
+}
+
+void __ksm_exit(struct mm_struct *mm, int flag)
{
struct ksm_mm_slot *mm_slot;
struct mm_slot *slot;
int easy_to_free = 0;
+ if (!(current->flags & PF_EXITING) && flag == MMF_VM_MERGE_ANY &&
+ test_bit(MMF_VM_MERGE_ANY, &mm->flags)) {
+ clear_bit(MMF_VM_MERGE_ANY, &mm->flags);
+ unmerge_vmas(mm);
+ }
+
/*
* This process is exiting: if it's straightforward (as is the
* case when ksmd was never running), free mm_slot immediately.
@@ -2696,7 +2749,7 @@ void __ksm_exit(struct mm_struct *mm)
if (easy_to_free) {
mm_slot_free(mm_slot_cache, mm_slot);
- clear_bit(MMF_VM_MERGEABLE, &mm->flags);
+ clear_bit(flag, &mm->flags);
mmdrop(mm);
} else if (mm_slot) {
mmap_write_lock(mm);
--
2.30.2