From: Andrew Morton <akpm@linux-foundation.org>
To: mm-commits@vger.kernel.org, yuzhao@google.com,
willy@infradead.org, will@kernel.org, vbabka@suse.cz,
svens@linux.ibm.com, sj@kernel.org, dhowells@redhat.com,
david@redhat.com, dave@stgolabs.net, catalin.marinas@arm.com,
Liam.Howlett@Oracle.com, akpm@linux-foundation.org
Subject: [merged mm-stable] mm-mmap-change-do_brk_flags-to-expand-existing-vma-and-add-do_brk_munmap.patch removed from -mm tree
Date: Mon, 26 Sep 2022 19:49:00 -0700 [thread overview]
Message-ID: <20220927024901.1CB21C433C1@smtp.kernel.org> (raw)
The quilt patch titled
Subject: mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()
has been removed from the -mm tree. Its filename was
mm-mmap-change-do_brk_flags-to-expand-existing-vma-and-add-do_brk_munmap.patch
This patch was dropped because it was merged into the mm-stable branch
of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
------------------------------------------------------
From: "Liam R. Howlett" <Liam.Howlett@Oracle.com>
Subject: mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap()
Date: Tue, 6 Sep 2022 19:48:50 +0000
Avoid allocating a new VMA when it a vma modification can occur. When a
brk() can expand or contract a VMA, then the single store operation will
only modify one index of the maple tree instead of causing a node to split
or coalesce. This avoids unnecessary allocations/frees of maple tree
nodes and VMAs.
Move some limit & flag verifications out of the do_brk_flags() function to
use only relevant checks in the code path of bkr() and vm_brk_flags().
Set the vma to check if it can expand in vm_brk_flags() if extra criteria
are met.
Drop userfaultfd from do_brk_flags() path and only use it in
vm_brk_flags() path since that is the only place a munmap will happen.
Add a wraper for munmap for the brk case called do_brk_munmap().
Link: https://lkml.kernel.org/r/20220906194824.2110408-23-Liam.Howlett@oracle.com
Signed-off-by: Liam R. Howlett <Liam.Howlett@Oracle.com>
Tested-by: Yu Zhao <yuzhao@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: David Howells <dhowells@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org>
Cc: SeongJae Park <sj@kernel.org>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/mmap.c | 239 ++++++++++++++++++++++++++++++++++++++--------------
1 file changed, 178 insertions(+), 61 deletions(-)
--- a/mm/mmap.c~mm-mmap-change-do_brk_flags-to-expand-existing-vma-and-add-do_brk_munmap
+++ a/mm/mmap.c
@@ -147,17 +147,40 @@ static struct vm_area_struct *remove_vma
return next;
}
-static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags,
- struct list_head *uf);
+/*
+ * check_brk_limits() - Use platform specific check of range & verify mlock
+ * limits.
+ * @addr: The address to check
+ * @len: The size of increase.
+ *
+ * Return: 0 on success.
+ */
+static int check_brk_limits(unsigned long addr, unsigned long len)
+{
+ unsigned long mapped_addr;
+
+ mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
+ if (IS_ERR_VALUE(mapped_addr))
+ return mapped_addr;
+
+ return mlock_future_check(current->mm, current->mm->def_flags, len);
+}
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf);
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma,
+ unsigned long addr, unsigned long request,
+ unsigned long flags);
SYSCALL_DEFINE1(brk, unsigned long, brk)
{
unsigned long newbrk, oldbrk, origbrk;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *next;
+ struct vm_area_struct *brkvma, *next = NULL;
unsigned long min_brk;
bool populate;
bool downgraded = false;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, 0, 0);
if (mmap_write_lock_killable(mm))
return -EINTR;
@@ -199,35 +222,52 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Always allow shrinking brk.
- * __do_munmap() may downgrade mmap_lock to read.
+ * do_brk_munmap() may downgrade mmap_lock to read.
*/
if (brk <= mm->brk) {
int ret;
+ /* Search one past newbrk */
+ mas_set(&mas, newbrk);
+ brkvma = mas_find(&mas, oldbrk);
+ BUG_ON(brkvma == NULL);
+ if (brkvma->vm_start >= oldbrk)
+ goto out; /* mapping intersects with an existing non-brk vma. */
/*
- * mm->brk must to be protected by write mmap_lock so update it
- * before downgrading mmap_lock. When __do_munmap() fails,
- * mm->brk will be restored from origbrk.
+ * mm->brk must be protected by write mmap_lock.
+ * do_brk_munmap() may downgrade the lock, so update it
+ * before calling do_brk_munmap().
*/
mm->brk = brk;
- ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true);
- if (ret < 0) {
- mm->brk = origbrk;
- goto out;
- } else if (ret == 1) {
+ mas.last = oldbrk - 1;
+ ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf);
+ if (ret == 1) {
downgraded = true;
- }
- goto success;
+ goto success;
+ } else if (!ret)
+ goto success;
+
+ mm->brk = origbrk;
+ goto out;
}
- /* Check against existing mmap mappings. */
- next = find_vma(mm, oldbrk);
+ if (check_brk_limits(oldbrk, newbrk - oldbrk))
+ goto out;
+
+ /*
+ * Only check if the next VMA is within the stack_guard_gap of the
+ * expansion area
+ */
+ mas_set(&mas, oldbrk);
+ next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap);
if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
goto out;
+ brkvma = mas_prev(&mas, mm->start_brk);
/* Ok, looks good - let it rip. */
- if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0)
+ if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0)
goto out;
+
mm->brk = brk;
success:
@@ -2762,38 +2802,55 @@ out:
}
/*
- * this is really a simplified "do_mmap". it only handles
- * anonymous maps. eventually we may be able to do some
- * brk-specific accounting here.
- */
-static int do_brk_flags(unsigned long addr, unsigned long len,
- unsigned long flags, struct list_head *uf)
+ * brk_munmap() - Unmap a parital vma.
+ * @mas: The maple tree state.
+ * @vma: The vma to be modified
+ * @newbrk: the start of the address to unmap
+ * @oldbrk: The end of the address to unmap
+ * @uf: The userfaultfd list_head
+ *
+ * Returns: 1 on success.
+ * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if
+ * possible.
+ */
+static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long newbrk, unsigned long oldbrk,
+ struct list_head *uf)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
- pgoff_t pgoff = addr >> PAGE_SHIFT;
- int error;
- unsigned long mapped_addr;
- validate_mm_mt(mm);
-
- /* Until we need other flags, refuse anything except VM_EXEC. */
- if ((flags & (~VM_EXEC)) != 0)
- return -EINVAL;
- flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
-
- mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED);
- if (IS_ERR_VALUE(mapped_addr))
- return mapped_addr;
+ struct mm_struct *mm = vma->vm_mm;
+ int ret;
- error = mlock_future_check(mm, mm->def_flags, len);
- if (error)
- return error;
+ arch_unmap(mm, newbrk, oldbrk);
+ ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true);
+ validate_mm_mt(mm);
+ return ret;
+}
- /* Clear old maps, set up prev and uf */
- if (munmap_vma_range(mm, addr, len, &prev, uf))
- return -ENOMEM;
+/*
+ * do_brk_flags() - Increase the brk vma if the flags match.
+ * @mas: The maple tree state.
+ * @addr: The start address
+ * @len: The length of the increase
+ * @vma: The vma,
+ * @flags: The VMA Flags
+ *
+ * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags
+ * do not match then create a new anonymous VMA. Eventually we may be able to
+ * do some brk-specific accounting here.
+ */
+static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma,
+ unsigned long addr, unsigned long len,
+ unsigned long flags)
+{
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *prev = NULL;
- /* Check against address space limits *after* clearing old maps... */
+ validate_mm_mt(mm);
+ /*
+ * Check against address space limits by the changed size
+ * Note: This happens *after* clearing old mappings in some code paths.
+ */
+ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags;
if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT))
return -ENOMEM;
@@ -2803,30 +2860,54 @@ static int do_brk_flags(unsigned long ad
if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
return -ENOMEM;
- /* Can we just expand an old private anonymous mapping? */
- vma = vma_merge(mm, prev, addr, addr + len, flags,
- NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
- if (vma)
- goto out;
-
/*
- * create a vma struct for an anonymous mapping
+ * Expand the existing vma if possible; Note that singular lists do not
+ * occur after forking, so the expand will only happen on new VMAs.
*/
- vma = vm_area_alloc(mm);
- if (!vma) {
- vm_unacct_memory(len >> PAGE_SHIFT);
- return -ENOMEM;
+ if (vma &&
+ (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) &&
+ ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) {
+ mas->index = vma->vm_start;
+ mas->last = addr + len - 1;
+ vma_adjust_trans_huge(vma, addr, addr + len, 0);
+ if (vma->anon_vma) {
+ anon_vma_lock_write(vma->anon_vma);
+ anon_vma_interval_tree_pre_update_vma(vma);
+ }
+ vma->vm_end = addr + len;
+ vma->vm_flags |= VM_SOFTDIRTY;
+ if (mas_store_gfp(mas, vma, GFP_KERNEL))
+ goto mas_expand_failed;
+
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
+ khugepaged_enter_vma(vma, flags);
+ goto out;
}
+ prev = vma;
+
+ /* create a vma struct for an anonymous mapping */
+ vma = vm_area_alloc(mm);
+ if (!vma)
+ goto vma_alloc_fail;
vma_set_anonymous(vma);
vma->vm_start = addr;
vma->vm_end = addr + len;
- vma->vm_pgoff = pgoff;
+ vma->vm_pgoff = addr >> PAGE_SHIFT;
vma->vm_flags = flags;
vma->vm_page_prot = vm_get_page_prot(flags);
- if (vma_link(mm, vma, prev))
- goto no_vma_link;
+ mas_set_range(mas, vma->vm_start, addr + len - 1);
+ if (mas_store_gfp(mas, vma, GFP_KERNEL))
+ goto mas_store_fail;
+
+ if (!prev)
+ prev = mas_prev(mas, 0);
+ __vma_link_list(mm, vma, prev);
+ mm->map_count++;
out:
perf_event_mmap(vma);
mm->total_vm += len >> PAGE_SHIFT;
@@ -2837,18 +2918,29 @@ out:
validate_mm_mt(mm);
return 0;
-no_vma_link:
+mas_store_fail:
vm_area_free(vma);
+vma_alloc_fail:
+ vm_unacct_memory(len >> PAGE_SHIFT);
+ return -ENOMEM;
+
+mas_expand_failed:
+ if (vma->anon_vma) {
+ anon_vma_interval_tree_post_update_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
+ }
return -ENOMEM;
}
int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
{
struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma = NULL;
unsigned long len;
int ret;
bool populate;
LIST_HEAD(uf);
+ MA_STATE(mas, &mm->mm_mt, addr, addr);
len = PAGE_ALIGN(request);
if (len < request)
@@ -2859,13 +2951,38 @@ int vm_brk_flags(unsigned long addr, uns
if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_brk_flags(addr, len, flags, &uf);
+ /* Until we need other flags, refuse anything except VM_EXEC. */
+ if ((flags & (~VM_EXEC)) != 0)
+ return -EINVAL;
+
+ ret = check_brk_limits(addr, len);
+ if (ret)
+ goto limits_failed;
+
+ if (find_vma_intersection(mm, addr, addr + len))
+ ret = do_munmap(mm, addr, len, &uf);
+
+ if (ret)
+ goto munmap_failed;
+
+ vma = mas_prev(&mas, 0);
+ if (!vma || vma->vm_end != addr || vma_policy(vma) ||
+ !can_vma_merge_after(vma, flags, NULL, NULL,
+ addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL))
+ vma = NULL;
+
+ ret = do_brk_flags(&mas, vma, addr, len, flags);
populate = ((mm->def_flags & VM_LOCKED) != 0);
mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
return ret;
+
+munmap_failed:
+limits_failed:
+ mmap_write_unlock(mm);
+ return ret;
}
EXPORT_SYMBOL(vm_brk_flags);
_
Patches currently in -mm which might be from Liam.Howlett@Oracle.com are
reply other threads:[~2022-09-27 2:52 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220927024901.1CB21C433C1@smtp.kernel.org \
--to=akpm@linux-foundation.org \
--cc=Liam.Howlett@Oracle.com \
--cc=catalin.marinas@arm.com \
--cc=dave@stgolabs.net \
--cc=david@redhat.com \
--cc=dhowells@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mm-commits@vger.kernel.org \
--cc=sj@kernel.org \
--cc=svens@linux.ibm.com \
--cc=vbabka@suse.cz \
--cc=will@kernel.org \
--cc=willy@infradead.org \
--cc=yuzhao@google.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.