From: Andrew Morton <akpm@linux-foundation.org>
To: mm-commits@vger.kernel.org, zhengqi.arch@bytedance.com,
willy@infradead.org, song.bao.hua@hisilicon.com,
osalvador@suse.de, mike.kravetz@oracle.com, mhocko@suse.com,
fam.zheng@bytedance.com, duanxiongchun@bytedance.com,
david@redhat.com, corbet@lwn.net, chenhuang5@huawei.com,
bodeddub@amazon.com, songmuchun@bytedance.com,
akpm@linux-foundation.org
Subject: [merged] mm-sparsemem-use-page-table-lock-to-protect-kernel-pmd-operations.patch removed from -mm tree
Date: Thu, 24 Mar 2022 18:31:57 -0700 [thread overview]
Message-ID: <20220325013158.53813C340ED@smtp.kernel.org> (raw)
The patch titled
Subject: mm: sparsemem: use page table lock to protect kernel pmd operations
has been removed from the -mm tree. Its filename was
mm-sparsemem-use-page-table-lock-to-protect-kernel-pmd-operations.patch
This patch was dropped because it was merged into mainline or a subsystem tree
------------------------------------------------------
From: Muchun Song <songmuchun@bytedance.com>
Subject: mm: sparsemem: use page table lock to protect kernel pmd operations
The init_mm.page_table_lock is used to protect kernel page tables, we can
use it to serialize splitting vmemmap PMD mappings instead of mmap write
lock, which can increase the concurrency of vmemmap_remap_free().
Actually, It increase the concurrency between allocations of HugeTLB
pages. But it is not the only benefit. There are a lot of users of mmap
read lock of init_mm. The mmap write lock is holding through
vmemmap_remap_free(), removing mmap write lock usage to make it does not
affect other users of mmap read lock. It is not making anything worse and
always a win to move.
Now the kernel page table walker does not hold the page_table_lock when
walking pmd entries. There may be consistency issue of a pmd entry,
because pmd entry might change from a huge pmd entry to a PTE page table.
There is only one user of kernel page table walker, namely ptdump. The
ptdump already considers the consistency, which use a local variable to
cache the value of pmd entry. But we also need to update ->action to
ACTION_CONTINUE to make sure the walker does not walk every pte entry
again when concurrent thread has split the huge pmd.
Link: https://lkml.kernel.org/r/20211101031651.75851-4-songmuchun@bytedance.com
Signed-off-by: Muchun Song <songmuchun@bytedance.com>
Cc: Barry Song <song.bao.hua@hisilicon.com>
Cc: Bodeddula Balasubramaniam <bodeddub@amazon.com>
Cc: Chen Huang <chenhuang5@huawei.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Fam Zheng <fam.zheng@bytedance.com>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Matthew Wilcox <willy@infradead.org>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Oscar Salvador <osalvador@suse.de>
Cc: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: Xiongchun Duan <duanxiongchun@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
mm/ptdump.c | 16 ++++++++++----
mm/sparse-vmemmap.c | 47 +++++++++++++++++++++++++++---------------
2 files changed, 43 insertions(+), 20 deletions(-)
--- a/mm/ptdump.c~mm-sparsemem-use-page-table-lock-to-protect-kernel-pmd-operations
+++ a/mm/ptdump.c
@@ -40,8 +40,10 @@ static int ptdump_pgd_entry(pgd_t *pgd,
if (st->effective_prot)
st->effective_prot(st, 0, pgd_val(val));
- if (pgd_leaf(val))
+ if (pgd_leaf(val)) {
st->note_page(st, addr, 0, pgd_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -61,8 +63,10 @@ static int ptdump_p4d_entry(p4d_t *p4d,
if (st->effective_prot)
st->effective_prot(st, 1, p4d_val(val));
- if (p4d_leaf(val))
+ if (p4d_leaf(val)) {
st->note_page(st, addr, 1, p4d_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -82,8 +86,10 @@ static int ptdump_pud_entry(pud_t *pud,
if (st->effective_prot)
st->effective_prot(st, 2, pud_val(val));
- if (pud_leaf(val))
+ if (pud_leaf(val)) {
st->note_page(st, addr, 2, pud_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
@@ -101,8 +107,10 @@ static int ptdump_pmd_entry(pmd_t *pmd,
if (st->effective_prot)
st->effective_prot(st, 3, pmd_val(val));
- if (pmd_leaf(val))
+ if (pmd_leaf(val)) {
st->note_page(st, addr, 3, pmd_val(val));
+ walk->action = ACTION_CONTINUE;
+ }
return 0;
}
--- a/mm/sparse-vmemmap.c~mm-sparsemem-use-page-table-lock-to-protect-kernel-pmd-operations
+++ a/mm/sparse-vmemmap.c
@@ -53,8 +53,7 @@ struct vmemmap_remap_walk {
struct list_head *vmemmap_pages;
};
-static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start,
- struct vmemmap_remap_walk *walk)
+static int __split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
{
pmd_t __pmd;
int i;
@@ -76,15 +75,34 @@ static int split_vmemmap_huge_pmd(pmd_t
set_pte_at(&init_mm, addr, pte, entry);
}
- /* Make pte visible before pmd. See comment in pmd_install(). */
- smp_wmb();
- pmd_populate_kernel(&init_mm, pmd, pgtable);
-
- flush_tlb_kernel_range(start, start + PMD_SIZE);
+ spin_lock(&init_mm.page_table_lock);
+ if (likely(pmd_leaf(*pmd))) {
+ /* Make pte visible before pmd. See comment in pmd_install(). */
+ smp_wmb();
+ pmd_populate_kernel(&init_mm, pmd, pgtable);
+ flush_tlb_kernel_range(start, start + PMD_SIZE);
+ } else {
+ pte_free_kernel(&init_mm, pgtable);
+ }
+ spin_unlock(&init_mm.page_table_lock);
return 0;
}
+static int split_vmemmap_huge_pmd(pmd_t *pmd, unsigned long start)
+{
+ int leaf;
+
+ spin_lock(&init_mm.page_table_lock);
+ leaf = pmd_leaf(*pmd);
+ spin_unlock(&init_mm.page_table_lock);
+
+ if (!leaf)
+ return 0;
+
+ return __split_vmemmap_huge_pmd(pmd, start);
+}
+
static void vmemmap_pte_range(pmd_t *pmd, unsigned long addr,
unsigned long end,
struct vmemmap_remap_walk *walk)
@@ -121,13 +139,12 @@ static int vmemmap_pmd_range(pud_t *pud,
pmd = pmd_offset(pud, addr);
do {
- if (pmd_leaf(*pmd)) {
- int ret;
+ int ret;
+
+ ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK);
+ if (ret)
+ return ret;
- ret = split_vmemmap_huge_pmd(pmd, addr & PMD_MASK, walk);
- if (ret)
- return ret;
- }
next = pmd_addr_end(addr, end);
vmemmap_pte_range(pmd, addr, next, walk);
} while (pmd++, addr = next, addr != end);
@@ -321,10 +338,8 @@ int vmemmap_remap_free(unsigned long sta
*/
BUG_ON(start - reuse != PAGE_SIZE);
- mmap_write_lock(&init_mm);
+ mmap_read_lock(&init_mm);
ret = vmemmap_remap_range(reuse, end, &walk);
- mmap_write_downgrade(&init_mm);
-
if (ret && walk.nr_walked) {
end = reuse + walk.nr_walked * PAGE_SIZE;
/*
_
Patches currently in -mm which might be from songmuchun@bytedance.com are
reply other threads:[~2022-03-25 1:33 UTC|newest]
Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20220325013158.53813C340ED@smtp.kernel.org \
--to=akpm@linux-foundation.org \
--cc=bodeddub@amazon.com \
--cc=chenhuang5@huawei.com \
--cc=corbet@lwn.net \
--cc=david@redhat.com \
--cc=duanxiongchun@bytedance.com \
--cc=fam.zheng@bytedance.com \
--cc=linux-kernel@vger.kernel.org \
--cc=mhocko@suse.com \
--cc=mike.kravetz@oracle.com \
--cc=mm-commits@vger.kernel.org \
--cc=osalvador@suse.de \
--cc=song.bao.hua@hisilicon.com \
--cc=songmuchun@bytedance.com \
--cc=willy@infradead.org \
--cc=zhengqi.arch@bytedance.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.