From: Dev Jain <dev.jain@arm.com>
To: Yang Shi <yang@os.amperecomputing.com>,
ryan.roberts@arm.com, will@kernel.org, catalin.marinas@arm.com,
akpm@linux-foundation.org, Miko.Lenczewski@arm.com,
scott@os.amperecomputing.com, cl@gentwo.org
Cc: linux-arm-kernel@lists.infradead.org, linux-kernel@vger.kernel.org
Subject: Re: [PATCH 3/4] arm64: mm: support large block mapping when rodata=full
Date: Tue, 29 Jul 2025 18:04:26 +0530 [thread overview]
Message-ID: <d9960dbc-376d-4d33-9334-e4eb546b69ae@arm.com> (raw)
In-Reply-To: <20250724221216.1998696-4-yang@os.amperecomputing.com>
On 25/07/25 3:41 am, Yang Shi wrote:
> [----- snip -----]
>
> #ifdef CONFIG_ARM64_PAN
> diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
> index 3d5fb37424ab..f63b39613571 100644
> --- a/arch/arm64/mm/mmu.c
> +++ b/arch/arm64/mm/mmu.c
> @@ -480,6 +480,8 @@ void create_kpti_ng_temp_pgd(pgd_t *pgdir, phys_addr_t phys, unsigned long virt,
> int flags);
> #endif
>
> +#define INVALID_PHYS_ADDR -1
> +
> static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
> enum pgtable_type pgtable_type)
> {
> @@ -487,7 +489,9 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
> struct ptdesc *ptdesc = pagetable_alloc(GFP_PGTABLE_KERNEL & ~__GFP_ZERO, 0);
> phys_addr_t pa;
>
> - BUG_ON(!ptdesc);
> + if (!ptdesc)
> + return INVALID_PHYS_ADDR;
> +
> pa = page_to_phys(ptdesc_page(ptdesc));
>
> switch (pgtable_type) {
> @@ -509,15 +513,29 @@ static phys_addr_t __pgd_pgtable_alloc(struct mm_struct *mm,
> }
>
> static phys_addr_t __maybe_unused
> -pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
> +split_pgtable_alloc(enum pgtable_type pgtable_type)
> {
> return __pgd_pgtable_alloc(&init_mm, pgtable_type);
> }
>
> +static phys_addr_t __maybe_unused
> +pgd_pgtable_alloc_init_mm(enum pgtable_type pgtable_type)
> +{
> + phys_addr_t pa;
> +
> + pa = __pgd_pgtable_alloc(&init_mm, pgtable_type);
> + BUG_ON(pa == INVALID_PHYS_ADDR);
> + return pa;
> +}
> +
> static phys_addr_t
> pgd_pgtable_alloc_special_mm(enum pgtable_type pgtable_type)
> {
> - return __pgd_pgtable_alloc(NULL, pgtable_type);
> + phys_addr_t pa;
> +
> + pa = __pgd_pgtable_alloc(NULL, pgtable_type);
> + BUG_ON(pa == INVALID_PHYS_ADDR);
> + return pa;
> }
>
> /*
> @@ -552,6 +570,254 @@ void __init create_pgd_mapping(struct mm_struct *mm, phys_addr_t phys,
> pgd_pgtable_alloc_special_mm, flags);
> }
>
> +static DEFINE_MUTEX(pgtable_split_lock);
Thanks for taking a separate lock.
> +
> +static int split_cont_pte(pmd_t *pmdp, unsigned long addr, unsigned long end)
> +{
> + pte_t *ptep;
> + unsigned long next;
> + unsigned int nr;
> + unsigned long span;
> +
> + ptep = pte_offset_kernel(pmdp, addr);
> +
> + do {
> + pte_t *_ptep;
> +
> + nr = 0;
> + next = pte_cont_addr_end(addr, end);
> + if (next < end)
> + nr = max(nr, ((end - next) / CONT_PTE_SIZE));
> + span = nr * CONT_PTE_SIZE;
> +
> + _ptep = PTR_ALIGN_DOWN(ptep, sizeof(*ptep) * CONT_PTES);
> + ptep += pte_index(next) - pte_index(addr) + nr * CONT_PTES;
> +
> + if (((addr | next) & ~CONT_PTE_MASK) == 0)
> + continue;
> +
> + if (!pte_cont(__ptep_get(_ptep)))
> + continue;
> +
> + for (int i = 0; i < CONT_PTES; i++, _ptep++)
> + __set_pte(_ptep, pte_mknoncont(__ptep_get(_ptep)));
> + } while (addr = next + span, addr != end);
> +
> + return 0;
> +}
> +
> +static int split_pmd(pmd_t *pmdp, unsigned long addr, unsigned long end)
> +{
> + unsigned long next;
> + unsigned int nr;
> + unsigned long span;
> + int ret = 0;
> +
> + do {
> + pmd_t pmd;
> +
> + nr = 1;
> + next = pmd_addr_end(addr, end);
> + if (next < end)
> + nr = max(nr, ((end - next) / PMD_SIZE));
> + span = (nr - 1) * PMD_SIZE;
> +
> + if (((addr | next) & ~PMD_MASK) == 0)
> + continue;
> +
> + pmd = pmdp_get(pmdp);
> + if (pmd_leaf(pmd)) {
> + phys_addr_t pte_phys;
> + pte_t *ptep;
> + pmdval_t pmdval = PMD_TYPE_TABLE | PMD_TABLE_UXN |
> + PMD_TABLE_AF;
> + unsigned long pfn = pmd_pfn(pmd);
> + pgprot_t prot = pmd_pgprot(pmd);
> +
> + pte_phys = split_pgtable_alloc(TABLE_PTE);
> + if (pte_phys == INVALID_PHYS_ADDR)
> + return -ENOMEM;
> +
> + if (pgprot_val(prot) & PMD_SECT_PXN)
> + pmdval |= PMD_TABLE_PXN;
> +
> + prot = __pgprot((pgprot_val(prot) & ~PTE_TYPE_MASK) |
> + PTE_TYPE_PAGE);
> + prot = __pgprot(pgprot_val(prot) | PTE_CONT);
> + ptep = (pte_t *)phys_to_virt(pte_phys);
> + for (int i = 0; i < PTRS_PER_PTE; i++, ptep++, pfn++)
> + __set_pte(ptep, pfn_pte(pfn, prot));
> +
> + dsb(ishst);
> +
> + __pmd_populate(pmdp, pte_phys, pmdval);
> + }
> +
> + ret = split_cont_pte(pmdp, addr, next);
> + if (ret)
> + break;
> + } while (pmdp += nr, addr = next + span, addr != end);
> +
> + return ret;
> +}
> +
> +static int split_cont_pmd(pud_t *pudp, unsigned long addr, unsigned long end)
> +{
> + pmd_t *pmdp;
> + unsigned long next;
> + unsigned int nr;
> + unsigned long span;
> + int ret = 0;
> +
> + pmdp = pmd_offset(pudp, addr);
> +
> + do {
> + pmd_t *_pmdp;
> +
> + nr = 0;
> + next = pmd_cont_addr_end(addr, end);
> + if (next < end)
> + nr = max(nr, ((end - next) / CONT_PMD_SIZE));
> + span = nr * CONT_PMD_SIZE;
> +
> + if (((addr | next) & ~CONT_PMD_MASK) == 0) {
> + pmdp += pmd_index(next) - pmd_index(addr) +
> + nr * CONT_PMDS;
> + continue;
> + }
> +
> + _pmdp = PTR_ALIGN_DOWN(pmdp, sizeof(*pmdp) * CONT_PMDS);
> + if (!pmd_cont(pmdp_get(_pmdp)))
> + goto split;
> +
> + for (int i = 0; i < CONT_PMDS; i++, _pmdp++)
> + set_pmd(_pmdp, pmd_mknoncont(pmdp_get(_pmdp)));
> +
> +split:
> + ret = split_pmd(pmdp, addr, next);
> + if (ret)
> + break;
> +
> + pmdp += pmd_index(next) - pmd_index(addr) + nr * CONT_PMDS;
> + } while (addr = next + span, addr != end);
> +
> + return ret;
> +}
> +
> +static int split_pud(p4d_t *p4dp, unsigned long addr, unsigned long end)
> +{
> + pud_t *pudp;
> + unsigned long next;
> + unsigned int nr;
> + unsigned long span;
> + int ret = 0;
> +
> + pudp = pud_offset(p4dp, addr);
> +
> + do {
> + pud_t pud;
> +
> + nr = 1;
> + next = pud_addr_end(addr, end);
> + if (next < end)
> + nr = max(nr, ((end - next) / PUD_SIZE));
> + span = (nr - 1) * PUD_SIZE;
> +
> + if (((addr | next) & ~PUD_MASK) == 0)
> + continue;
> +
> + pud = pudp_get(pudp);
> + if (pud_leaf(pud)) {
> + phys_addr_t pmd_phys;
> + pmd_t *pmdp;
> + pudval_t pudval = PUD_TYPE_TABLE | PUD_TABLE_UXN |
> + PUD_TABLE_AF;
> + unsigned long pfn = pud_pfn(pud);
> + pgprot_t prot = pud_pgprot(pud);
> + unsigned int step = PMD_SIZE >> PAGE_SHIFT;
> +
> + pmd_phys = split_pgtable_alloc(TABLE_PMD);
> + if (pmd_phys == INVALID_PHYS_ADDR)
> + return -ENOMEM;
> +
> + if (pgprot_val(prot) & PMD_SECT_PXN)
> + pudval |= PUD_TABLE_PXN;
> +
> + prot = __pgprot((pgprot_val(prot) & ~PMD_TYPE_MASK) |
> + PMD_TYPE_SECT);
> + prot = __pgprot(pgprot_val(prot) | PTE_CONT);
> + pmdp = (pmd_t *)phys_to_virt(pmd_phys);
> + for (int i = 0; i < PTRS_PER_PMD; i++, pmdp++) {
> + set_pmd(pmdp, pfn_pmd(pfn, prot));
> + pfn += step;
> + }
> +
> + dsb(ishst);
> +
> + __pud_populate(pudp, pmd_phys, pudval);
> + }
> +
> + ret = split_cont_pmd(pudp, addr, next);
> + if (ret)
> + break;
> + } while (pudp += nr, addr = next + span, addr != end);
> +
> + return ret;
> +}
> +
> +static int split_p4d(pgd_t *pgdp, unsigned long addr, unsigned long end)
> +{
> + p4d_t *p4dp;
> + unsigned long next;
> + int ret = 0;
> +
> + p4dp = p4d_offset(pgdp, addr);
> +
> + do {
> + next = p4d_addr_end(addr, end);
> +
> + ret = split_pud(p4dp, addr, next);
> + if (ret)
> + break;
> + } while (p4dp++, addr = next, addr != end);
> +
> + return ret;
> +}
> +
> +static int split_pgd(pgd_t *pgdp, unsigned long addr, unsigned long end)
> +{
> + unsigned long next;
> + int ret = 0;
> +
> + do {
> + next = pgd_addr_end(addr, end);
> + ret = split_p4d(pgdp, addr, next);
> + if (ret)
> + break;
> + } while (pgdp++, addr = next, addr != end);
> +
> + return ret;
> +}
> +
> +int split_kernel_pgtable_mapping(unsigned long start, unsigned long end)
> +{
> + int ret;
> +
> + if (!system_supports_bbml2_noabort())
> + return 0;
> +
> + if (start != PAGE_ALIGN(start) || end != PAGE_ALIGN(end))
> + return -EINVAL;
> +
> + mutex_lock(&pgtable_split_lock);
> + arch_enter_lazy_mmu_mode();
> + ret = split_pgd(pgd_offset_k(start), start, end);
> + arch_leave_lazy_mmu_mode();
> + mutex_unlock(&pgtable_split_lock);
> +
> + return ret;
> +}
> +
> /*
--- snip ---
I'm afraid I'll have to agree with Ryan :) Looking at the signature of split_kernel_pgtable_mapping,
one would expect that this function splits all block mappings in this region. But that is just a
nit; it does not seem right to me that we are iterating over the entire space when we know *exactly* where
we have to make the split, just to save on pgd/p4d/pud loads - the effect of which is probably cancelled
out by unnecessary iterations your approach takes to skip the intermediate blocks.
If we are concerned that most change_memory_common() operations are for a single page, then
we can do something like:
unsigned long size = end - start;
bool end_split, start_split = false;
if (start not aligned to block mapping)
start_split = split(start);
/*
* split the end only if the start wasn't split, or
* if it cannot be guaranteed that start and end lie
* on the same contig block
*/
if (!start_split || (size > CONT_PTE_SIZE))
end_split = split(end);
next prev parent reply other threads:[~2025-07-29 12:45 UTC|newest]
Thread overview: 33+ messages / expand[flat|nested] mbox.gz Atom feed top
2025-07-24 22:11 [v5 PATCH 0/4] arm64: support FEAT_BBM level 2 and large block mapping when rodata=full Yang Shi
2025-07-24 22:11 ` [PATCH 1/4] arm64: Enable permission change on arm64 kernel block mappings Yang Shi
2025-07-24 22:11 ` [PATCH 2/4] arm64: cpufeature: add AmpereOne to BBML2 allow list Yang Shi
2025-08-01 14:36 ` Ryan Roberts
2025-08-04 23:20 ` Christoph Lameter (Ampere)
2025-07-24 22:11 ` [PATCH 3/4] arm64: mm: support large block mapping when rodata=full Yang Shi
2025-07-29 12:34 ` Dev Jain [this message]
2025-08-05 21:28 ` Yang Shi
2025-08-06 0:10 ` Yang Shi
2025-08-01 14:35 ` Ryan Roberts
2025-08-04 10:07 ` Ryan Roberts
2025-08-05 18:53 ` Yang Shi
2025-08-06 7:20 ` Ryan Roberts
2025-08-07 0:44 ` Yang Shi
2025-07-24 22:11 ` [PATCH 4/4] arm64: mm: split linear mapping if BBML2 is not supported on secondary CPUs Yang Shi
2025-07-26 11:10 ` kernel test robot
2025-08-01 16:14 ` Ryan Roberts
2025-08-05 18:59 ` Yang Shi
2025-08-05 7:54 ` Ryan Roberts
2025-08-05 17:45 ` Yang Shi
-- strict thread matches above, loose matches on Subject: below --
2025-05-31 2:41 [v4 PATCH 0/4] arm64: support FEAT_BBM level 2 and large block mapping when rodata=full Yang Shi
2025-05-31 2:41 ` [PATCH 3/4] arm64: mm: support " Yang Shi
2025-06-16 11:58 ` Ryan Roberts
2025-06-16 12:33 ` Ryan Roberts
2025-06-17 21:01 ` Yang Shi
2025-06-16 16:24 ` Ryan Roberts
2025-06-17 21:09 ` Yang Shi
2025-06-23 13:26 ` Ryan Roberts
2025-06-23 19:12 ` Yang Shi
2025-06-26 22:39 ` Yang Shi
2025-07-23 17:38 ` Dev Jain
2025-07-23 20:51 ` Yang Shi
2025-07-24 11:43 ` Dev Jain
2025-07-24 17:59 ` Yang Shi
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=d9960dbc-376d-4d33-9334-e4eb546b69ae@arm.com \
--to=dev.jain@arm.com \
--cc=Miko.Lenczewski@arm.com \
--cc=akpm@linux-foundation.org \
--cc=catalin.marinas@arm.com \
--cc=cl@gentwo.org \
--cc=linux-arm-kernel@lists.infradead.org \
--cc=linux-kernel@vger.kernel.org \
--cc=ryan.roberts@arm.com \
--cc=scott@os.amperecomputing.com \
--cc=will@kernel.org \
--cc=yang@os.amperecomputing.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.