Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] arm64: mm: call pagetable dtor when freeing hot-removed page tables
From: Alistair Popple @ 2026-05-21  3:27 UTC (permalink / raw)
  To: linux-arm-kernel
  Cc: linux-kernel, linux-mm, catalin.marinas, will, david, akpm,
	Alistair Popple

Since 5e8eb9aeeda3 ("arm64: mm: always call PTE/PMD ctor in
__create_pgd_mapping()") page-table allocation on ARM64 always
calls pagetable_{pte,pmd,pud,p4d}_ctor(). This sets the page_type
to PGTY_table, increments NR_PAGETABLE and possibly allocates a PTL.
However the matching pagetable_dtor() calls were never added.

With DEBUG_VM enabled on kernel versions prior to v6.17 without
2dfcd1608f3a9 ("mm/page_alloc: let page freeing clear any set page
type") this leads to the following warning when freeing these pages due
to page->page_type sharing page->_mapcount:

  BUG: Bad page state in process ... pfn:284fbb
  page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x284fbb
  flags: 0x17fffc000000000(node=0|zone=2|lastcpupid=0x1ffff)
  page_type: f2(table)
  page dumped because: nonzero mapcount
  Call trace:
   bad_page+0x13c/0x160
   __free_frozen_pages+0x6cc/0x860
   ___free_pages+0xf4/0x180
   free_pages+0x54/0x80
   free_hotplug_page_range.part.0+0x58/0x90
   free_empty_tables+0x438/0x500
   __remove_pgd_mapping.constprop.0+0x60/0xa8
   arch_remove_memory+0x48/0x80
   try_remove_memory+0x158/0x1d8
   offline_and_remove_memory+0x138/0x180

It can also lead to incorrect NR_PAGETABLE stats and, if ALLOC_SPLIT_PTLOCKS
is defined, to leaking the PTL allocation. Fix this by calling
pagetable_dtor() in free_hotplug_pgtable_page() prior to freeing the
page to undo the effects of calling pagetable_*_ctor().

Fixes: 5e8eb9aeeda3 ("arm64: mm: always call PTE/PMD ctor in __create_pgd_mapping()")
Signed-off-by: Alistair Popple <apopple@nvidia.com>
---
 arch/arm64/mm/mmu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c
index 8e1d80a7033e..0c24fe650e95 100644
--- a/arch/arm64/mm/mmu.c
+++ b/arch/arm64/mm/mmu.c
@@ -1422,6 +1422,7 @@ static void free_hotplug_page_range(struct page *page, size_t size,
 
 static void free_hotplug_pgtable_page(struct page *page)
 {
+	pagetable_dtor(page_ptdesc(page));
 	free_hotplug_page_range(page, PAGE_SIZE, NULL);
 }
 
-- 
2.54.0



^ permalink raw reply related

* Re: [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers
From: Emil Tsalapatis @ 2026-05-21  3:17 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Andrea Righi, Changwoo Min,
	Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Martin KaFai Lau, Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel
In-Reply-To: <20260520235052.4180316-4-tj@kernel.org>

On Wed May 20, 2026 at 7:50 PM EDT, Tejun Heo wrote:
> The existing kernel-side export of bpf_arena_alloc_pages is _non_sleepable
> only - it's used by the verifier to inline the kfunc when the call site is
> non-sleepable. There is no sleepable equivalent for kernel callers; the
> kfunc bpf_arena_alloc_pages itself is BPF-only.
>
> sched_ext needs sleepable kernel-side allocs for its arena pool init/grow
> paths. Add bpf_arena_alloc_pages_sleepable() mirroring the _non_sleepable
> wrapper but passing sleepable=true to arena_alloc_pages().
>
> Signed-off-by: Tejun Heo <tj@kernel.org>

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

> ---
>  include/linux/bpf.h |  8 ++++++++
>  kernel/bpf/arena.c  | 13 +++++++++++++
>  2 files changed, 21 insertions(+)
>
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 831996c411cf..64968ca6db51 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -679,6 +679,8 @@ int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags,
>  void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
>  					  u64 flags);
>  void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt);
> +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
> +				      u64 flags);
>  #else
>  static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
>  							int node_id, u64 flags)
> @@ -689,6 +691,12 @@ static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr
>  static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
>  {
>  }
> +
> +static inline void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
> +						    int node_id, u64 flags)
> +{
> +	return NULL;
> +}
>  #endif
>  
>  extern const struct bpf_map_ops bpf_map_offload_ops;
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index 1c0b87ecc817..a811cf6170fa 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -934,6 +934,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
>  
>  	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
>  }
> +
> +void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
> +				      int node_id, u64 flags)
> +{
> +	struct bpf_map *map = p__map;
> +	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
> +
> +	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
> +		return NULL;
> +
> +	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
> +}
> +
>  __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
>  {
>  	struct bpf_map *map = p__map;



^ permalink raw reply

* Re: [PATCH 2/8] bpf: Recover arena kernel faults with scratch page
From: Emil Tsalapatis @ 2026-05-21  3:16 UTC (permalink / raw)
  To: Tejun Heo, David Vernet, Andrea Righi, Changwoo Min,
	Alexei Starovoitov, Andrii Nakryiko, Daniel Borkmann,
	Martin KaFai Lau, Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel
In-Reply-To: <20260520235052.4180316-3-tj@kernel.org>

On Wed May 20, 2026 at 7:50 PM EDT, Tejun Heo wrote:
> From: Kumar Kartikeya Dwivedi <memxor@gmail.com>
>
> BPF arena usage is becoming more prevalent, but kernel <-> BPF communication
> over arena memory is awkward today. Data has to be staged through a trusted
> kernel pointer with extra code and copying on the BPF side. While reads
> through arena pointers can use a fault-safe helper, writes don't have a good
> solution. The in-line alternative would need instruction emulation or asm
> fixup labels.
>
> Enable direct kernel-side reads and writes within GUARD_SZ / 2 of any
> handed-in arena pointer, without bounds checking. A per-arena scratch page
> is installed by the arch fault path into empty arena kernel PTEs - x86 from
> page_fault_oops() for not-present faults, arm64 from __do_kernel_fault() for
> translation faults, both after the existing exception-table and KFENCE
> handling. The faulting instruction retries and the access is also reported
> through the program's BPF stream, preserving error reporting.
>
> bpf_prog_find_from_stack() resolves the current BPF program (and its arena)
> from the kernel stack - no new bpf_run_ctx state is added. Recovery covers
> the 4 GiB arena plus the upper half-guard (GUARD_SZ / 2). The lower
> half-guard is excluded because well-behaved kfuncs only access forward from
> arena pointers. The kfunc-author contract - access at most GUARD_SZ / 2 past
> a handed-in pointer - is documented in Documentation/bpf/kfuncs.rst.
>
> The install is lock-free via ptep_try_set(). On race-loss the winning
> installer's PTE is already valid, so the access retry succeeds. The arena
> clear path uses ptep_get_and_clear() so installer and clearer race through
> atomic accessors. No flush_tlb_kernel_range() afterwards. Stale "not mapped"
> entries just cause one extra re-fault, cheaper than a global IPI on every
> install.
>
> Scratch exists only to keep the kernel from oopsing on an in-line arena
> access. Its presence at a PTE means the BPF program has already
> malfunctioned, and the violation is reported through the program's BPF
> stream. The only requirement for behavior on a scratched PTE is that the
> kernel doesn't crash. In particular, any user-side access through such a PTE
> may segfault. The shared scratch page is freed once during map destruction.
>
> BPF instruction faults continue to use the existing JIT exception-table
> path. This patch changes only the kernel-text fault path. No UAPI flag is
> added. The new behavior is the default.
>
> v2: Use ptep_get_and_clear() in apply_range_clear_cb(). (David)
>
> Suggested-by: Alexei Starovoitov <ast@kernel.org>
> Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Cc: David Hildenbrand <david@kernel.org>
> ---

Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>

>  Documentation/bpf/kfuncs.rst |  14 +++
>  arch/arm64/mm/fault.c        |  10 +-
>  arch/x86/mm/fault.c          |  12 ++-
>  include/linux/bpf.h          |   1 +
>  include/linux/bpf_defs.h     |  11 +++
>  kernel/bpf/arena.c           | 177 +++++++++++++++++++++++++++--------
>  kernel/bpf/core.c            |   5 +
>  7 files changed, 183 insertions(+), 47 deletions(-)
>  create mode 100644 include/linux/bpf_defs.h
>
> diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
> index 75e6c078e0e7..6d497e720998 100644
> --- a/Documentation/bpf/kfuncs.rst
> +++ b/Documentation/bpf/kfuncs.rst
> @@ -462,6 +462,20 @@ In order to accommodate such requirements, the verifier will enforce strict
>  PTR_TO_BTF_ID type matching if two types have the exact same name, with one
>  being suffixed with ``___init``.
>  
> +2.8 Accessing arena memory through kfunc arguments
> +--------------------------------------------------
> +
> +A read or write at any address inside an arena does not oops the kernel.
> +Unallocated arena pages are lazily backed by a scratch page and the
> +access is reported through the program's BPF stream as an error. Only
> +the BPF program's correctness is affected; the kernel itself remains
> +intact.
> +
> +The arena is followed by a ``GUARD_SZ / 2`` (32 KiB) guard region that
> +is also covered by this recovery. A kfunc handed an arena pointer may
> +therefore access up to ``GUARD_SZ / 2`` past it without bounds-checking
> +against the arena. Larger accesses must verify the range explicitly.
> +
>  .. _BPF_kfunc_lifecycle_expectations:
>  
>  3. kfunc lifecycle expectations
> diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
> index 920a8b244d59..0d58d667fcd8 100644
> --- a/arch/arm64/mm/fault.c
> +++ b/arch/arm64/mm/fault.c
> @@ -9,6 +9,7 @@
>  
>  #include <linux/acpi.h>
>  #include <linux/bitfield.h>
> +#include <linux/bpf_defs.h>
>  #include <linux/extable.h>
>  #include <linux/kfence.h>
>  #include <linux/signal.h>
> @@ -416,9 +417,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
>  	} else if (addr < PAGE_SIZE) {
>  		msg = "NULL pointer dereference";
>  	} else {
> -		if (esr_fsc_is_translation_fault(esr) &&
> -		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
> -			return;
> +		if (esr_fsc_is_translation_fault(esr)) {
> +			if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
> +				return;
> +			if (bpf_arena_handle_page_fault(addr, esr & ESR_ELx_WNR, regs->pc))
> +				return;
> +		}
>  
>  		msg = "paging request";
>  	}
> diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
> index f0e77e084482..b0f103ddbd23 100644
> --- a/arch/x86/mm/fault.c
> +++ b/arch/x86/mm/fault.c
> @@ -8,6 +8,7 @@
>  #include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
>  #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
>  #include <linux/memblock.h>		/* max_low_pfn			*/
> +#include <linux/bpf_defs.h>		/* bpf_arena_handle_page_fault	*/
>  #include <linux/kfence.h>		/* kfence_handle_page_fault	*/
>  #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
>  #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
> @@ -688,10 +689,13 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
>  	if (IS_ENABLED(CONFIG_EFI))
>  		efi_crash_gracefully_on_page_fault(address);
>  
> -	/* Only not-present faults should be handled by KFENCE. */
> -	if (!(error_code & X86_PF_PROT) &&
> -	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
> -		return;
> +	/* Only not-present faults should be handled by KFENCE or BPF arena. */
> +	if (!(error_code & X86_PF_PROT)) {
> +		if (kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
> +			return;
> +		if (bpf_arena_handle_page_fault(address, error_code & X86_PF_WRITE, regs->ip))
> +			return;
> +	}
>  
>  oops:
>  	/*
> diff --git a/include/linux/bpf.h b/include/linux/bpf.h
> index 0136a108d083..831996c411cf 100644
> --- a/include/linux/bpf.h
> +++ b/include/linux/bpf.h
> @@ -6,6 +6,7 @@
>  
>  #include <uapi/linux/bpf.h>
>  #include <uapi/linux/filter.h>
> +#include <linux/bpf_defs.h>
>  
>  #include <crypto/sha2.h>
>  #include <linux/workqueue.h>
> diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h
> new file mode 100644
> index 000000000000..d98e033b8c0b
> --- /dev/null
> +++ b/include/linux/bpf_defs.h
> @@ -0,0 +1,11 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * Subset of bpf.h declarations, split out so files that need only these
> + * declarations can avoid bpf.h's full include cost.
> + */
> +#ifndef _LINUX_BPF_DEFS_H
> +#define _LINUX_BPF_DEFS_H
> +
> +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip);
> +
> +#endif /* _LINUX_BPF_DEFS_H */
> diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
> index 08d008cc471e..1c0b87ecc817 100644
> --- a/kernel/bpf/arena.c
> +++ b/kernel/bpf/arena.c
> @@ -53,6 +53,7 @@ struct bpf_arena {
>  	u64 user_vm_start;
>  	u64 user_vm_end;
>  	struct vm_struct *kern_vm;
> +	struct page *scratch_page;
>  	struct range_tree rt;
>  	/* protects rt */
>  	rqspinlock_t spinlock;
> @@ -118,6 +119,11 @@ struct apply_range_data {
>  	int i;
>  };
>  
> +struct clear_range_data {
> +	struct llist_head *free_pages;
> +	struct page *scratch_page;
> +};
> +
>  static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
>  {
>  	struct apply_range_data *d = data;
> @@ -144,33 +150,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
>  	flush_cache_vmap(start, start + size);
>  }
>  
> -static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
> +static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
>  {
> +	struct clear_range_data *d = data;
>  	pte_t old_pte;
>  	struct page *page;
>  
> -	/* sanity check */
> -	old_pte = ptep_get(pte);
> +	/*
> +	 * Pairs with ptep_try_set() in the kernel-fault scratch installer.
> +	 * Both sides must be atomic.
> +	 */
> +	old_pte = ptep_get_and_clear(&init_mm, addr, pte);
>  	if (pte_none(old_pte) || !pte_present(old_pte))
> -		return 0; /* nothing to do */
> +		return 0;
>  
>  	page = pte_page(old_pte);
>  	if (WARN_ON_ONCE(!page))
>  		return -EINVAL;
>  
> -	pte_clear(&init_mm, addr, pte);
> +	/*
> +	 * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr
> +	 * scratches its PTE. A later bpf_arena_free_pages() over that range walks
> +	 * here. Without the skip, scratch_page would be freed.
> +	 */
> +	if (page == d->scratch_page)
> +		return 0;
> +
> +	__llist_add(&page->pcp_llist, d->free_pages);
> +	return 0;
> +}
>  
> -	/* Add page to the list so it is freed later */
> -	if (free_pages)
> -		__llist_add(&page->pcp_llist, free_pages);
> +static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data)
> +{
> +	struct page *scratch_page = data;
>  
> +	if (!pte_none(ptep_get(pte)))
> +		return 0;
> +	/*
> +	 * Best-effort install. ptep_try_set() returns false only if another
> +	 * installer (real allocation or concurrent fault) won the cmpxchg.
> +	 * Their PTE is already valid, so the access retry succeeds.
> +	 *
> +	 * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just
> +	 * cause one extra re-fault through this same path.
> +	 */
> +	ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL));
>  	return 0;
>  }
>  
>  static int populate_pgtable_except_pte(struct bpf_arena *arena)
>  {
> +	/* Populate intermediates for the recovery range (4 GiB + upper half-guard). */
>  	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
> -				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
> +				   SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL);
>  }
>  
>  static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
> @@ -221,22 +253,29 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
>  	init_irq_work(&arena->free_irq, arena_free_irq);
>  	INIT_WORK(&arena->free_work, arena_free_worker);
>  	bpf_map_init_from_attr(&arena->map, attr);
> +
> +	err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page);
> +	if (err)
> +		goto err_free_arena;
> +
>  	range_tree_init(&arena->rt);
>  	err = range_tree_set(&arena->rt, 0, attr->max_entries);
> -	if (err) {
> -		bpf_map_area_free(arena);
> -		goto err;
> -	}
> +	if (err)
> +		goto err_free_scratch;
>  	mutex_init(&arena->lock);
>  	raw_res_spin_lock_init(&arena->spinlock);
>  	err = populate_pgtable_except_pte(arena);
> -	if (err) {
> -		range_tree_destroy(&arena->rt);
> -		bpf_map_area_free(arena);
> -		goto err;
> -	}
> +	if (err)
> +		goto err_destroy_rt;
>  
>  	return &arena->map;
> +
> +err_destroy_rt:
> +	range_tree_destroy(&arena->rt);
> +err_free_scratch:
> +	__free_page(arena->scratch_page);
> +err_free_arena:
> +	bpf_map_area_free(arena);
>  err:
>  	free_vm_area(kern_vm);
>  	return ERR_PTR(err);
> @@ -244,6 +283,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
>  
>  static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
>  {
> +	struct bpf_arena *arena = data;
>  	struct page *page;
>  	pte_t pte;
>  
> @@ -251,6 +291,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
>  	if (!pte_present(pte)) /* sanity check */
>  		return 0;
>  	page = pte_page(pte);
> +	/*
> +	 * Skip the scratch page. The walk is page-table-driven, not range-tree-driven,
> +	 * so it can visit scratch PTEs at uaddrs the BPF program never allocated.
> +	 */
> +	if (page == arena->scratch_page)
> +		return 0;
>  	/*
>  	 * We do not update pte here:
>  	 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
> @@ -286,9 +332,10 @@ static void arena_map_free(struct bpf_map *map)
>  	 * free those pages.
>  	 */
>  	apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
> -				     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
> +				     SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
>  	free_vm_area(arena->kern_vm);
>  	range_tree_destroy(&arena->rt);
> +	__free_page(arena->scratch_page);
>  	bpf_map_area_free(arena);
>  }
>  
> @@ -374,33 +421,37 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>  		return VM_FAULT_RETRY;
>  
>  	page = vmalloc_to_page((void *)kaddr);
> -	if (page)
> +	if (page) {
> +		if (page == arena->scratch_page)
> +			/* BPF triggered scratch here; don't lazy-alloc over it */
> +			goto out_sigsegv;
>  		/* already have a page vmap-ed */
>  		goto out;
> +	}
>  
>  	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
>  
>  	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
>  		/* User space requested to segfault when page is not allocated by bpf prog */
> -		goto out_unlock_sigsegv;
> +		goto out_sigsegv_memcg;
>  
>  	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
>  	if (ret)
> -		goto out_unlock_sigsegv;
> +		goto out_sigsegv_memcg;
>  
>  	struct apply_range_data data = { .pages = &page, .i = 0 };
>  	/* Account into memcg of the process that created bpf_arena */
>  	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
>  	if (ret) {
>  		range_tree_set(&arena->rt, vmf->pgoff, 1);
> -		goto out_unlock_sigsegv;
> +		goto out_sigsegv_memcg;
>  	}
>  
>  	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
>  	if (ret) {
>  		range_tree_set(&arena->rt, vmf->pgoff, 1);
>  		free_pages_nolock(page, 0);
> -		goto out_unlock_sigsegv;
> +		goto out_sigsegv_memcg;
>  	}
>  	flush_vmap_cache(kaddr, PAGE_SIZE);
>  	bpf_map_memcg_exit(old_memcg, new_memcg);
> @@ -409,8 +460,9 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
>  	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>  	vmf->page = page;
>  	return 0;
> -out_unlock_sigsegv:
> +out_sigsegv_memcg:
>  	bpf_map_memcg_exit(old_memcg, new_memcg);
> +out_sigsegv:
>  	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
>  	return VM_FAULT_SIGSEGV;
>  }
> @@ -668,6 +720,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
>  	struct llist_head free_pages;
>  	struct llist_node *pos, *t;
>  	struct arena_free_span *s;
> +	struct clear_range_data cdata;
>  	unsigned long flags;
>  	int ret = 0;
>  
> @@ -696,9 +749,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
>  	range_tree_set(&arena->rt, pgoff, page_cnt);
>  
>  	init_llist_head(&free_pages);
> +	cdata.free_pages = &free_pages;
> +	cdata.scratch_page = arena->scratch_page;
>  	/* clear ptes and collect struct pages */
>  	apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
> -				     apply_range_clear_cb, &free_pages);
> +				     apply_range_clear_cb, &cdata);
>  
>  	/* drop the lock to do the tlb flush and zap pages */
>  	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
> @@ -788,6 +843,7 @@ static void arena_free_worker(struct work_struct *work)
>  	struct arena_free_span *s;
>  	u64 arena_vm_start, user_vm_start;
>  	struct llist_head free_pages;
> +	struct clear_range_data cdata;
>  	struct page *page;
>  	unsigned long full_uaddr;
>  	long kaddr, page_cnt, pgoff;
> @@ -801,6 +857,8 @@ static void arena_free_worker(struct work_struct *work)
>  	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
>  
>  	init_llist_head(&free_pages);
> +	cdata.free_pages = &free_pages;
> +	cdata.scratch_page = arena->scratch_page;
>  	arena_vm_start = bpf_arena_get_kern_vm_start(arena);
>  	user_vm_start = bpf_arena_get_user_vm_start(arena);
>  
> @@ -813,7 +871,7 @@ static void arena_free_worker(struct work_struct *work)
>  
>  		/* clear ptes and collect pages in free_pages llist */
>  		apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
> -					     apply_range_clear_cb, &free_pages);
> +					     apply_range_clear_cb, &cdata);
>  
>  		range_tree_set(&arena->rt, pgoff, page_cnt);
>  	}
> @@ -928,23 +986,12 @@ static int __init kfunc_init(void)
>  }
>  late_initcall(kfunc_init);
>  
> -void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
> +static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write,
> +					      unsigned long addr, unsigned long fault_ip)
>  {
>  	struct bpf_stream_stage ss;
> -	struct bpf_prog *prog;
>  	u64 user_vm_start;
>  
> -	/*
> -	 * The RCU read lock is held to safely traverse the latch tree, but we
> -	 * don't need its protection when accessing the prog, since it will not
> -	 * disappear while we are handling the fault.
> -	 */
> -	rcu_read_lock();
> -	prog = bpf_prog_ksym_find(fault_ip);
> -	rcu_read_unlock();
> -	if (!prog)
> -		return;
> -
>  	/* Use main prog for stream access */
>  	prog = prog->aux->main_prog_aux->prog;
>  
> @@ -957,3 +1004,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo
>  		bpf_stream_dump_stack(ss);
>  	}));
>  }
> +
> +bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip)
> +{
> +	struct bpf_arena *arena;
> +	struct bpf_prog *prog;
> +	unsigned long kbase;
> +	unsigned long page_addr = addr & PAGE_MASK;
> +
> +	prog = bpf_prog_find_from_stack();
> +	if (!prog)
> +		return false;
> +
> +	arena = prog->aux->arena;
> +	/* a prog not using arena may be on stack, so arena can be NULL */
> +	if (!arena)
> +		return false;
> +
> +	kbase = bpf_arena_get_kern_vm_start(arena);
> +
> +	/*
> +	 * Recovery covers the 4 GiB mappable band plus the upper half-guard.
> +	 * Lower guard is unreachable from kfuncs; an address there indicates
> +	 * a different bug class - leave it to the regular kernel oops path.
> +	 */
> +	if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2)
> +		return false;
> +
> +	apply_to_page_range(&init_mm, page_addr, PAGE_SIZE,
> +			    apply_range_set_scratch_cb, arena->scratch_page);
> +	flush_vmap_cache(page_addr, PAGE_SIZE);
> +	__bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip);
> +	return true;
> +}
> +
> +void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
> +{
> +	struct bpf_prog *prog;
> +
> +	/*
> +	 * The RCU read lock is held to safely traverse the latch tree, but we
> +	 * don't need its protection when accessing the prog, since it will not
> +	 * disappear while we are handling the fault.
> +	 */
> +	rcu_read_lock();
> +	prog = bpf_prog_ksym_find(fault_ip);
> +	rcu_read_unlock();
> +	if (!prog)
> +		return;
> +	__bpf_prog_report_arena_violation(prog, write, addr, fault_ip);
> +}
> diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
> index 066b86e7233c..fa368d8920d9 100644
> --- a/kernel/bpf/core.c
> +++ b/kernel/bpf/core.c
> @@ -3290,6 +3290,11 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
>  {
>  	return 0;
>  }
> +__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
> +					unsigned long fault_ip)
> +{
> +	return false;
> +}
>  
>  #ifdef CONFIG_BPF_SYSCALL
>  static int __init bpf_global_ma_init(void)



^ permalink raw reply

* [soc:zx/soc 1/1] htmldocs: Documentation/arch/arm/zte/zx297520v3.rst:66: WARNING: Title underline too short.
From: kernel test robot @ 2026-05-21  2:57 UTC (permalink / raw)
  To: Stefan Dösinger 
  Cc: oe-kbuild-all, linux-arm-kernel, arm, Linus Walleij,
	Krzysztof Kozlowski, linux-doc

tree:   https://git.kernel.org/pub/scm/linux/kernel/git/soc/soc.git zx/soc
head:   220ae5d36dba278003d265aabd080ffa78553f5a
commit: 220ae5d36dba278003d265aabd080ffa78553f5a [1/1] ARM: zte: Add zx297520v3 platform support
compiler: clang version 20.1.8 (https://github.com/llvm/llvm-project 87f0227cb60147a26a1eeb4fb06e3b505e9c7261)
docutils: docutils (Docutils 0.21.2, Python 3.13.5, on linux)
reproduce: (https://download.01.org/0day-ci/archive/20260521/202605210401.8D6jRbz8-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605210401.8D6jRbz8-lkp@intel.com/

All warnings (new ones prefixed by >>):

   WARNING: Documentation/ABI/testing/sysfs-class-reboot-mode-reboot_modes:36: abi_sys_class_reboot_mode_driver_reboot_modes doesn't have a description
   WARNING: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/os_mode is defined 2 times: Documentation/ABI/testing/sysfs-driver-hid-lenovo-go:364; Documentation/ABI/testing/sysfs-driver-hid-lenovo-go-s:234
   WARNING: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/os_mode_index is defined 2 times: Documentation/ABI/testing/sysfs-driver-hid-lenovo-go:373; Documentation/ABI/testing/sysfs-driver-hid-lenovo-go-s:243
   WARNING: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/touchpad/enabled is defined 2 times: Documentation/ABI/testing/sysfs-driver-hid-lenovo-go:636; Documentation/ABI/testing/sysfs-driver-hid-lenovo-go-s:252
   WARNING: /sys/bus/usb/devices/<busnum>-<devnum>:<config num>.<interface num>/<hid-bus>:<vendor-id>:<product-id>.<num>/touchpad/enabled_index is defined 2 times: Documentation/ABI/testing/sysfs-driver-hid-lenovo-go:645; Documentation/ABI/testing/sysfs-driver-hid-lenovo-go-s:261
>> Documentation/arch/arm/zte/zx297520v3.rst:66: WARNING: Title underline too short.
--
   3. Building for built-in U-Boot
   --------------------------- [docutils]
>> Documentation/arch/arm/zte/zx297520v3.rst:90: WARNING: Enumerated list ends without a blank line; unexpected unindent. [docutils]
>> Documentation/arch/arm/zte/zx297520v3.rst:116: WARNING: Inline literal start-string without end-string. [docutils]
   Documentation/arch/arm/zte/zx297520v3.rst:137: ERROR: Unexpected indentation. [docutils]
>> Documentation/arch/arm/zte/zx297520v3.rst:138: WARNING: Block quote ends without a blank line; unexpected unindent. [docutils]
   Documentation/arch/arm/zte/zx297520v3.rst:164: WARNING: Inline literal start-string without end-string. [docutils]
>> Documentation/arch/arm/zte/zx297520v3.rst:164: WARNING: Inline interpreted text or phrase reference start-string without end-string. [docutils]
>> Documentation/arch/arm/zte/zx297520v3.rst:7: WARNING: Document or section may not begin with a transition. [docutils]
   Documentation/arch/riscv/zicfilp.rst:79: WARNING: Inline literal start-string without end-string. [docutils]
   Documentation/core-api/kref:328: ./include/linux/kref.h:72: WARNING: Invalid C declaration: Expected end of definition. [error at 96]
   int kref_put_mutex (struct kref *kref, void (*release)(struct kref *kref), struct mutex *mutex) __cond_acquires(true# mutex)
   ------------------------------------------------------------------------------------------------^
   Documentation/core-api/kref:328: ./include/linux/kref.h:94: WARNING: Invalid C declaration: Expected end of definition. [error at 92]


vim +66 Documentation/arch/arm/zte/zx297520v3.rst

     6	
   > 7	...............................................................................
     8	
     9	Author:	Stefan Dösinger
    10	
    11	Date  : 27 Jan 2026
    12	
    13	1. Hardware description
    14	---------------------------
    15	Zx297520v3 SoCs use a 64 bit capable Cortex-A53 CPU and GICv3, although they
    16	run in arm32 mode only. The CPU has support EL3, but no hypervisor (EL2) and
    17	it seems to lack VFP and NEON.
    18	
    19	The SoC is used in a number of cheap LTE to WiFi routers, both battery powered
    20	MiFis and stationary CPEs. In addition to the CPU these devices usually have
    21	64 MB Ram (although some is shared with the LTE chip), 128 MB NAND flash, an
    22	SDIO connected RTL8192-type Wifi chip limited to 2.4 ghz operation, USB 2,
    23	and buttons. Devices with as low as 32 MB or as high as 128 MB ram exist, as
    24	do devices with 8 or 16 MB of NOR flash.
    25	
    26	Some devices, especially the stationary ones, have 100 mbit Ethernet and an
    27	Ethernet switch.
    28	
    29	Usually the devices have LEDs for status indication, although some have SPI or
    30	I2C connected displays
    31	
    32	Some have an SD card slot. If it exists, it is a better choice for the root
    33	file system because it easily outperforms the built-in NAND.
    34	
    35	The LTE interface runs on a separate DSP called ZSP880. It is probably derived
    36	from LSI ZSPs and has an undocumented instruction set. The ZSP communicates
    37	with the main CPU via SRAM and DRAM and a mailbox hardware that can generate
    38	IRQs on either ends.
    39	
    40	There is also a Cortex M0 CPU, which is responsible for early HW initialization
    41	and starting the Cortex A53 CPU. It does not have any essential purpose once
    42	U-Boot is started. A SRAM-Based handover protocol exists to run custom code on
    43	this CPU.
    44	
    45	2. Booting via USB
    46	---------------------------
    47	
    48	The Boot ROM has support for booting custom code via USB. This mode can be
    49	entered by connecting a Boot PIN to GND or by modifying the third byte on NAND
    50	(set it to anything other than 0x5A aka 'Z'). A free software tool to start
    51	custom U-Boot and kernels can be found here:
    52	
    53	https://github.com/zx297520v3-mainline/zx297520v3-loader
    54	
    55	If USB download mode is entered but no boot commands are sent through USB, the
    56	device will proceed to boot normally after a few seconds. It is therefore
    57	possible to enable USB boot permanently and still leave the default boot files
    58	in place.
    59	
    60	https://github.com/zx297520v3-mainline/u-boot-mainline
    61	
    62	Contains a U-Boot version that can be used with the USB loader and sets up the
    63	CPU and interrupt controller to comply with Linux's booting requirements.
    64	
    65	3. Building for built-in U-Boot
  > 66	---------------------------
    67	The devices come with an ancient U-Boot that loads legacy uImages from NAND and
    68	boots them without a chance for the user to interrupt. The images are stored in
    69	files ap_cpuap.bin and ap_recovery.bin on a jffs2 partition named imagefs,
    70	usually mtd4. A file named "fotaflag" switches between the two modes.
    71	
    72	In addition to the uImage header, those files have a 384 byte signature header,
    73	which is used for authenticating the images on some devices. Most devices have
    74	this authentication disabled and it is enough to pad the uImage files with 384
    75	zero bytes.
    76	
    77	Builtin U-Boot also poorly sets up the CPU. Read the next section for details
    78	on this. It has no support for loading DTBs, so CONFIG_ARM_APPENDED_DTB is
    79	needed.
    80	
    81	So to build an image that boots from NAND the following steps are necessary:
    82	
    83	1) Patch the assembly code from section 4 into arch/arm/kernel/head.S.
    84	2) make zx29_defconfig
    85	3) make [-j x]
    86	4) cat arch/arm/boot/zImage arch/arm/boot/dts/zte/[device].dtb > kernel+dtb
    87	5) mkimage -A arm -O linux -T kernel -C none -a 0x20008000 -d kernel+dtb uimg
    88	6) dd if=/dev/zero bs=1 count=384 of=ap_recovery.bin
    89	7) cat uimg >> ap_recovery.bin
  > 90	8) Place this file onto imagefs on the device. Delete ap_cpuap.bin if the
    91	free space is not enough.
    92	9) Create the file fotaflag: echo -n FOTA-RECOVERY > fotaflag
    93	
    94	For development, booting ap_recovery.bin is recommended because the normal boot
    95	mode arms the watchdog before starting the kernel.
    96	
    97	4. CPU and GIC Setup
    98	---------------------------
    99	
   100	Generally CPU and GICv3 need to be set up according to the requirements spelled
   101	out in Documentation/arch/arm64/booting.rst. For zx297520v3 this means:
   102	
   103	1. GICD_CTLR.DS=1 to disable GIC security
   104	2. Enable access to ICC_SRE
   105	3. Disable trapping IRQs into monitor mode
   106	4. Configure EL2 and below to run in insecure mode.
   107	5. Configure timer PPIs to active-low.
   108	
   109	The kernel sources provided by ZTE do not boot either (interrupts do not work
   110	at all). They are incomplete in other aspects too, so it is assumed that there
   111	is some workaround similar to the one described in this document somewhere in
   112	the binary blobs.
   113	
   114	The assembly code below is given as an example of how to achieve this:
   115	
 > 116	```
   117	#include <linux/irqchip/arm-gic-v3.h>
   118	#include <asm/assembler.h>
   119	#include <asm/cp15.h>
   120	
   121	@ Detect sane bootloaders and skip the hack
   122	ldr	r3, =0xf2000000
   123	ldr	r3, [r3]
   124	ldr	r4, =(GICD_CTLR_ARE_NS | GICD_CTLR_DS)
   125	cmp	r3, r4
   126	beq	skip_zx_hack
   127	@ This allows EL1 to handle ints that are normally handled by EL2/3.
   128	ldr	r3, =0xf2000000
   129	str     r4, [r3]
   130	
   131	cps     #MON_MODE
   132	
   133	@ Work in non-secure physical address space: SCR_EL3.NS = 1. At least the UART
   134	@ seems to respond only to non-secure addresses. I have taken inspiration from
   135	@ Raspberry pi's armstub7.S here.
   136	mov	r3, #0x131			@ non-secure, Make F, A bits in CPSR writeable
   137						@ Allow hypervisor call.
 > 138	mcr     p15, 0, r3, c1, c1, 0
   139	
   140	@ AP_PPI_MODE_REG: Configure timer PPIs (10, 11, 13, 14) to active-low.
   141	ldr	r3, =0xF22020a8
   142	ldr	r4, =0x50
   143	str	r4, [r3]
   144	ldr	r3, =0xF22020ac
   145	ldr	r4, =0x14
   146	str	r4, [r3]
   147	
   148	@ Enable EL2 access to ICC_SRE (bit 3, ICC_SRE_EL3.Enable). Enable system reg
   149	@ access to GICv3 registers (bit 0, ICC_SRE_EL3.SRE) for EL1 and EL3.
   150	mrc	p15, 6, r3, c12, c12, 5         @ ICC_SRE_EL3
   151	orr	r3, #0x9                        @ FIXME: No defines for SRE_EL3 values?
   152	mcr	p15, 6, r3, c12, c12, 5
   153	mrc	p15, 0, r3, c12, c12, 5         @ ICC_SRE_EL1
   154	orr	r3, #(ICC_SRE_EL1_SRE)
   155	mcr	p15, 0, r3, c12, c12, 5
   156	
   157	@ Like ICC_SRE_EL3, enable EL1 access to ICC_SRE and system register access
   158	@ for EL2.
   159	mrc	p15, 4, r3, c12, c9, 5          @ ICC_SRE_EL2 aka ICC_HSRE
   160	orr	r3, r3, #(ICC_SRE_EL2_ENABLE | ICC_SRE_EL2_SRE)
   161	mcr	p15, 4, r3, c12, c9, 5
   162	isb
   163	
 > 164	@ Back to SVC mode

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


^ permalink raw reply

* Re: [PATCH] Bluetooth: btmtk: remove extra copy in cmd array init
From: Jiajia Liu @ 2026-05-21  2:26 UTC (permalink / raw)
  To: Luiz Augusto von Dentz
  Cc: Marcel Holtmann, Matthias Brugger, AngeloGioacchino Del Regno,
	linux-bluetooth, linux-kernel, linux-arm-kernel, linux-mediatek
In-Reply-To: <CABBYNZLLNFNuSv0UBNQ7C2HTTg5W2m41hBTNpPw822GMAVNuhQ@mail.gmail.com>

On Wed, May 20, 2026 at 08:55:46AM -0400, Luiz Augusto von Dentz wrote:
> Hi Jiajia,
> 
> On Tue, May 19, 2026 at 10:15 PM Jiajia Liu <liujiajia@kylinos.cn> wrote:
> >
> > In btmtk_setup_firmware_79xx, the data length indicated by wmt_params.dlen
> > in the cmd buffer is MTK_SEC_MAP_NEED_SEND_SIZE + 1. Except for the first
> > byte, the remaining length is MTK_SEC_MAP_NEED_SEND_SIZE. memcpy copied one
> > more byte to cmd + 1 than the remaining length. Align the length passed to
> > memcpy to avoid exceeding current section map.
> >
> > Signed-off-by: Jiajia Liu <liujiajia@kylinos.cn>
> > ---
> >  drivers/bluetooth/btmtk.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> >
> > diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c
> > index ea7a031000cd..53cba71cb07f 100644
> > --- a/drivers/bluetooth/btmtk.c
> > +++ b/drivers/bluetooth/btmtk.c
> > @@ -188,7 +188,7 @@ int btmtk_setup_firmware_79xx(struct hci_dev *hdev, const char *fwname,
> >                                        MTK_FW_ROM_PATCH_GD_SIZE +
> >                                        MTK_FW_ROM_PATCH_SEC_MAP_SIZE * i +
> >                                        MTK_SEC_MAP_COMMON_SIZE,
> > -                                      MTK_SEC_MAP_NEED_SEND_SIZE + 1);
> > +                                      MTK_SEC_MAP_NEED_SEND_SIZE);
> >
> >                                 wmt_params.op = BTMTK_WMT_PATCH_DWNLD;
> >                                 wmt_params.status = &status;
> > --
> > 2.53.0
> >
> 
> Have you tested this on the actual hardware? If not we need a Tested-by.

Yes, I have tested with MT7922 (0489:e0d8) on Linux 7.1-rc4 with this patch
and the following two applied.

Bluetooth: btmtk: accept too short WMT FUNC_CTRL events
Bluetooth: btmtk: fix urb->setup_packet leak in error paths

setup log of boot and rfkill switch:

$ dmesg | grep hci0

[    6.108240] Bluetooth: hci0: HW/SW Version: 0x008a008a, Build Time: 20260224103448
[    8.933508] Bluetooth: hci0: Device setup in 2765295 usecs
[    8.938846] Bluetooth: hci0: HCI Enhanced Setup Synchronous Connection command is advertised, but not supported.
[   57.209143] Bluetooth: hci0: HW/SW Version: 0x008a008a, Build Time: 20260224103448
[   57.366004] Bluetooth: hci0: Device setup in 160450 usecs
[   57.371248] Bluetooth: hci0: HCI Enhanced Setup Synchronous Connection command is advertised, but not supported.
[  203.687643] Bluetooth: hci0: HW/SW Version: 0x008a008a, Build Time: 20260224103448
[  203.844163] Bluetooth: hci0: Device setup in 158989 usecs
[  203.849426] Bluetooth: hci0: HCI Enhanced Setup Synchronous Connection command is advertised, but not supported.
[  214.723250] Bluetooth: hci0: HW/SW Version: 0x008a008a, Build Time: 20260224103448
[  214.879380] Bluetooth: hci0: Device setup in 155239 usecs
[  214.884644] Bluetooth: hci0: HCI Enhanced Setup Synchronous Connection command is advertised, but not supported.

> 
> -- 
> Luiz Augusto von Dentz


^ permalink raw reply

* [PATCH v2] arm64: dts: ti: Add LincolnTech OLDI LCD-185 Overlay for AM625-BeaglePlay
From: Swamil Jain @ 2026-05-21  2:06 UTC (permalink / raw)
  To: nm, vigneshr, kristo, robh, krzk+dt, conor+dt, tomi.valkeinen
  Cc: r-sharma3, devarsht, praneeth, linux-arm-kernel, devicetree,
	linux-kernel, s-jain1

From: Aradhya Bhatia <a-bhatia1@ti.com>

The panel is Lincoln Technology Solutions LCD185-101CT[0]. It is a
Dual-Link LVDS panel and supports WUXGA resolution (1920x1200).
Furthermore, it has an i2c based touch controller: Goodix-GT928.

Add DT overlay for the OLDI panel to connect with BeaglePlay platform.

[0]: https://lincolntechsolutions.com/wp-content/uploads/2024/09/LCD185-101CTL1ARNTT_DS_R1.3.pdf

Signed-off-by: Aradhya Bhatia <a-bhatia1@ti.com>
Signed-off-by: Swamil Jain <s-jain1@ti.com>
---
Changelog:
v1->v2:
Move overlay-specific pinmux configurations from base device tree to
k3-am625-beagleplay-lincolntech-lcd185-panel.dtso file.

link to v1: https://lore.kernel.org/all/20260514225502.2327771-1-s-jain1@ti.com/
---
 arch/arm64/boot/dts/ti/Makefile               |   4 +
 ...5-beagleplay-lincolntech-lcd185-panel.dtso | 165 ++++++++++++++++++
 2 files changed, 169 insertions(+)
 create mode 100644 arch/arm64/boot/dts/ti/k3-am625-beagleplay-lincolntech-lcd185-panel.dtso

diff --git a/arch/arm64/boot/dts/ti/Makefile b/arch/arm64/boot/dts/ti/Makefile
index 7642c06ca834..f0436a102fce 100644
--- a/arch/arm64/boot/dts/ti/Makefile
+++ b/arch/arm64/boot/dts/ti/Makefile
@@ -12,6 +12,7 @@
 dtb-$(CONFIG_ARCH_K3) += k3-am625-beagleplay.dtb
 dtb-$(CONFIG_ARCH_K3) += k3-am625-beagleplay-csi2-ov5640.dtbo
 dtb-$(CONFIG_ARCH_K3) += k3-am625-beagleplay-csi2-tevi-ov5640.dtbo
+dtb-$(CONFIG_ARCH_K3) += k3-am625-beagleplay-lincolntech-lcd185-panel.dtbo
 dtb-$(CONFIG_ARCH_K3) += k3-am625-phyboard-lyra-rdk.dtb
 dtb-$(CONFIG_ARCH_K3) += k3-am625-sk.dtb
 dtb-$(CONFIG_ARCH_K3) += k3-am625-tqma62xx-mba62xx.dtb
@@ -177,6 +178,8 @@ k3-am625-beagleplay-csi2-ov5640-dtbs := k3-am625-beagleplay.dtb \
 	k3-am625-beagleplay-csi2-ov5640.dtbo
 k3-am625-beagleplay-csi2-tevi-ov5640-dtbs := k3-am625-beagleplay.dtb \
 	k3-am625-beagleplay-csi2-tevi-ov5640.dtbo
+k3-am625-beagleplay-lincolntech-lcd185-panel-dtbs := k3-am625-beagleplay.dtb \
+	k3-am625-beagleplay-lincolntech-lcd185-panel.dtbo
 k3-am625-phyboard-lyra-disable-eth-phy-dtbs := k3-am625-phyboard-lyra-rdk.dtb \
 	k3-am6xx-phycore-disable-eth-phy.dtbo
 k3-am625-phyboard-lyra-disable-rtc-dtbs := k3-am625-phyboard-lyra-rdk.dtb \
@@ -287,6 +290,7 @@ k3-j784s4-evm-usxgmii-exp1-exp2-dtbs := k3-j784s4-evm.dtb \
 	k3-j784s4-evm-usxgmii-exp1-exp2.dtbo
 dtb- += k3-am625-beagleplay-csi2-ov5640.dtb \
 	k3-am625-beagleplay-csi2-tevi-ov5640.dtb \
+	k3-am625-beagleplay-lincolntech-lcd185-panel.dtb \
 	k3-am625-phyboard-lyra-disable-eth-phy.dtb \
 	k3-am625-phyboard-lyra-disable-rtc.dtb \
 	k3-am625-phyboard-lyra-disable-spi-nor.dtb \
diff --git a/arch/arm64/boot/dts/ti/k3-am625-beagleplay-lincolntech-lcd185-panel.dtso b/arch/arm64/boot/dts/ti/k3-am625-beagleplay-lincolntech-lcd185-panel.dtso
new file mode 100644
index 000000000000..e7cadd48d439
--- /dev/null
+++ b/arch/arm64/boot/dts/ti/k3-am625-beagleplay-lincolntech-lcd185-panel.dtso
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-2.0-or-later OR MIT
+/**
+ * Lincoln tech Solutions OLDI panel (LCD185-101CT) and touch DT overlay for AM625-BeaglePlay
+ *
+ * AM625-BeaglePlay: https://www.beagleboard.org/boards/beagleplay
+ * Panel datasheet: https://lincolntechsolutions.com/wp-content/uploads/2024/09/LCD185-101CTL1ARNTT_DS_R1.3.pdf
+ *
+ * Copyright (C) 2026 Texas Instruments Incorporated - http://www.ti.com/
+ */
+
+/dts-v1/;
+/plugin/;
+
+#include <dt-bindings/gpio/gpio.h>
+#include <dt-bindings/interrupt-controller/irq.h>
+#include "k3-pinctrl.h"
+
+&{/} {
+	backlight: backlight {
+		compatible = "pwm-backlight";
+		pinctrl-names = "default";
+		pinctrl-0 = <&backlight_pins_default>;
+		brightness-levels = <0 4 8 16 32 64 128 255>;
+		default-brightness-level = <6>;
+		enable-gpios = <&main_gpio0 0 GPIO_ACTIVE_HIGH>;
+		pwms = <&epwm0 1 20000 0>;
+	};
+
+	lcd {
+		compatible = "lincolntech,lcd185-101ct";
+		backlight = <&backlight>;
+		/*
+		* Note that the OLDI TX 0 transmits the odd set of pixels
+		* while the OLDI TX 1 transmits the even set. This is a
+		* fixed configuration in the IP integration and is not
+		* changeable. The properties, "dual-lvds-odd-pixels" and
+		* "dual-lvds-even-pixels" have been used to merely
+		* identify if a Dual Link configuration is required.
+		* Swapping them will cause an error in the dss oldi driver.
+		*/
+		power-supply = <&vsys_5v0>;
+		ports {
+			#address-cells = <1>;
+			#size-cells = <0>;
+
+			port@0 {
+				reg = <0>;
+				dual-lvds-odd-pixels;
+				lcd_in0: endpoint {
+					remote-endpoint = <&oldi_0_out>;
+				};
+			};
+			port@1 {
+				reg = <1>;
+				dual-lvds-even-pixels;
+				lcd_in1: endpoint {
+					remote-endpoint = <&oldi_1_out>;
+				};
+			};
+		};
+	};
+};
+
+&main_pmx0 {
+	touchscreen_pins_default: touchscreen-default-pins {
+		pinctrl-single,pins = <
+			AM62X_IOPAD(0x01b4, PIN_OUTPUT, 7) /* (A13) SPI0_CS0.GPIO1_15 */
+			AM62X_IOPAD(0x00a0, PIN_INPUT, 7) /* (K25) GPMC0_WPn.GPIO0_39 */
+		>;
+	};
+
+	backlight_pins_default: bl-default-pins {
+		pinctrl-single,pins = <
+			AM62X_IOPAD(0x0000, PIN_OUTPUT, 7) /* (H24) OSPI0_CLK.GPIO0_0 */
+			AM62X_IOPAD(0x01b8, PIN_OUTPUT, 2) /* (C13) SPI0_CS1.EHRPWM0_B */
+		>;
+	};
+};
+
+&dss {
+	status = "okay";
+};
+
+&oldi0 {
+	status = "okay";
+	ti,companion-oldi = <&oldi1>;
+};
+
+&oldi1 {
+	status = "okay";
+	ti,secondary-oldi;
+	ti,companion-oldi = <&oldi0>;
+};
+
+&oldi0_port0 {
+	oldi_0_in: endpoint {
+		remote-endpoint = <&dpi0_out0>;
+	};
+};
+
+&oldi0_port1 {
+	oldi_0_out: endpoint {
+		remote-endpoint = <&lcd_in0>;
+	};
+};
+
+&oldi1_port0 {
+	oldi_1_in: endpoint {
+		remote-endpoint = <&dpi0_out1>;
+	};
+};
+
+&oldi1_port1 {
+	oldi_1_out: endpoint {
+		remote-endpoint = <&lcd_in1>;
+	};
+};
+
+&dss_ports {
+	#address-cells = <1>;
+	#size-cells = <0>;
+
+	/* VP1: Output to OLDI */
+	port@0 {
+		reg = <0>;
+		#address-cells = <1>;
+		#size-cells = <0>;
+
+		dpi0_out0: endpoint@0 {
+			reg = <0>;
+			remote-endpoint = <&oldi_0_in>;
+		};
+		dpi0_out1: endpoint@1 {
+			reg = <1>;
+			remote-endpoint = <&oldi_1_in>;
+		};
+	};
+};
+
+&main_i2c2 {
+	#address-cells = <1>;
+	#size-cells = <0>;
+
+	eeprom@57 {
+		compatible = "atmel,24c256";
+		reg = <0x57>;
+	};
+
+	touchscreen@5d {
+		compatible = "goodix,gt928";
+		reg = <0x5d>;
+		pinctrl-names = "default";
+	        pinctrl-0 = <&touchscreen_pins_default>;
+		interrupt-parent = <&main_gpio0>;
+		interrupts = <39 IRQ_TYPE_EDGE_FALLING>;
+		irq-gpios = <&main_gpio0 39 GPIO_ACTIVE_HIGH>;
+		reset-gpios = <&main_gpio1 15 GPIO_ACTIVE_HIGH>;
+		touchscreen-size-x = <1920>;
+		touchscreen-size-y = <1200>;
+	};
+};
+
+&epwm0 {
+	status = "okay";
+};


^ permalink raw reply related

* [PATCH v3 2/3] clk: nuvoton: ma35d1: fix PLL_CTL1_FRAC bit field width and fractional calc
From: Joey Lu @ 2026-05-21  1:42 UTC (permalink / raw)
  To: mturquette, sboyd
  Cc: ychuang3, schung, yclu4, linux-arm-kernel, linux-clk,
	linux-kernel, Joey Lu
In-Reply-To: <20260521014220.77955-1-a0987203069@gmail.com>

PLL_CTL1_FRAC was defined as GENMASK(31, 24), covering only 8 bits.
The hardware fractional field occupies bits [31:8] (24 bits), so the
mask must be GENMASK(31, 8).

The previous fractional-mode calculation used FIELD_MAX(PLL_CTL1_FRAC)
as the denominator to obtain 2 decimal places.  With the corrected 24-bit
mask the old divisor is wrong; replace the arithmetic with a proper
24-bit fixed-point rounding to 3 decimal places using the kernel's
DIV_ROUND_CLOSEST_ULL helper:

  n_frac = n * 1000 + DIV_ROUND_CLOSEST_ULL(x * 1000, 1 << 24)

Fixes: 691521a367cf ("clk: nuvoton: Add clock driver for ma35d1 clock controller")
Signed-off-by: Joey Lu <a0987203069@gmail.com>
---
 drivers/clk/nuvoton/clk-ma35d1-pll.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/drivers/clk/nuvoton/clk-ma35d1-pll.c b/drivers/clk/nuvoton/clk-ma35d1-pll.c
index bfedd45bd04b..eb9d69d2077b 100644
--- a/drivers/clk/nuvoton/clk-ma35d1-pll.c
+++ b/drivers/clk/nuvoton/clk-ma35d1-pll.c
@@ -48,7 +48,7 @@
 #define PLL_CTL1_PD		BIT(0)
 #define PLL_CTL1_BP		BIT(1)
 #define PLL_CTL1_OUTDIV		GENMASK(6, 4)
-#define PLL_CTL1_FRAC		GENMASK(31, 24)
+#define PLL_CTL1_FRAC		GENMASK(31, 8)
 #define PLL_CTL2_SLOPE		GENMASK(23, 0)
 
 #define INDIV_MIN		1
@@ -113,9 +113,9 @@ static unsigned long ma35d1_calc_pll_freq(u8 mode, u32 *reg_ctl, unsigned long p
 		pll_freq = div_u64(pll_freq, m * p);
 	} else {
 		x = FIELD_GET(PLL_CTL1_FRAC, reg_ctl[1]);
-		/* 2 decimal places floating to integer (ex. 1.23 to 123) */
-		n = n * 100 + ((x * 100) / FIELD_MAX(PLL_CTL1_FRAC));
-		pll_freq = div_u64(parent_rate * n, 100 * m * p);
+		/* convert 24-bit fraction to 3 decimal digits, rounding to closest */
+		n = n * 1000 + DIV_ROUND_CLOSEST_ULL((u64)x * 1000, 1ULL << 24);
+		pll_freq = div_u64((u64)parent_rate * n, 1000 * m * p);
 	}
 	return pll_freq;
 }
-- 
2.43.0



^ permalink raw reply related

* [PATCH v3 1/3] clk: nuvoton: ma35d1: fix ignored div_u64 return values in PLL freq calculation
From: Joey Lu @ 2026-05-21  1:42 UTC (permalink / raw)
  To: mturquette, sboyd
  Cc: ychuang3, schung, yclu4, linux-arm-kernel, linux-clk,
	linux-kernel, Joey Lu, Brian Masney
In-Reply-To: <20260521014220.77955-1-a0987203069@gmail.com>

div_u64() does not modify its argument in place; the return value must
be assigned.  Both ma35d1_calc_smic_pll_freq() and ma35d1_calc_pll_freq()
called div_u64() and discarded the result, leaving pll_freq holding the
undivided product and thus returning a frequency orders of magnitude too
high.

Fixes: 691521a367cf ("clk: nuvoton: Add clock driver for ma35d1 clock controller")

Reviewed-by: Brian Masney <bmasney@redhat.com>
Signed-off-by: Joey Lu <a0987203069@gmail.com>
---
 drivers/clk/nuvoton/clk-ma35d1-pll.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/clk/nuvoton/clk-ma35d1-pll.c b/drivers/clk/nuvoton/clk-ma35d1-pll.c
index 4620acfe47e8..bfedd45bd04b 100644
--- a/drivers/clk/nuvoton/clk-ma35d1-pll.c
+++ b/drivers/clk/nuvoton/clk-ma35d1-pll.c
@@ -92,7 +92,7 @@ static unsigned long ma35d1_calc_smic_pll_freq(u32 pll0_ctl0,
 	p = FIELD_GET(SPLL0_CTL0_OUTDIV, pll0_ctl0);
 	outdiv = 1 << p;
 	pll_freq = (u64)parent_rate * n;
-	div_u64(pll_freq, m * outdiv);
+	pll_freq = div_u64(pll_freq, m * outdiv);
 	return pll_freq;
 }
 
@@ -110,7 +110,7 @@ static unsigned long ma35d1_calc_pll_freq(u8 mode, u32 *reg_ctl, unsigned long p
 
 	if (mode == PLL_MODE_INT) {
 		pll_freq = (u64)parent_rate * n;
-		div_u64(pll_freq, m * p);
+		pll_freq = div_u64(pll_freq, m * p);
 	} else {
 		x = FIELD_GET(PLL_CTL1_FRAC, reg_ctl[1]);
 		/* 2 decimal places floating to integer (ex. 1.23 to 123) */
-- 
2.43.0



^ permalink raw reply related

* [PATCH v3 3/3] clk: nuvoton: ma35d1: fix ma35d1_clk_pll_determine_rate logic
From: Joey Lu @ 2026-05-21  1:42 UTC (permalink / raw)
  To: mturquette, sboyd
  Cc: ychuang3, schung, yclu4, linux-arm-kernel, linux-clk,
	linux-kernel, Joey Lu
In-Reply-To: <20260521014220.77955-1-a0987203069@gmail.com>

ma35d1_clk_pll_determine_rate() called ma35d1_pll_find_closest()
unconditionally before the switch statement, and then every case
branch overwrote pll_freq by reading the current hardware registers.
For CAPLL and DDRPLL this means find_closest() ran unnecessarily
(and incorrectly, since those PLLs are read-only) and its result
was silently discarded.

Fix by moving the find_closest() call inside the APLL/EPLL/VPLL
branch where it belongs.  Group CAPLL and DDRPLL together as
read-only PLLs that simply report their current rate; handle them
with an explicit if/else to keep the CAPLL (SMIC design) and DDRPLL
(standard design) paths distinct.

Fixes: 691521a367cf ("clk: nuvoton: Add clock driver for ma35d1 clock controller")
Signed-off-by: Joey Lu <a0987203069@gmail.com>
---
 drivers/clk/nuvoton/clk-ma35d1-pll.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/drivers/clk/nuvoton/clk-ma35d1-pll.c b/drivers/clk/nuvoton/clk-ma35d1-pll.c
index eb9d69d2077b..c7c0dc91a012 100644
--- a/drivers/clk/nuvoton/clk-ma35d1-pll.c
+++ b/drivers/clk/nuvoton/clk-ma35d1-pll.c
@@ -255,32 +255,32 @@ static int ma35d1_clk_pll_determine_rate(struct clk_hw *hw,
 	if (req->best_parent_rate < PLL_FREF_MIN_FREQ || req->best_parent_rate > PLL_FREF_MAX_FREQ)
 		return -EINVAL;
 
-	ret = ma35d1_pll_find_closest(pll, req->rate, req->best_parent_rate,
-				      reg_ctl, &pll_freq);
-	if (ret < 0)
-		return ret;
-
 	switch (pll->id) {
 	case CAPLL:
+	case DDRPLL:
+		/* Read-only PLLs: return current rate */
 		reg_ctl[0] = readl_relaxed(pll->ctl0_base);
-		pll_freq = ma35d1_calc_smic_pll_freq(reg_ctl[0], req->best_parent_rate);
+		if (pll->id == CAPLL) {
+			pll_freq = ma35d1_calc_smic_pll_freq(reg_ctl[0], req->best_parent_rate);
+		} else {
+			reg_ctl[1] = readl_relaxed(pll->ctl1_base);
+			pll_freq = ma35d1_calc_pll_freq(pll->mode, reg_ctl, req->best_parent_rate);
+		}
 		req->rate = pll_freq;
-
 		return 0;
-	case DDRPLL:
 	case APLL:
 	case EPLL:
 	case VPLL:
-		reg_ctl[0] = readl_relaxed(pll->ctl0_base);
-		reg_ctl[1] = readl_relaxed(pll->ctl1_base);
-		pll_freq = ma35d1_calc_pll_freq(pll->mode, reg_ctl, req->best_parent_rate);
+		/* Configurable PLLs: find closest achievable rate */
+		ret = ma35d1_pll_find_closest(pll, req->rate, req->best_parent_rate,
+					      reg_ctl, &pll_freq);
+		if (ret < 0)
+			return ret;
 		req->rate = pll_freq;
-
 		return 0;
 	}
 
 	req->rate = 0;
-
 	return 0;
 }
 
-- 
2.43.0



^ permalink raw reply related

* [PATCH v3 0/3] clk: nuvoton: ma35d1: fix PLL frequency calculation
From: Joey Lu @ 2026-05-21  1:42 UTC (permalink / raw)
  To: mturquette, sboyd
  Cc: ychuang3, schung, yclu4, linux-arm-kernel, linux-clk,
	linux-kernel, Joey Lu

Fix four bugs in the MA35D1 PLL clock driver that cause incorrect
frequency values returned from recalc_rate() and determine_rate().

v1 combined all fixes into a single commit.  At reviewer request,
split into one patch per logical fix:

  1/3 - fix div_u64 return value being discarded (affects both
        ma35d1_calc_smic_pll_freq and ma35d1_calc_pll_freq INT mode)

  2/3 - fix PLL_CTL1_FRAC mask width (8-bit -> 24-bit) and update
        the fractional-mode arithmetic accordingly

  3/3 - fix ma35d1_clk_pll_determine_rate: move find_closest() into
        the configurable-PLL branch; unify read-only PLL handling

Changes in v3 (vs v2):
  - 2/3: replace the manual round-to-nearest expression
    "(u32)(((u64)x * 1000 + 500) >> 24)" with the kernel helper
    DIV_ROUND_CLOSEST_ULL((u64)x * 1000, 1ULL << 24); the result
    is mathematically identical but more readable and idiomatic

Joey Lu (3):
  clk: nuvoton: ma35d1: fix ignored div_u64 return values in PLL freq
    calculation
  clk: nuvoton: ma35d1: fix PLL_CTL1_FRAC bit field width and fractional
    calc
  clk: nuvoton: ma35d1: fix ma35d1_clk_pll_determine_rate logic

 drivers/clk/nuvoton/clk-ma35d1-pll.c | 38 ++++++++++++++--------------
 1 file changed, 19 insertions(+), 19 deletions(-)

-- 
2.43.0



^ permalink raw reply

* Re: [PATCH v2] media: rkvdec: fix PM runtime teardown ordering in remove
From: Nicolas Dufresne @ 2026-05-21  1:10 UTC (permalink / raw)
  To: Francesco Saverio Pavone, jonas, detlev.casanova, hverkuil,
	mchehab
  Cc: ezequiel, heiko, stable, linux-media, linux-rockchip,
	linux-arm-kernel, linux-kernel
In-Reply-To: <f9e63fbbd99c11f303ac8e8f5aec6b2bd528cf99.camel@collabora.com>

[-- Attachment #1: Type: text/plain, Size: 4150 bytes --]

Le mercredi 20 mai 2026 à 20:51 -0400, Nicolas Dufresne a écrit :
> Le lundi 18 mai 2026 à 16:54 +0200, Francesco Saverio Pavone a écrit :
> > From: Jonas Karlman <jonas@kwiboo.se>
> > 
> > The current remove() path calls rkvdec_v4l2_cleanup() and
> > pm_runtime_disable() before pm_runtime_dont_use_autosuspend(), and
> > frees the empty IOMMU domain after that. With autosuspend still
> > armed when the domain goes away, the VDPU381 can be left in a dirty
> > state across module reload and suspend/resume cycles.
> > 
> > On RK3588 this surfaces as a VP9 inter-prediction bug: from the
> > second ALTREF frame onward, motion blocks decode with U=V=0 (BT.709
> > green), while intra and static blocks stay correct. Reordering the
> > teardown to dont_use_autosuspend() -> iommu_domain_free() ->
> > pm_runtime_disable() -> v4l2_cleanup() makes the symptom go away.
> > 
> > Tested on a Radxa Rock 5B+ (RK3588, 8 GB LPDDR5) with both the
> > libva-v4l2-request mpv pipeline and Chromium's V4L2 stateless
> > decoder. With the fix, 300 random pixel samples on VP9 Profile 0
> > clips at 1080p and 1440p match a libvpx software reference exactly
> > (worst delta 0). Without it, the same 1080p sample at frame 4,
> > pixel (960, 270) reads HW=(0,112,0) vs SW=(204,147,116). HEVC and
> > H.264 stateless decoding via mpv keep running on hardware with no
> > fallback.
> > 
> > Fixes: ff8c5622f9f7 ("media: rkvdec: Restore iommu addresses on errors")
> > Cc: <stable@vger.kernel.org>
> > Signed-off-by: Jonas Karlman <jonas@kwiboo.se>
> > Tested-by: Francesco Saverio Pavone <pavone.lawyer@gmail.com>
> > Signed-off-by: Francesco Saverio Pavone <pavone.lawyer@gmail.com>
> 
> Tested-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
> Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
> 
> cheers,
> Nicolas
> 
> > ---
> > Changes in v2:
> >  - Add Cc: <stable@vger.kernel.org>; media-CI flagged that the
> >    Fixes: target (ff8c5622f9f7) is present in the 6.17, 6.18, 6.19
> >    and 7.0 stable branches, so the fix should reach them too.
> >    Link to v1:
> > https://lore.kernel.org/all/20260518105413.42147-1-pavone.lawyer@gmail.com/
> >    Media-CI report:
> > https://linux-media.pages.freedesktop.org/-/users/patchwork/-/jobs/100124849/artifacts/report.htm
> > 
> >  drivers/media/platform/rockchip/rkvdec/rkvdec.c | 5 +++--
> >  1 file changed, 3 insertions(+), 2 deletions(-)
> > 
> > diff --git a/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> > b/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> > index 6f5f0422d317..bb95b090a25b 100644
> > --- a/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> > +++ b/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> > @@ -2066,12 +2066,13 @@ static void rkvdec_remove(struct platform_device
> > *pdev)
> >  
> >  	cancel_delayed_work_sync(&rkvdec->watchdog_work);
> >  
> > -	rkvdec_v4l2_cleanup(rkvdec);
> > -	pm_runtime_disable(&pdev->dev);
> >  	pm_runtime_dont_use_autosuspend(&pdev->dev);
> >  
> >  	if (rkvdec->empty_domain)
> >  		iommu_domain_free(rkvdec->empty_domain);
> > +
> > +	pm_runtime_disable(&pdev->dev);
> > +	rkvdec_v4l2_cleanup(rkvdec);

After consulting the sashiko.dev report, I reconsidered the fix. The problem
seems to pre-exist this patch, but the patch makes it a little worse. Basically,
userspace can still open and call into the API until rkvdec_v4l2_cleanup() is called.

I didn't research this too much, but maybe you can extract:

	media_device_unregister(&rkvdec->mdev);
	video_unregister_device(&rkvdec->vdev);

And move these to the top of the remove function. This will prevent further
access by userspace, avoiding races. While at it, remove the useless
rkvdec_v4l2_cleanup() helper and merge it in; it's only used once.

For the rest of your report, I'm under the impression remove won't be called
unless all the open devices have been closed, which will call
v4l2_m2m_ctx_release(), which synchronously aborts any pending job.

https://sashiko.dev/#/patchset/20260518145414.64514-1-pavone.lawyer%40gmail.com

> >  }
> >  
> >  #ifdef CONFIG_PM

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v14 08/44] arm64: RMI: Ensure that the RMM has GPT entries for memory
From: Gavin Shan @ 2026-05-21  0:58 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-9-steven.price@arm.com>

Hi Steven,

On 5/13/26 11:17 PM, Steven Price wrote:
> The RMM maintains the state of all the granules in the system to make
> sure that the host is abiding by the rules. This state can be maintained
> at different granularity, per page (TRACKING_FINE) or per region
> (TRACKING_COARSE). The region size depends on the underlying
> "RMI_GRANULE_SIZE". For a "coarse" region all pages in the region must
> be of the same state, this implies we need to have "fine" tracking for
> DRAM, so that we can delegate individual pages.
> 
> For now we only support a statically carved out memory for tracking
> granules for the "fine" regions. This can be extended in the future to
> allow modifying the tracking granularity and remove the need for a
> static allocation.
> 
> Similarly, the firmware may create L0 GPT entries describing the total
> address space. But if we change the "PAS" (Physical Address Space) of a
> granule then the firmware may need to create L1 tables to track the PAS
> at a finer granularity.
> 
> Note: support is currently missing for SROs which means that if the RMM
> needs memory donating this will fail (and render CCA unusable in Linux).
> This effectively means that the L1 GPT tables must be created before
> Linux starts.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v13:
>   * Moved out of KVM
> ---
>   arch/arm64/include/asm/rmi_cmds.h |   2 +
>   arch/arm64/kernel/rmi.c           | 103 ++++++++++++++++++++++++++++++
>   2 files changed, 105 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
> index 9179934925c5..9078a2920a7c 100644
> --- a/arch/arm64/include/asm/rmi_cmds.h
> +++ b/arch/arm64/include/asm/rmi_cmds.h
> @@ -33,6 +33,8 @@ struct rmi_sro_state {
>   } while (RMI_RETURN_STATUS(res.a0) == RMI_BUSY ||			\
>   	 RMI_RETURN_STATUS(res.a0) == RMI_BLOCKED)
>   
> +bool rmi_is_available(void);
> +
>   unsigned long rmi_sro_execute(struct rmi_sro_state *sro, gfp_t gfp);
>   void rmi_sro_free(struct rmi_sro_state *sro);
>   
> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
> index a14ead5dedda..52a415e99500 100644
> --- a/arch/arm64/kernel/rmi.c
> +++ b/arch/arm64/kernel/rmi.c
> @@ -7,6 +7,8 @@
>   
>   #include <asm/rmi_cmds.h>
>   
> +static bool arm64_rmi_is_available;
> +
>   unsigned long rmm_feat_reg0;
>   unsigned long rmm_feat_reg1;
>   
> @@ -88,6 +90,102 @@ static int rmi_configure(void)
>   	return 0;
>   }
>   
> +/*
> + * For now we set the tracking_region_size to 0 for RMI_RMM_CONFIG_SET().
> + * TODO: Support other tracking sizes (via Kconfig option).
> + */
> +#ifdef CONFIG_PAGE_SIZE_4KB
> +#define RMM_GRANULE_TRACKING_SIZE	SZ_1G
> +#elif defined(CONFIG_PAGE_SIZE_16KB)
> +#define RMM_GRANULE_TRACKING_SIZE	SZ_32M
> +#elif defined(CONFIG_PAGE_SIZE_64KB)
> +#define RMM_GRANULE_TRACKING_SIZE	SZ_512M
> +#endif
> +

RMM_GRANULE_TRACKING_SIZE is never used in this series.

> +/*
> + * Make sure the area is tracked by RMM at FINE granularity.
> + * We do not support changing the tracking yet.
> + */
> +static int rmi_verify_memory_tracking(phys_addr_t start, phys_addr_t end)
> +{
> +	while (start < end) {
> +		unsigned long ret, category, state, next;
> +
> +		ret = rmi_granule_tracking_get(start, end, &category, &state, &next);
> +		if (ret != RMI_SUCCESS ||
> +		    state != RMI_TRACKING_FINE ||
> +		    category != RMI_MEM_CATEGORY_CONVENTIONAL) {
> +			/* TODO: Set granule tracking in this case */
> +			pr_err("Granule tracking for region isn't fine/conventional: %llx",
> +			       start);
> +			return -ENODEV;
> +		}
> +		start = next;
> +	}
> +
> +	return 0;
> +}
> +
> +static unsigned long rmi_l0gpt_size(void)
> +{
> +	return 1UL << (30 + FIELD_GET(RMI_FEATURE_REGISTER_1_L0GPTSZ,
> +				      rmm_feat_reg1));
> +}
> +

rmi_l0gpt_size() is only used by rmi_create_gpts(), so its logic can be
folded into that function.

> +static int rmi_create_gpts(phys_addr_t start, phys_addr_t end)
> +{
> +	unsigned long l0gpt_sz = rmi_l0gpt_size();
> +
> +	start = ALIGN_DOWN(start, l0gpt_sz);
> +	end = ALIGN(end, l0gpt_sz);
> +
> +	while (start < end) {
> +		int ret = rmi_gpt_l1_create(start);
> +
> +		/*
> +		 * Make sure the L1 GPT tables are created for the region.
> +		 * RMI_ERROR_GPT indicates the L1 table already exists.
> +		 */
> +		if (ret && ret != RMI_ERROR_GPT) {
> +			/*
> +			 * FIXME: Handle SRO so that memory can be donated for
> +			 * the tables.
> +			 */
> +			pr_err("GPT Level1 table missing for %llx\n", start);
> +			return -ENOMEM;
> +		}
> +		start += l0gpt_sz;
> +	}
> +
> +	return 0;
> +}
> +
> +static int rmi_init_metadata(void)
> +{
> +	phys_addr_t start, end;
> +	const struct memblock_region *r;
> +
> +	for_each_mem_region(r) {
> +		int ret;
> +
> +		start = memblock_region_memory_base_pfn(r) << PAGE_SHIFT;
> +		end = memblock_region_memory_end_pfn(r) << PAGE_SHIFT;
> +		ret = rmi_verify_memory_tracking(start, end);
> +		if (ret)
> +			return ret;
> +		ret = rmi_create_gpts(start, end);
> +		if (ret)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
> +bool rmi_is_available(void)
> +{
> +	return arm64_rmi_is_available;
> +}
> +
>   static int __init arm64_init_rmi(void)
>   {
>   	/* Continue without realm support if we can't agree on a version */
> @@ -101,6 +199,11 @@ static int __init arm64_init_rmi(void)
>   
>   	if (rmi_configure())
>   		return 0;
> +	if (rmi_init_metadata())
> +		return 0;
> +
> +	arm64_rmi_is_available = true;
> +	pr_info("RMI configured");
>   
>   	return 0;
>   }

Thanks,
Gavin



^ permalink raw reply

* Re: [PATCH] media: rkvdec: hevc: cap EXT SPS RPS control counts before descriptor assembly
From: Michael Bommarito @ 2026-05-21  0:57 UTC (permalink / raw)
  To: Detlev Casanova
  Cc: Ezequiel Garcia, Mauro Carvalho Chehab, Heiko Stuebner,
	linux-media, linux-rockchip, linux-arm-kernel, linux-kernel
In-Reply-To: <67bd72ba-6dab-4bdb-a391-27545e287e94@collabora.com>

On Tue, May 19, 2026 at 9:04 AM Detlev Casanova
<detlev.casanova@collabora.com> wrote:
> Still, did you try just changing the cap to 64 (.cfg.dims = { 64 },) ?
> You'd need a test that sets the control from userspace though.
>
> It should refuse setting the control if there are more than 64 elements,
> therefore the hevc decoder will not run any function using the count
> values from the SPS (See  rkvdec-vdpu381-hevc.c:601)

Sure, I can test that and send a v2 for ST.  My understanding is that
we have four spots we need to check across the flow though:

1. ST count > 64
2. LT count > 32
3. num_negative_pics / num_positive_pics > 16
4. delta_idx_minus1 + 1 > i

So would you also want the same .cfg approach for the LT cap?

Thanks,
Mike


^ permalink raw reply

* Re: [PATCH v14 07/44] arm64: RMI: Configure the RMM with the host's page size
From: Gavin Shan @ 2026-05-21  0:51 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-8-steven.price@arm.com>

Hi Steven,

On 5/13/26 11:17 PM, Steven Price wrote:
> RMM v2.0 brings the ability to set the RMM's granule size. Check the
> feature registers and configure the RMM so that it matches the host's
> page size. This means that operations can be done with a granularity
> equal to PAGE_SIZE.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v13:
>   * Moved out of KVM.
> ---
>   arch/arm64/kernel/rmi.c | 42 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 42 insertions(+)
> 
> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
> index 99c1ccc35c11..a14ead5dedda 100644
> --- a/arch/arm64/kernel/rmi.c
> +++ b/arch/arm64/kernel/rmi.c
> @@ -49,6 +49,45 @@ static int rmi_check_version(void)
>   	return 0;
>   }
>   
> +static int rmi_configure(void)
> +{
> +	struct rmm_config *config __free(free_page) = NULL;
> +	unsigned long ret;
> +
> +	config = (struct rmm_config *)get_zeroed_page(GFP_KERNEL);
> +	if (!config)
> +		return -ENOMEM;
> +
> +	switch (PAGE_SIZE) {
> +	case SZ_4K:
> +		config->rmi_granule_size = RMI_GRANULE_SIZE_4KB;
> +		break;
> +	case SZ_16K:
> +		config->rmi_granule_size = RMI_GRANULE_SIZE_16KB;
> +		break;
> +	case SZ_64K:
> +		config->rmi_granule_size = RMI_GRANULE_SIZE_64KB;
> +		break;
> +	default:
> +		pr_err("Unsupported PAGE_SIZE for RMM\n");
> +		return -EINVAL;
> +	}
> +
> +	ret = rmi_rmm_config_set(virt_to_phys(config));
> +	if (ret) {
> +		pr_err("RMM config set failed\n");
> +		return -EINVAL;
> +	}
> +

Looking at the 'topics/rmm-v2.0-poc_2' branch of the RMM implementation, the
granule size is currently fixed at 4KB. I'm not sure whether I have looked at
the correct RMM implementation, but 'topics/rmm-v2.0-poc_2' is the one
recommended in the cover letter.

Besides, the handler of this RMI command checks that
struct rmm_config::tracking_region_size is 1GB, which is indicated by zero. It may
be worth setting it before the call to rmi_rmm_config_set().

	config->tracking_region_size = 0; /* 1GB */
	ret = rmi_rmm_config_set(virt_to_phys(config));


> +	ret = rmi_rmm_activate();
> +	if (ret) {
> +		pr_err("RMM activate failed\n");
> +		return -ENXIO;
> +	}
> +
> +	return 0;
> +}
> +
>   static int __init arm64_init_rmi(void)
>   {
>   	/* Continue without realm support if we can't agree on a version */
> @@ -60,6 +99,9 @@ static int __init arm64_init_rmi(void)
>   	if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
>   		return 0;
>   
> +	if (rmi_configure())
> +		return 0;
> +
>   	return 0;
>   }
>   subsys_initcall(arm64_init_rmi);

Thanks,
Gavin



^ permalink raw reply

* Re: [PATCH v2] media: rkvdec: fix PM runtime teardown ordering in remove
From: Nicolas Dufresne @ 2026-05-21  0:51 UTC (permalink / raw)
  To: Francesco Saverio Pavone, jonas, detlev.casanova, hverkuil,
	mchehab
  Cc: ezequiel, heiko, stable, linux-media, linux-rockchip,
	linux-arm-kernel, linux-kernel
In-Reply-To: <20260518145414.64514-1-pavone.lawyer@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3054 bytes --]

Le lundi 18 mai 2026 à 16:54 +0200, Francesco Saverio Pavone a écrit :
> From: Jonas Karlman <jonas@kwiboo.se>
> 
> The current remove() path calls rkvdec_v4l2_cleanup() and
> pm_runtime_disable() before pm_runtime_dont_use_autosuspend(), and
> frees the empty IOMMU domain after that. With autosuspend still
> armed when the domain goes away, the VDPU381 can be left in a dirty
> state across module reload and suspend/resume cycles.
> 
> On RK3588 this surfaces as a VP9 inter-prediction bug: from the
> second ALTREF frame onward, motion blocks decode with U=V=0 (BT.709
> green), while intra and static blocks stay correct. Reordering the
> teardown to dont_use_autosuspend() -> iommu_domain_free() ->
> pm_runtime_disable() -> v4l2_cleanup() makes the symptom go away.
> 
> Tested on a Radxa Rock 5B+ (RK3588, 8 GB LPDDR5) with both the
> libva-v4l2-request mpv pipeline and Chromium's V4L2 stateless
> decoder. With the fix, 300 random pixel samples on VP9 Profile 0
> clips at 1080p and 1440p match a libvpx software reference exactly
> (worst delta 0). Without it, the same 1080p sample at frame 4,
> pixel (960, 270) reads HW=(0,112,0) vs SW=(204,147,116). HEVC and
> H.264 stateless decoding via mpv keep running on hardware with no
> fallback.
> 
> Fixes: ff8c5622f9f7 ("media: rkvdec: Restore iommu addresses on errors")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Jonas Karlman <jonas@kwiboo.se>
> Tested-by: Francesco Saverio Pavone <pavone.lawyer@gmail.com>
> Signed-off-by: Francesco Saverio Pavone <pavone.lawyer@gmail.com>

Tested-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>

cheers,
Nicolas

> ---
> Changes in v2:
>  - Add Cc: <stable@vger.kernel.org>; media-CI flagged that the
>    Fixes: target (ff8c5622f9f7) is present in the 6.17, 6.18, 6.19
>    and 7.0 stable branches, so the fix should reach them too.
>    Link to v1:
> https://lore.kernel.org/all/20260518105413.42147-1-pavone.lawyer@gmail.com/
>    Media-CI report:
> https://linux-media.pages.freedesktop.org/-/users/patchwork/-/jobs/100124849/artifacts/report.htm
> 
>  drivers/media/platform/rockchip/rkvdec/rkvdec.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> b/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> index 6f5f0422d317..bb95b090a25b 100644
> --- a/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> +++ b/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> @@ -2066,12 +2066,13 @@ static void rkvdec_remove(struct platform_device
> *pdev)
>  
>  	cancel_delayed_work_sync(&rkvdec->watchdog_work);
>  
> -	rkvdec_v4l2_cleanup(rkvdec);
> -	pm_runtime_disable(&pdev->dev);
>  	pm_runtime_dont_use_autosuspend(&pdev->dev);
>  
>  	if (rkvdec->empty_domain)
>  		iommu_domain_free(rkvdec->empty_domain);
> +
> +	pm_runtime_disable(&pdev->dev);
> +	rkvdec_v4l2_cleanup(rkvdec);
>  }
>  
>  #ifdef CONFIG_PM

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v14 06/44] arm64: RMI: Check for RMI support at init
From: Gavin Shan @ 2026-05-21  0:39 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-7-steven.price@arm.com>

Hi Steven,

On 5/13/26 11:17 PM, Steven Price wrote:
> Query the RMI version number and check if it is a compatible version.
> The first two feature registers are read and exposed for future code to
> use.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> v14:
>   * This moves the basic RMI setup into the 'kernel' directory. This is
>     because RMI will be used for some features outside of KVM so should
>     be available even if KVM isn't compiled in.
> ---
>   arch/arm64/include/asm/rmi_cmds.h |  3 ++
>   arch/arm64/kernel/Makefile        |  2 +-
>   arch/arm64/kernel/cpufeature.c    |  1 +
>   arch/arm64/kernel/rmi.c           | 65 +++++++++++++++++++++++++++++++
>   4 files changed, 70 insertions(+), 1 deletion(-)
>   create mode 100644 arch/arm64/kernel/rmi.c
> 

[...]

> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
> new file mode 100644
> index 000000000000..99c1ccc35c11
> --- /dev/null
> +++ b/arch/arm64/kernel/rmi.c
> @@ -0,0 +1,65 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2023-2025 ARM Ltd.
> + */
> +
> +#include <linux/memblock.h>
> +
> +#include <asm/rmi_cmds.h>
> +
> +unsigned long rmm_feat_reg0;
> +unsigned long rmm_feat_reg1;
> +
> +static int rmi_check_version(void)
> +{
> +	struct arm_smccc_res res;
> +	unsigned short version_major, version_minor;
> +	unsigned long host_version = RMI_ABI_VERSION(RMI_ABI_MAJOR_VERSION,
> +						     RMI_ABI_MINOR_VERSION);
> +	unsigned long aa64pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
> +
> +	/* If RME isn't supported, then RMI can't be */
> +	if (cpuid_feature_extract_unsigned_field(aa64pfr0, ID_AA64PFR0_EL1_RME_SHIFT) == 0)
> +		return -ENXIO;
> +
> +	arm_smccc_1_1_invoke(SMC_RMI_VERSION, host_version, &res);
> +
> +	if (res.a0 == SMCCC_RET_NOT_SUPPORTED)
> +		return -ENXIO;
> +
> +	version_major = RMI_ABI_VERSION_GET_MAJOR(res.a1);
> +	version_minor = RMI_ABI_VERSION_GET_MINOR(res.a1);
> +
> +	if (res.a0 != RMI_SUCCESS) {
> +		unsigned short high_version_major, high_version_minor;
> +
> +		high_version_major = RMI_ABI_VERSION_GET_MAJOR(res.a2);
> +		high_version_minor = RMI_ABI_VERSION_GET_MINOR(res.a2);
> +
> +		pr_err("Unsupported RMI ABI (v%d.%d - v%d.%d) we want v%d.%d\n",
> +		       version_major, version_minor,
> +		       high_version_major, high_version_minor,
> +		       RMI_ABI_MAJOR_VERSION,
> +		       RMI_ABI_MINOR_VERSION);
> +		return -ENXIO;
> +	}
> +
> +	pr_info("RMI ABI version %d.%d\n", version_major, version_minor);
> +
> +	return 0;
> +}
> +
> +static int __init arm64_init_rmi(void)
> +{
> +	/* Continue without realm support if we can't agree on a version */
> +	if (rmi_check_version())
> +		return 0;

Is it still valid that we have to return zero on errors returned
from rmi_check_version() or other function calls like rmi_features()?
arm64_init_rmi() is triggered by subsys_initcall(), where the return value
needs to indicate success or failure. It's fine to return an error code from
arm64_init_rmi() in these paths.

> +
> +	if (WARN_ON(rmi_features(0, &rmm_feat_reg0)))
> +		return 0;
> +	if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
> +		return 0;
> +
> +	return 0;
> +}
> +subsys_initcall(arm64_init_rmi);

Thanks,
Gavin



^ permalink raw reply

* Re: [PATCH v14 05/44] arm64: RMI: Add wrappers for RMI calls
From: Gavin Shan @ 2026-05-21  0:21 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-6-steven.price@arm.com>

Hi Steven,

On 5/13/26 11:17 PM, Steven Price wrote:
> The wrappers make the call sites easier to read and deal with the
> boiler plate of handling the error codes from the RMM.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes from v13:
>   * Update to RMM v2.0-bet1 spec including some SRO support (there are
>     still some FIXMEs where SRO support is incomplete).
> Changes from v12:
>   * Update to RMM v2.0 specification
> Changes from v8:
>   * Switch from arm_smccc_1_2_smc() to arm_smccc_1_2_invoke() in
>     rmi_rtt_read_entry() for consistency.
> Changes from v7:
>   * Minor renaming of parameters and updated comments
> Changes from v5:
>   * Further improve comments
> Changes from v4:
>   * Improve comments
> Changes from v2:
>   * Make output arguments optional.
>   * Mask RIPAS value rmi_rtt_read_entry()
>   * Drop unused rmi_rtt_get_phys()
> ---
>   arch/arm64/include/asm/rmi_cmds.h | 661 ++++++++++++++++++++++++++++++
>   1 file changed, 661 insertions(+)
>   create mode 100644 arch/arm64/include/asm/rmi_cmds.h
> 
> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
> new file mode 100644
> index 000000000000..04f7066894e9
> --- /dev/null
> +++ b/arch/arm64/include/asm/rmi_cmds.h
> @@ -0,0 +1,661 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2023 ARM Ltd.
> + */
> +
> +#ifndef __ASM_RMI_CMDS_H
> +#define __ASM_RMI_CMDS_H
> +
> +#include <linux/arm-smccc.h>
> +

[...]

> +
> +/**
> + * rmi_rtt_destroy() - Destroy an RTT
> + * @rd: PA of the RD
> + * @ipa: Base of the IPA range described by the RTT
> + * @level: Depth of the RTT within the tree
> + * @out_rtt: Pointer to write the PA of the RTT which was destroyed
> + * @out_top: Pointer to write the top IPA of non-live RTT entries
> + *

In most cases, the parameters are well explained in the RMM-v2.0-bet1 spec, and
I think it's nice to keep the code and the spec synchronized. The parameters of
this function are described in the RMM-v2.0-bet1 spec as below.

    @rd: PA of the RD for the target realm
    @ipa: Base of the IPA range described by the RTT
    @level: RTT level
    @out_rtt: PA of the RTT which was destroyed
    @out_top: Top IPA of non-live RTT entries, from entry at which the RTT walk terminated

> + * Destroys an RTT. The RTT must be non-live, i.e. none of the entries in the
> + * table are in ASSIGNED or TABLE state.
> + *
> + * Return: RMI return code.
> + */
> +static inline int rmi_rtt_destroy(unsigned long rd,
> +				  unsigned long ipa,
> +				  long level,
> +				  unsigned long *out_rtt,
> +				  unsigned long *out_top)
> +{
> +	struct arm_smccc_res res;
> +
> +	arm_smccc_1_1_invoke(SMC_RMI_RTT_DESTROY, rd, ipa, level, &res);
> +
> +	if (out_rtt)
> +		*out_rtt = res.a1;
> +	if (out_top)
> +		*out_top = res.a2;
> +
> +	return res.a0;
> +}
> +

[...]

Thanks,
Gavin



^ permalink raw reply

* [PATCH] pwm: imx27: Fix variable truncation in .apply()
From: Ronaldo Nunez @ 2026-05-21  0:00 UTC (permalink / raw)
  To: linux-pwm
  Cc: Uwe Kleine-König, Frank Li, Sascha Hauer,
	Pengutronix Kernel Team, Fabio Estevam, imx, linux-arm-kernel,
	linux-kernel, Ronaldo Nunez

This patch fixes a variable truncation when calculating period in
microseconds as part of the solution for the ERR051198 in .apply()
callback.

The problem was identified when reducing the duty cycle through sysfs
with enable set to 1. The condition for the ERR051198 erratum workaround
(period smaller than 2us) was always met because the tmp variable in the
.apply() callback was truncated: the product of NSEC_PER_SEC, PWMPR
(period register) and the prescaler can easily overflow u32. Declaring
tmp as u64 makes it large enough to hold the multiplication result.

Testing:
- Hardware: Udoo Neo Extended with iMX6SoloX SoC
- Tools: Verified with a logic analyzer

Signed-off-by: Ronaldo Nunez <rnunez@baylibre.com>
---
 drivers/pwm/pwm-imx27.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/pwm/pwm-imx27.c b/drivers/pwm/pwm-imx27.c
index 3d34cdc4a3a5..c8b801fcb525 100644
--- a/drivers/pwm/pwm-imx27.c
+++ b/drivers/pwm/pwm-imx27.c
@@ -200,7 +200,7 @@ static void pwm_imx27_wait_fifo_slot(struct pwm_chip *chip,
 static int pwm_imx27_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 			   const struct pwm_state *state)
 {
-	unsigned long period_cycles, duty_cycles, prescale, period_us, tmp;
+	unsigned long period_cycles, duty_cycles, prescale, period_us;
 	struct pwm_imx27_chip *imx = to_pwm_imx27_chip(chip);
 	unsigned long long c;
 	unsigned long long clkrate;
@@ -208,6 +208,7 @@ static int pwm_imx27_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 	int val;
 	int ret;
 	u32 cr;
+	u64 tmp;
 
 	clkrate = clk_get_rate(imx->clks[PWM_IMX27_PER].clk);
 	c = clkrate * state->period;
@@ -249,6 +250,11 @@ static int pwm_imx27_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 	val = readl(imx->mmio_base + MX3_PWMPR);
 	val = val >= MX3_PWMPR_MAX ? MX3_PWMPR_MAX : val;
 	cr = readl(imx->mmio_base + MX3_PWMCR);
+
+	/*
+	 * tmp stores period in nanoseconds. Result fits in u64 since
+	 * val <= 0xfffe and prescaler in [1, 0x1000].
+	 */
 	tmp = NSEC_PER_SEC * (u64)(val + 2) * MX3_PWMCR_PRESCALER_GET(cr);
 	tmp = DIV_ROUND_UP_ULL(tmp, clkrate);
 	period_us = DIV_ROUND_UP_ULL(tmp, 1000);
-- 
2.53.0



^ permalink raw reply related

* Re: [PATCH v7 17/28] media: rockchip: rga: check scaling factor
From: Nicolas Dufresne @ 2026-05-20 23:58 UTC (permalink / raw)
  To: Sven Püschel, Jacob Chen, Ezequiel Garcia,
	Mauro Carvalho Chehab, Heiko Stuebner, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Hans Verkuil
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	devicetree, kernel, sebastian.reichel, m.tretter, p.zabel
In-Reply-To: <20260521-spu-rga3-v7-17-3f33e8c7145f@pengutronix.de>

[-- Attachment #1: Type: text/plain, Size: 8851 bytes --]

Le jeudi 21 mai 2026 à 00:44 +0200, Sven Püschel a écrit :
> Check the scaling factor to avoid potential problems. This is relevant
> for the upcoming RGA3 support, as it can hang when the scaling factor
> is exceeded.
> 
> The check is done at streamon when the other side is already streaming
> to avoid incorrectly failing if the application configures the other
> side after calling streamon. As try_fmt shouldn't be state aware,
> it cannot be used to limit the format based on the scaling factor.
> Therefore the check is done just before the actual streaming would be
> started.
> 
> As the driver allows changing the rotation and selection while
> streaming, add additional checks to ensure these changes
> don't exceed the scaling factor.
> 
> Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>

Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>

> 
> ---
> Changes in v6:
> - Dropped scaling adjustment in s_fmt, as this didn't match the try_fmt
>   result (which shouldn't have it to avoid making it stateful)
> - Moved scaling check to the prepare_streaming callback instead of
>   overwriting the ioctl directly
> - Consider rotation when checking the scaling
> - Check scaling factor when adjusting rotation and selection while
>   streaming
> ---
>  drivers/media/platform/rockchip/rga/rga-buf.c | 28 ++++++++++++
>  drivers/media/platform/rockchip/rga/rga-hw.c  |  1 +
>  drivers/media/platform/rockchip/rga/rga-hw.h  |  1 +
>  drivers/media/platform/rockchip/rga/rga.c     | 63 +++++++++++++++++++++++++--
>  drivers/media/platform/rockchip/rga/rga.h     |  4 ++
>  5 files changed, 94 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/media/platform/rockchip/rga/rga-buf.c b/drivers/media/platform/rockchip/rga/rga-buf.c
> index ffc6162b2e681..dcaba66f5c1fc 100644
> --- a/drivers/media/platform/rockchip/rga/rga-buf.c
> +++ b/drivers/media/platform/rockchip/rga/rga-buf.c
> @@ -197,6 +197,33 @@ static void rga_buf_return_buffers(struct vb2_queue *q,
>  	}
>  }
>  
> +static int rga_buf_prepare_streaming(struct vb2_queue *q)
> +{
> +	struct rga_ctx *ctx = vb2_get_drv_priv(q);
> +	const struct rga_hw *hw = ctx->rga->hw;
> +	int ret;
> +
> +	/* It's safe to check the streaming state of the other queue,
> +	 * as the streamon ioctl's can't race due to the lock set in
> +	 * the queue_init function.
> +	 */
> +	if ((V4L2_TYPE_IS_OUTPUT(q->type) &&
> +	     vb2_is_streaming(v4l2_m2m_get_dst_vq(ctx->fh.m2m_ctx))) ||
> +	    (V4L2_TYPE_IS_CAPTURE(q->type) &&
> +	     vb2_is_streaming(v4l2_m2m_get_src_vq(ctx->fh.m2m_ctx)))) {
> +		/*
> +		 * As the other side is already streaming,
> +		 * check that the max scaling factor isn't exceeded.
> +		 */
> +		ret = rga_check_scaling(hw, &ctx->in.crop, &ctx->out.crop,
> +					ctx->rotate);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
>  static int rga_buf_start_streaming(struct vb2_queue *q, unsigned int count)
>  {
>  	struct rga_ctx *ctx = vb2_get_drv_priv(q);
> @@ -232,6 +259,7 @@ const struct vb2_ops rga_qops = {
>  	.buf_prepare = rga_buf_prepare,
>  	.buf_queue = rga_buf_queue,
>  	.buf_cleanup = rga_buf_cleanup,
> +	.prepare_streaming = rga_buf_prepare_streaming,
>  	.start_streaming = rga_buf_start_streaming,
>  	.stop_streaming = rga_buf_stop_streaming,
>  };
> diff --git a/drivers/media/platform/rockchip/rga/rga-hw.c b/drivers/media/platform/rockchip/rga/rga-hw.c
> index 567d39e58d33f..f2900812ba76f 100644
> --- a/drivers/media/platform/rockchip/rga/rga-hw.c
> +++ b/drivers/media/platform/rockchip/rga/rga-hw.c
> @@ -584,6 +584,7 @@ const struct rga_hw rga2_hw = {
>  	.max_width = MAX_WIDTH,
>  	.min_height = MIN_HEIGHT,
>  	.max_height = MAX_HEIGHT,
> +	.max_scaling_factor = MAX_SCALING_FACTOR,
>  	.stride_alignment = 4,
>  
>  	.setup_cmdbuf = rga_hw_setup_cmdbuf,
> diff --git a/drivers/media/platform/rockchip/rga/rga-hw.h b/drivers/media/platform/rockchip/rga/rga-hw.h
> index c2e34be751939..805ec23e5e3f4 100644
> --- a/drivers/media/platform/rockchip/rga/rga-hw.h
> +++ b/drivers/media/platform/rockchip/rga/rga-hw.h
> @@ -14,6 +14,7 @@
>  
>  #define MIN_WIDTH 34
>  #define MIN_HEIGHT 34
> +#define MAX_SCALING_FACTOR 16
>  
>  #define RGA_TIMEOUT 500
>  
> diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
> index 394b14b9469df..22954bbae55fc 100644
> --- a/drivers/media/platform/rockchip/rga/rga.c
> +++ b/drivers/media/platform/rockchip/rga/rga.c
> @@ -127,7 +127,9 @@ static int rga_s_ctrl(struct v4l2_ctrl *ctrl)
>  {
>  	struct rga_ctx *ctx = container_of(ctrl->handler, struct rga_ctx,
>  					   ctrl_handler);
> +	const struct rga_hw *hw = ctx->rga->hw;
>  	unsigned long flags;
> +	int ret = 0;
>  
>  	spin_lock_irqsave(&ctx->rga->ctrl_lock, flags);
>  	switch (ctrl->id) {
> @@ -138,6 +140,13 @@ static int rga_s_ctrl(struct v4l2_ctrl *ctrl)
>  		ctx->vflip = ctrl->val;
>  		break;
>  	case V4L2_CID_ROTATE:
> +		if (vb2_is_streaming(v4l2_m2m_get_dst_vq(ctx->fh.m2m_ctx)) &&
> +		    vb2_is_streaming(v4l2_m2m_get_src_vq(ctx->fh.m2m_ctx))) {
> +			ret = rga_check_scaling(hw, &ctx->in.crop,
> +						&ctx->out.crop, ctrl->val);
> +			if (ret < 0)
> +				goto s_ctrl_done;
> +		}
>  		ctx->rotate = ctrl->val;
>  		break;
>  	case V4L2_CID_BG_COLOR:
> @@ -145,8 +154,10 @@ static int rga_s_ctrl(struct v4l2_ctrl *ctrl)
>  		break;
>  	}
>  	ctx->cmdbuf_dirty = true;
> +
> +s_ctrl_done:
>  	spin_unlock_irqrestore(&ctx->rga->ctrl_lock, flags);
> -	return 0;
> +	return ret;
>  }
>  
>  static const struct v4l2_ctrl_ops rga_ctrl_ops = {
> @@ -182,6 +193,38 @@ static int rga_setup_ctrls(struct rga_ctx *ctx)
>  	return 0;
>  }
>  
> +static bool check_scaling_factor(const struct rga_hw *hw, u32 src_size,
> +				 u32 dst_size)
> +{
> +	if (src_size < dst_size)
> +		return src_size * hw->max_scaling_factor >= dst_size;
> +	else
> +		return dst_size * hw->max_scaling_factor >= src_size;
> +}
> +
> +int rga_check_scaling(const struct rga_hw *hw, const struct v4l2_rect *crop_in,
> +		      const struct v4l2_rect *crop_out, u32 rotate)
> +{
> +	u32 scaled_width;
> +	u32 scaled_height;
> +
> +	if (rotate == 90 || rotate == 270) {
> +		scaled_width = crop_out->height;
> +		scaled_height = crop_out->width;
> +	} else {
> +		scaled_width = crop_out->width;
> +		scaled_height = crop_out->height;
> +	}
> +
> +	if (!check_scaling_factor(hw, crop_in->width, scaled_width))
> +		return -EINVAL;
> +
> +	if (!check_scaling_factor(hw, crop_in->height, scaled_height))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static struct rga_fmt *rga_fmt_find(struct rockchip_rga *rga, u32 pixelformat)
>  {
>  	unsigned int i;
> @@ -525,7 +568,6 @@ static int vidioc_s_selection(struct file *file, void *priv,
>  	struct rga_ctx *ctx = file_to_rga_ctx(file);
>  	struct rockchip_rga *rga = ctx->rga;
>  	struct rga_frame *f;
> -	int ret = 0;
>  
>  	f = rga_get_frame(ctx, s->type);
>  	if (IS_ERR(f))
> @@ -569,10 +611,25 @@ static int vidioc_s_selection(struct file *file, void *priv,
>  		return -EINVAL;
>  	}
>  
> +	if (vb2_is_streaming(v4l2_m2m_get_dst_vq(ctx->fh.m2m_ctx)) &&
> +	    vb2_is_streaming(v4l2_m2m_get_src_vq(ctx->fh.m2m_ctx))) {
> +		int ret = 0;
> +
> +		if (V4L2_TYPE_IS_OUTPUT(s->type))
> +			ret = rga_check_scaling(rga->hw, &s->r, &ctx->out.crop,
> +						ctx->rotate);
> +		else
> +			ret = rga_check_scaling(rga->hw, &ctx->in.crop, &s->r,
> +						ctx->rotate);
> +
> +		if (ret < 0)
> +			return ret;
> +	}
> +
>  	f->crop = s->r;
>  	ctx->cmdbuf_dirty = true;
>  
> -	return ret;
> +	return 0;
>  }
>  
>  static const struct v4l2_ioctl_ops rga_ioctl_ops = {
> diff --git a/drivers/media/platform/rockchip/rga/rga.h b/drivers/media/platform/rockchip/rga/rga.h
> index 5360f092fecf0..df525c6aea8b6 100644
> --- a/drivers/media/platform/rockchip/rga/rga.h
> +++ b/drivers/media/platform/rockchip/rga/rga.h
> @@ -123,6 +123,9 @@ static inline struct rga_vb_buffer *vb_to_rga(struct vb2_v4l2_buffer *vb)
>  
>  struct rga_frame *rga_get_frame(struct rga_ctx *ctx, enum v4l2_buf_type type);
>  
> +int rga_check_scaling(const struct rga_hw *hw, const struct v4l2_rect *crop_in,
> +		      const struct v4l2_rect *crop_out, u32 rotate);
> +
>  /* RGA Buffers Manage */
>  extern const struct vb2_ops rga_qops;
>  
> @@ -151,6 +154,7 @@ struct rga_hw {
>  	size_t cmdbuf_size;
>  	u32 min_width, min_height;
>  	u32 max_width, max_height;
> +	u8 max_scaling_factor;
>  	u8 stride_alignment;
>  
>  	void (*setup_cmdbuf)(struct rga_ctx *ctx);

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v7 16/28] media: rockchip: rga: reuse cmdbuf contents
From: Nicolas Dufresne @ 2026-05-20 23:55 UTC (permalink / raw)
  To: Sven Püschel, Jacob Chen, Ezequiel Garcia,
	Mauro Carvalho Chehab, Heiko Stuebner, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Hans Verkuil
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	devicetree, kernel, sebastian.reichel, m.tretter, p.zabel
In-Reply-To: <20260521-spu-rga3-v7-16-3f33e8c7145f@pengutronix.de>

[-- Attachment #1: Type: text/plain, Size: 5751 bytes --]

Le jeudi 21 mai 2026 à 00:44 +0200, Sven Püschel a écrit :
> Reuse the command buffer contents instead of completely writing it
> for every frame. Therefore we only need to replace the source and
> destination addresses for each frame. This reduces the amount of CPU
> and memory operations done in each frame. A new cmdbuf_dirty flag notes
> if the cmdbuf has to be rewritten on the next frame.
> 
> The initial idea of initializing the cmdbuf on streamon broke the
> ability to update controls while streaming (e.g. mirroring).
> 
> Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>

Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>

> 
> ---
> Changes in v6:
> - Reworked the commit to not setup the cmdbuf on streamon but rather
>   re-initialize it on the next frame when something changed.
> - Sashiko flagged the cmdbuf setup at streamon:
>   https://sashiko.dev/#/patchset/20260428-spu-rga3-v5-0-eb7f5d019d86%40pengutronix.de?part=17
> - Dropped Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
>   due to the reworked patch and commit message contents
> 
> Changes in v5:
> - Don't set the flipping and rotation values at streamon and preventing
>   the userspace from changing them at runtime
> ---
>  drivers/media/platform/rockchip/rga/rga-hw.c | 13 +++++++++----
>  drivers/media/platform/rockchip/rga/rga.c    | 11 +++++++++--
>  drivers/media/platform/rockchip/rga/rga.h    |  2 ++
>  3 files changed, 20 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/media/platform/rockchip/rga/rga-hw.c b/drivers/media/platform/rockchip/rga/rga-hw.c
> index dac3cb6aa17d3..567d39e58d33f 100644
> --- a/drivers/media/platform/rockchip/rga/rga-hw.c
> +++ b/drivers/media/platform/rockchip/rga/rga-hw.c
> @@ -417,8 +417,6 @@ static void rga_cmd_set(struct rga_ctx *ctx,
>  {
>  	struct rockchip_rga *rga = ctx->rga;
>  
> -	memset(ctx->cmdbuf_virt, 0, RGA_CMDBUF_SIZE);
> -
>  	rga_cmd_set_src_addr(ctx, src->dma_desc_pa);
>  	/*
>  	 * Due to hardware bug,
> @@ -427,11 +425,9 @@ static void rga_cmd_set(struct rga_ctx *ctx,
>  	rga_cmd_set_src1_addr(ctx, dst->dma_desc_pa);
>  
>  	rga_cmd_set_dst_addr(ctx, dst->dma_desc_pa);
> -	rga_cmd_set_mode(ctx);
>  
>  	rga_cmd_set_src_info(ctx, &src->offset);
>  	rga_cmd_set_dst_info(ctx, &dst->offset);
> -	rga_cmd_set_trans_info(ctx);
>  
>  	rga_write(rga, RGA_CMD_BASE, ctx->cmdbuf_phy);
>  
> @@ -440,6 +436,14 @@ static void rga_cmd_set(struct rga_ctx *ctx,
>  				   PAGE_SIZE, DMA_BIDIRECTIONAL);
>  }
>  
> +static void rga_hw_setup_cmdbuf(struct rga_ctx *ctx)
> +{
> +	memset(ctx->cmdbuf_virt, 0, RGA_CMDBUF_SIZE);
> +
> +	rga_cmd_set_mode(ctx);
> +	rga_cmd_set_trans_info(ctx);
> +}
> +
>  static void rga_hw_start(struct rockchip_rga *rga,
>  			 struct rga_vb_buffer *src,  struct rga_vb_buffer *dst)
>  {
> @@ -582,6 +586,7 @@ const struct rga_hw rga2_hw = {
>  	.max_height = MAX_HEIGHT,
>  	.stride_alignment = 4,
>  
> +	.setup_cmdbuf = rga_hw_setup_cmdbuf,
>  	.start = rga_hw_start,
>  	.handle_irq = rga_handle_irq,
>  	.get_version = rga_get_version,
> diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
> index d080cb672740b..394b14b9469df 100644
> --- a/drivers/media/platform/rockchip/rga/rga.c
> +++ b/drivers/media/platform/rockchip/rga/rga.c
> @@ -38,6 +38,11 @@ static void device_run(void *prv)
>  	unsigned long flags;
>  
>  	spin_lock_irqsave(&rga->ctrl_lock, flags);
> +	if (ctx->cmdbuf_dirty) {
> +		ctx->cmdbuf_dirty = false;
> +		rga->hw->setup_cmdbuf(ctx);
> +	}
> +	spin_unlock_irqrestore(&rga->ctrl_lock, flags);
>  
>  	rga->curr = ctx;
>  
> @@ -47,8 +52,6 @@ static void device_run(void *prv)
>  	dst = v4l2_m2m_next_dst_buf(ctx->fh.m2m_ctx);
>  
>  	rga->hw->start(rga, vb_to_rga(src), vb_to_rga(dst));
> -
> -	spin_unlock_irqrestore(&rga->ctrl_lock, flags);
>  }
>  
>  static irqreturn_t rga_isr(int irq, void *prv)
> @@ -141,6 +144,7 @@ static int rga_s_ctrl(struct v4l2_ctrl *ctrl)
>  		ctx->fill_color = ctrl->val;
>  		break;
>  	}
> +	ctx->cmdbuf_dirty = true;
>  	spin_unlock_irqrestore(&ctx->rga->ctrl_lock, flags);
>  	return 0;
>  }
> @@ -228,6 +232,7 @@ static int rga_open(struct file *file)
>  		ret = -ENOMEM;
>  		goto rel_ctx;
>  	}
> +	ctx->cmdbuf_dirty = true;
>  
>  	ctx->rga = rga;
>  	/* Set default formats */
> @@ -448,6 +453,7 @@ static int vidioc_s_fmt(struct file *file, void *priv, struct v4l2_format *f)
>  	frm->crop.height = pix_fmt->height;
>  
>  	frm->pix = *pix_fmt;
> +	ctx->cmdbuf_dirty = true;
>  
>  	v4l2_dbg(debug, 1, &rga->v4l2_dev,
>  		 "[%s] fmt - %p4cc %dx%d (stride %d, sizeimage %d)\n",
> @@ -564,6 +570,7 @@ static int vidioc_s_selection(struct file *file, void *priv,
>  	}
>  
>  	f->crop = s->r;
> +	ctx->cmdbuf_dirty = true;
>  
>  	return ret;
>  }
> diff --git a/drivers/media/platform/rockchip/rga/rga.h b/drivers/media/platform/rockchip/rga/rga.h
> index 38518146910a6..5360f092fecf0 100644
> --- a/drivers/media/platform/rockchip/rga/rga.h
> +++ b/drivers/media/platform/rockchip/rga/rga.h
> @@ -55,6 +55,7 @@ struct rga_ctx {
>  
>  	void *cmdbuf_virt;
>  	dma_addr_t cmdbuf_phy;
> +	bool cmdbuf_dirty;
>  
>  	int osequence;
>  	int csequence;
> @@ -152,6 +153,7 @@ struct rga_hw {
>  	u32 max_width, max_height;
>  	u8 stride_alignment;
>  
> +	void (*setup_cmdbuf)(struct rga_ctx *ctx);
>  	void (*start)(struct rockchip_rga *rga,
>  		      struct rga_vb_buffer *src, struct rga_vb_buffer *dst);
>  	bool (*handle_irq)(struct rockchip_rga *rga);

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

Build a per-scheduler sub-allocator on top of pages claimed from the BPF
arena registered in the previous patch. Subsequent kernel-managed
arena-resident structures (e.g. per-CPU set_cmask cmask) carve their storage
from this pool.

scx_arena_pool_init() creates a gen_pool. scx_arena_alloc() returns the
kernel VA. On exhaustion, the pool grows by claiming more pages via
bpf_arena_alloc_pages_sleepable(). Chunks are added at the kernel-side
mapping address; callers translate to the BPF-arena form themselves if
needed.

Allocations sleep (GFP_KERNEL) - they may grow the pool through vzalloc and
arena page allocation. All current consumers run from the enable path (after
ops.init() and the kernel-side arena auto-discovery, before validate_ops()),
where sleeping is fine.

scx_arena_pool_destroy() walks each chunk, returns outstanding ranges to the
gen_pool with gen_pool_free() and then calls gen_pool_destroy(). The
underlying arena pages are released when the arena map itself is torn down,
so the pool destroy doesn't free them explicitly.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/build_policy.c |   4 ++
 kernel/sched/ext.c          |  11 ++++
 kernel/sched/ext_arena.c    | 127 ++++++++++++++++++++++++++++++++++++
 kernel/sched/ext_arena.h    |  18 +++++
 kernel/sched/ext_internal.h |   5 ++
 5 files changed, 165 insertions(+)
 create mode 100644 kernel/sched/ext_arena.c
 create mode 100644 kernel/sched/ext_arena.h

diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 5e76c9177d54..067979a7b69e 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -59,12 +59,16 @@
 
 #ifdef CONFIG_SCHED_CLASS_EXT
 # include <linux/btf_ids.h>
+# include <linux/find.h>
+# include <linux/genalloc.h>
 # include "ext_types.h"
 # include "ext_internal.h"
 # include "ext_cid.h"
+# include "ext_arena.h"
 # include "ext_idle.h"
 # include "ext.c"
 # include "ext_cid.c"
+# include "ext_arena.c"
 # include "ext_idle.c"
 #endif
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 56f94ac32ba0..fb91079c1244 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5003,6 +5003,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
+	scx_arena_pool_destroy(sch);
 	if (sch->arena_map)
 		bpf_map_put(sch->arena_map);
 	kfree(sch);
@@ -7155,6 +7156,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
 	}
 
+	ret = scx_arena_pool_init(sch);
+	if (ret) {
+		cpus_read_unlock();
+		goto err_disable;
+	}
+
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
 		if (((void (**)(void))ops)[i])
 			set_bit(i, sch->has_op);
@@ -7473,6 +7480,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
 	}
 
+	ret = scx_arena_pool_init(sch);
+	if (ret)
+		goto err_disable;
+
 	if (validate_ops(sch, ops))
 		goto err_disable;
 
diff --git a/kernel/sched/ext_arena.c b/kernel/sched/ext_arena.c
new file mode 100644
index 000000000000..53174033765d
--- /dev/null
+++ b/kernel/sched/ext_arena.c
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * scx_arena_pool: kernel-side sub-allocator over BPF-arena pages.
+ *
+ * Each chunk added to @sch->arena_pool comes from one
+ * bpf_arena_alloc_pages_sleepable() call and is registered at the
+ * kernel-side mapping address. Callers translate to the BPF-arena form
+ * themselves if needed.
+ *
+ * Allocations grow the pool on demand. Underlying arena pages are released
+ * when the arena map itself is torn down.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+
+enum scx_arena_consts {
+	SCX_ARENA_MIN_ORDER		= 3,	/* 8-byte minimum sub-allocation */
+	SCX_ARENA_GROW_PAGES		= 4,	/* per growth */
+};
+
+s32 scx_arena_pool_init(struct scx_sched *sch)
+{
+	if (!sch->arena_map)
+		return 0;
+
+	sch->arena_pool = gen_pool_create(SCX_ARENA_MIN_ORDER, NUMA_NO_NODE);
+	if (!sch->arena_pool)
+		return -ENOMEM;
+	return 0;
+}
+
+static void scx_arena_clear_chunk(struct gen_pool *pool, struct gen_pool_chunk *chunk,
+				  void *data)
+{
+	int order = pool->min_alloc_order;
+	size_t chunk_sz = chunk->end_addr - chunk->start_addr + 1;
+	unsigned long end_bit = chunk_sz >> order;
+	unsigned long b, e;
+
+	for_each_set_bitrange(b, e, chunk->bits, end_bit)
+		gen_pool_free(pool, chunk->start_addr + (b << order),
+			      (e - b) << order);
+}
+
+/*
+ * Tear down the pool. Outstanding gen_pool allocations are freed via
+ * scx_arena_clear_chunk() so gen_pool_destroy() doesn't BUG. The underlying
+ * arena pages are released when the arena map itself is torn down.
+ */
+void scx_arena_pool_destroy(struct scx_sched *sch)
+{
+	if (!sch->arena_pool)
+		return;
+	gen_pool_for_each_chunk(sch->arena_pool, scx_arena_clear_chunk, NULL);
+	gen_pool_destroy(sch->arena_pool);
+	sch->arena_pool = NULL;
+}
+
+/*
+ * Grow the pool by @page_cnt pages. bpf_arena_alloc_pages_sleepable() and
+ * gen_pool_add() (which calls vzalloc(GFP_KERNEL)) require a sleepable
+ * context.
+ */
+static int scx_arena_grow(struct scx_sched *sch, u32 page_cnt)
+{
+	u64 kern_vm_start;
+	u32 uaddr32;
+	void *p;
+	int ret;
+
+	if (!sch->arena_map || !sch->arena_pool)
+		return -EINVAL;
+
+	p = bpf_arena_alloc_pages_sleepable(sch->arena_map, NULL,
+					    page_cnt, NUMA_NO_NODE, 0);
+	if (!p)
+		return -ENOMEM;
+
+	uaddr32 = (u32)(unsigned long)p;
+	kern_vm_start = bpf_arena_map_kern_vm_start(sch->arena_map);
+
+	ret = gen_pool_add(sch->arena_pool, kern_vm_start + uaddr32,
+			   page_cnt * PAGE_SIZE, NUMA_NO_NODE);
+	if (ret) {
+		bpf_arena_free_pages_non_sleepable(sch->arena_map, p, page_cnt);
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * Allocate @size bytes from the arena pool. Returns kernel VA on success, NULL
+ * on failure. May grow the pool via scx_arena_grow() which sleeps. Caller must
+ * be in a GFP_KERNEL context.
+ */
+void *scx_arena_alloc(struct scx_sched *sch, size_t size)
+{
+	unsigned long kern_va;
+	u32 page_cnt;
+
+	might_sleep();
+
+	if (!sch->arena_pool)
+		return NULL;
+
+	kern_va = gen_pool_alloc(sch->arena_pool, size);
+	if (!kern_va) {
+		page_cnt = max_t(u32, SCX_ARENA_GROW_PAGES,
+				 (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+		if (scx_arena_grow(sch, page_cnt))
+			return NULL;
+		kern_va = gen_pool_alloc(sch->arena_pool, size);
+		if (!kern_va)
+			return NULL;
+	}
+
+	return (void *)kern_va;
+}
+
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size)
+{
+	if (sch->arena_pool && kern_va)
+		gen_pool_free(sch->arena_pool, (unsigned long)kern_va, size);
+}
diff --git a/kernel/sched/ext_arena.h b/kernel/sched/ext_arena.h
new file mode 100644
index 000000000000..4f3610160102
--- /dev/null
+++ b/kernel/sched/ext_arena.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_ARENA_H
+#define _KERNEL_SCHED_EXT_ARENA_H
+
+struct scx_sched;
+
+s32 scx_arena_pool_init(struct scx_sched *sch);
+void scx_arena_pool_destroy(struct scx_sched *sch);
+void *scx_arena_alloc(struct scx_sched *sch, size_t size);
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size);
+
+#endif /* _KERNEL_SCHED_EXT_ARENA_H */
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index d40cfd29ddaa..ff7e882bd67a 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1116,8 +1116,13 @@ struct scx_sched {
 	 * Arena map auto-discovered from member progs at struct_ops attach.
 	 * cid-form schedulers must use exactly one arena across all member
 	 * progs. NULL on cpu-form.
+	 *
+	 * @arena_pool sub-allocates @arena_map. Each gen_pool chunk is added
+	 * at the kernel-side mapping address. Grows on demand and pages are
+	 * not released until sched destroy.
 	 */
 	struct bpf_map		*arena_map;
+	struct gen_pool		*arena_pool;
 
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
-- 
2.54.0



^ permalink raw reply related

* [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena()
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

struct bpf_arena is opaque to callers outside arena.c. Add two helpers
for struct_ops subsystems that need to reach into an arena:

  bpf_arena_map_kern_vm_start(struct bpf_map *map)
    returns @map's kern_vm_start. A sched_ext follow-up needs this
    to translate kern_va <-> uaddr.

  bpf_prog_arena(struct bpf_prog *prog)
    returns the bpf_map of the arena referenced by @prog (NULL if
    @prog references no arena). The verifier enforces at most one
    arena per program. Used by struct_ops callers that auto-discover
    an arena from a member prog and need to take a map reference.

Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/bpf.h |  2 ++
 kernel/bpf/arena.c  | 26 ++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b99d786e98c..e1ba57c10aaa 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -618,6 +618,8 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 		      struct bpf_spin_lock *spin_lock);
 u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
 u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map);
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog);
 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
 
 struct bpf_offload_dev;
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index a811cf6170fa..51b9ae36feb6 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -84,6 +84,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
 	return arena ? arena->user_vm_start : 0;
 }
 
+/**
+ * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map *
+ * @map: a BPF_MAP_TYPE_ARENA map
+ *
+ * Return @map's kern_vm_start.
+ */
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map)
+{
+	return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map));
+}
+
+/**
+ * bpf_prog_arena - return the bpf_map of the arena referenced by @prog
+ * @prog: a loaded BPF program
+ *
+ * The verifier enforces at most one arena per program and stores it in
+ * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if
+ * @prog does not reference an arena.
+ */
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog)
+{
+	struct bpf_arena *arena = prog->aux->arena;
+
+	return arena ? &arena->map : NULL;
+}
+
 static long arena_map_peek_elem(struct bpf_map *map, void *value)
 {
 	return -EOPNOTSUPP;
-- 
2.54.0



^ permalink raw reply related

* [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
arena, so it translated cpumask -> cmask in kernel memory and passed the
result as a trusted pointer. The BPF cmask helpers all operate on arena
cmasks though, so the BPF side had to word-by-word probe-read the kernel
cmask into an arena cmask via cmask_copy_from_kernel() before any helper
could touch it. It works, but is clumsy.

With direct kernel-side arena access now in place, build the cmask in the
arena. The kernel writes to it through the kern_va side of the dual mapping;
BPF directly dereferences it via an __arena pointer like any other arena
struct.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c                    | 68 +++++++++++++++++++++++++--
 kernel/sched/ext_cid.c                | 20 +-------
 kernel/sched/ext_internal.h           | 10 +++-
 tools/sched_ext/include/scx/cid.bpf.h | 52 --------------------
 tools/sched_ext/scx_qmap.bpf.c        |  5 +-
 5 files changed, 75 insertions(+), 80 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index fb91079c1244..94562e3350c6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -621,11 +621,16 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
 		update_locked_rq(rq);
 
 	if (scx_is_cid_type()) {
-		struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch);
-
-		lockdep_assert_irqs_disabled();
-		scx_cpumask_to_cmask(cpumask, cmask);
-		sch->ops_cid.set_cmask(task, cmask);
+		struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
+		unsigned long uaddr = (unsigned long)kern_va -
+			bpf_arena_map_kern_vm_start(sch->arena_map);
+		/*
+		 * Build the per-CPU arena cmask and hand BPF the uaddr. Caller
+		 * holds the rq lock with IRQs disabled, which makes us the sole
+		 * user of the scratch area.
+		 */
+		scx_cpumask_to_cmask(cpumask, kern_va);
+		sch->ops_cid.set_cmask(task, (struct scx_cmask *)uaddr);
 	} else {
 		sch->ops.set_cpumask(task, cpumask);
 	}
@@ -4949,6 +4954,48 @@ static const struct attribute_group scx_global_attr_group = {
 static void free_pnode(struct scx_sched_pnode *pnode);
 static void free_exit_info(struct scx_exit_info *ei);
 
+static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
+{
+	size_t size = struct_size_t(struct scx_cmask, bits,
+				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
+	int cpu;
+
+	if (!sch->is_cid_type || !sch->arena_pool)
+		return 0;
+
+	sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
+	if (!sch->set_cmask_scratch)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+		*slot = scx_arena_alloc(sch, size);
+		if (!*slot)
+			return -ENOMEM;
+		scx_cmask_init(*slot, 0, num_possible_cpus());
+	}
+	return 0;
+}
+
+static void scx_set_cmask_scratch_free(struct scx_sched *sch)
+{
+	size_t size = struct_size_t(struct scx_cmask, bits,
+				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
+	int cpu;
+
+	if (!sch->set_cmask_scratch)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+		scx_arena_free(sch, *slot, size);
+	}
+	free_percpu(sch->set_cmask_scratch);
+	sch->set_cmask_scratch = NULL;
+}
+
 static void scx_sched_free_rcu_work(struct work_struct *work)
 {
 	struct rcu_work *rcu_work = to_rcu_work(work);
@@ -5003,6 +5050,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
+	scx_set_cmask_scratch_free(sch);
 	scx_arena_pool_destroy(sch);
 	if (sch->arena_map)
 		bpf_map_put(sch->arena_map);
@@ -7162,6 +7210,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 		goto err_disable;
 	}
 
+	ret = scx_set_cmask_scratch_alloc(sch);
+	if (ret) {
+		cpus_read_unlock();
+		goto err_disable;
+	}
+
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
 		if (((void (**)(void))ops)[i])
 			set_bit(i, sch->has_op);
@@ -7484,6 +7538,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 	if (ret)
 		goto err_disable;
 
+	ret = scx_set_cmask_scratch_alloc(sch);
+	if (ret)
+		goto err_disable;
+
 	if (validate_ops(sch, ops))
 		goto err_disable;
 
diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index 0c91b951fd33..808c6390da5a 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -7,14 +7,6 @@
  */
 #include <linux/cacheinfo.h>
 
-/*
- * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a
- * cmask from a cpumask. Allocated alongside the cid arrays on first enable
- * and never freed. Sized to the full cid space. Caller holds rq lock so
- * this_cpu_ptr is safe.
- */
-struct scx_cmask __percpu *scx_set_cmask_scratch;
-
 /*
  * cid tables.
  *
@@ -54,8 +46,6 @@ static s32 scx_cid_arrays_alloc(void)
 	u32 npossible = num_possible_cpus();
 	s16 *cid_to_cpu, *cpu_to_cid;
 	struct scx_cid_topo *cid_topo;
-	struct scx_cmask __percpu *set_cmask_scratch;
-	s32 cpu;
 
 	if (scx_cid_to_cpu_tbl)
 		return 0;
@@ -63,25 +53,17 @@ static s32 scx_cid_arrays_alloc(void)
 	cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
 	cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
 	cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
-	set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits,
-						       SCX_CMASK_NR_WORDS(npossible)),
-					   sizeof(u64));
 
-	if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) {
+	if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
 		kfree(cid_to_cpu);
 		kfree(cpu_to_cid);
 		kfree(cid_topo);
-		free_percpu(set_cmask_scratch);
 		return -ENOMEM;
 	}
 
 	WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
 	WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
 	WRITE_ONCE(scx_cid_topo, cid_topo);
-	for_each_possible_cpu(cpu)
-		scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
-			       0, npossible);
-	WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
 	return 0;
 }
 
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index ff7e882bd67a..9bb65367f510 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1124,6 +1124,14 @@ struct scx_sched {
 	struct bpf_map		*arena_map;
 	struct gen_pool		*arena_pool;
 
+	/*
+	 * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
+	 * to ops_cid.set_cmask(). The kernel writes through the stored kern_va;
+	 * the BPF-arena uaddr handed to BPF is recovered by subtracting the
+	 * arena's kern_vm_start.
+	 */
+	struct scx_cmask * __percpu *set_cmask_scratch;
+
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
 	/*
@@ -1480,8 +1488,6 @@ enum scx_ops_state {
 extern struct scx_sched __rcu *scx_root;
 DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
 
-extern struct scx_cmask __percpu *scx_set_cmask_scratch;
-
 /*
  * True when the currently loaded scheduler hierarchy is cid-form. All scheds
  * in a hierarchy share one form, so this single key tells callsites which
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index e281c88fa824..70f2a3829af4 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -675,56 +675,4 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
 	}
 }
 
-/**
- * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
- * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
- * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
- *
- * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
- * scx_bpf_error() on probe failure or precondition violation.
- */
-static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
-						   const struct scx_cmask *src)
-{
-	u32 base = 0, nr_cids = 0, nr_words, wi;
-
-	if (dst->base != 0) {
-		scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
-		return;
-	}
-
-	if (bpf_probe_read_kernel(&base, sizeof(base), &src->base)) {
-		scx_bpf_error("probe-read cmask->base failed");
-		return;
-	}
-	if (base != 0) {
-		scx_bpf_error("cmask_copy_from_kernel requires src->base == 0");
-		return;
-	}
-
-	if (bpf_probe_read_kernel(&nr_cids, sizeof(nr_cids), &src->nr_cids)) {
-		scx_bpf_error("probe-read cmask->nr_cids failed");
-		return;
-	}
-
-	if (nr_cids > dst->nr_cids) {
-		scx_bpf_error("src cmask nr_cids=%u exceeds dst nr_cids=%u",
-			      nr_cids, dst->nr_cids);
-		return;
-	}
-
-	nr_words = CMASK_NR_WORDS(nr_cids);
-	cmask_zero(dst);
-	bpf_for(wi, 0, CMASK_MAX_WORDS) {
-		u64 word = 0;
-		if (wi >= nr_words)
-			break;
-		if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
-			scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
-			return;
-		}
-		dst->bits[wi] = word;
-	}
-}
-
 #endif /* __SCX_CID_BPF_H */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 7e77f22674ea..8a2d6a8ebd8e 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -919,14 +919,15 @@ void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
 }
 
 void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
-		    const struct scx_cmask *cmask)
+		    const struct scx_cmask *cmask_in)
 {
+	struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
 	task_ctx_t *taskc;
 
 	taskc = lookup_task_ctx(p);
 	if (!taskc)
 		return;
-	cmask_copy_from_kernel(&taskc->cpus_allowed, cmask);
+	cmask_copy(&taskc->cpus_allowed, cmask);
 }
 
 struct monitor_timer {
-- 
2.54.0



^ permalink raw reply related

* [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

Upcoming patches will let the kernel place arena-resident scratch shared
with the BPF program (e.g. per-CPU set_cmask cmask) so the BPF side can
dereference it directly via __arena pointers, replacing the current
cmask_copy_from_kernel() probe-read loop. That requires each cid-form
scheduler to expose its arena to the kernel. Kernel-side accesses are
recovered by the per-arena scratch-page mechanism.

bpf_scx_reg_cid() walks the struct_ops member progs via
bpf_struct_ops_for_each_prog() and reads each prog's arena via
bpf_prog_arena(). The verifier enforces one arena per program, so each
member prog contributes at most one arena. All non-NULL contributions must
match and at least one member prog must use an arena. The map ref is held on
scx_sched and dropped on sched destroy. cpu-form schedulers (bpf_scx_reg)
are unchanged - no arena requirement.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c          | 56 ++++++++++++++++++++++++++++++++++++-
 kernel/sched/ext_internal.h |  8 ++++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9c458552d14f..56f94ac32ba0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5003,6 +5003,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
+	if (sch->arena_map)
+		bpf_map_put(sch->arena_map);
 	kfree(sch);
 }
 
@@ -6746,6 +6748,7 @@ struct scx_enable_cmd {
 		struct sched_ext_ops_cid	*ops_cid;
 	};
 	bool			is_cid_type;
+	struct bpf_map		*arena_map;	/* arena ref to transfer to sch */
 	int			ret;
 };
 
@@ -6913,6 +6916,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
 		return ERR_PTR(ret);
 	}
 #endif	/* CONFIG_EXT_SUB_SCHED */
+
+	/*
+	 * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so
+	 * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid
+	 * drops the ref. After this point, sch owns the ref and any cleanup
+	 * runs through scx_sched_free_rcu_work() which puts it.
+	 */
+	sch->arena_map = cmd->arena_map;
+	cmd->arena_map = NULL;
 	return sch;
 
 #ifdef CONFIG_EXT_SUB_SCHED
@@ -7898,11 +7910,53 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
 	return scx_enable(&cmd, link);
 }
 
+struct scx_arena_scan {
+	struct bpf_map	*arena;
+	int		err;
+};
+
+/*
+ * The verifier enforces one arena per BPF program, so each struct_ops
+ * member prog contributes at most one arena via bpf_prog_arena().
+ * Require all non-NULL contributions to match.
+ */
+static int scx_arena_scan_prog(struct bpf_prog *prog, void *data)
+{
+	struct scx_arena_scan *s = data;
+	struct bpf_map *arena = bpf_prog_arena(prog);
+
+	if (!arena)
+		return 0;
+	if (s->arena && s->arena != arena) {
+		s->err = -EINVAL;
+		return 1;
+	}
+	s->arena = arena;
+	return 0;
+}
+
 static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link)
 {
 	struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true };
+	struct scx_arena_scan scan = {};
+	int ret;
 
-	return scx_enable(&cmd, link);
+	bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan);
+	if (scan.err) {
+		pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n");
+		return scan.err;
+	}
+	if (!scan.arena) {
+		pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n");
+		return -EINVAL;
+	}
+
+	bpf_map_inc(scan.arena);
+	cmd.arena_map = scan.arena;
+	ret = scx_enable(&cmd, link);
+	if (cmd.arena_map)		/* not consumed by scx_alloc_and_add_sched() */
+		bpf_map_put(cmd.arena_map);
+	return ret;
 }
 
 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 7258aea94b9f..d40cfd29ddaa 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1111,6 +1111,14 @@ struct scx_sched {
 		struct sched_ext_ops_cid	ops_cid;
 	};
 	bool			is_cid_type;	/* true if registered via bpf_sched_ext_ops_cid */
+
+	/*
+	 * Arena map auto-discovered from member progs at struct_ops attach.
+	 * cid-form schedulers must use exactly one arena across all member
+	 * progs. NULL on cpu-form.
+	 */
+	struct bpf_map		*arena_map;
+
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
 	/*
-- 
2.54.0



^ permalink raw reply related

* [PATCH 2/8] bpf: Recover arena kernel faults with scratch page
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

From: Kumar Kartikeya Dwivedi <memxor@gmail.com>

BPF arena usage is becoming more prevalent, but kernel <-> BPF communication
over arena memory is awkward today. Data has to be staged through a trusted
kernel pointer with extra code and copying on the BPF side. While reads
through arena pointers can use a fault-safe helper, writes don't have a good
solution. The in-line alternative would need instruction emulation or asm
fixup labels.

Enable direct kernel-side reads and writes within GUARD_SZ / 2 of any
handed-in arena pointer, without bounds checking. A per-arena scratch page
is installed by the arch fault path into empty arena kernel PTEs - x86 from
page_fault_oops() for not-present faults, arm64 from __do_kernel_fault() for
translation faults, both after the existing exception-table and KFENCE
handling. The faulting instruction retries and the access is also reported
through the program's BPF stream, preserving error reporting.

bpf_prog_find_from_stack() resolves the current BPF program (and its arena)
from the kernel stack - no new bpf_run_ctx state is added. Recovery covers
the 4 GiB arena plus the upper half-guard (GUARD_SZ / 2). The lower
half-guard is excluded because well-behaved kfuncs only access forward from
arena pointers. The kfunc-author contract - access at most GUARD_SZ / 2 past
a handed-in pointer - is documented in Documentation/bpf/kfuncs.rst.

The install is lock-free via ptep_try_set(). If the install loses a race,
the winning installer's PTE is already valid, so the retried access
succeeds. The arena clear path uses ptep_get_and_clear() so that installer
and clearer only race through atomic accessors. flush_tlb_kernel_range() is
not called after the install: stale "not mapped" TLB entries just cause one
extra re-fault, which is cheaper than a global IPI on every install.

The scratch page exists only to keep the kernel from oopsing on an in-line
arena access. Its presence at a PTE means the BPF program has already
malfunctioned, and the violation is reported through the program's BPF
stream. The only requirement for behavior on a scratched PTE is that the
kernel doesn't crash. In particular, any user-side access through such a PTE
may segfault. The per-arena scratch page, shared by all of that arena's
scratched PTEs, is freed once during map destruction.

BPF instruction faults continue to use the existing JIT exception-table
path. This patch changes only the kernel-text fault path. No UAPI flag is
added. The new behavior is the default.

v2: Use ptep_get_and_clear() in apply_range_clear_cb(). (David)

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
---
 Documentation/bpf/kfuncs.rst |  14 +++
 arch/arm64/mm/fault.c        |  10 +-
 arch/x86/mm/fault.c          |  12 ++-
 include/linux/bpf.h          |   1 +
 include/linux/bpf_defs.h     |  11 +++
 kernel/bpf/arena.c           | 177 +++++++++++++++++++++++++++--------
 kernel/bpf/core.c            |   5 +
 7 files changed, 183 insertions(+), 47 deletions(-)
 create mode 100644 include/linux/bpf_defs.h

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 75e6c078e0e7..6d497e720998 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -462,6 +462,20 @@ In order to accommodate such requirements, the verifier will enforce strict
 PTR_TO_BTF_ID type matching if two types have the exact same name, with one
 being suffixed with ``___init``.
 
+2.8 Accessing arena memory through kfunc arguments
+--------------------------------------------------
+
+A read or write at any address inside an arena does not oops the kernel.
+Unallocated arena pages are lazily backed by a scratch page and the
+access is reported through the program's BPF stream as an error. Only
+the BPF program's correctness is affected; the kernel itself remains
+intact.
+
+The arena is followed by a ``GUARD_SZ / 2`` (32 KiB) guard region that
+is also covered by this recovery. A kfunc handed an arena pointer may
+therefore access up to ``GUARD_SZ / 2`` past it without bounds-checking
+against the arena. Larger accesses must verify the range explicitly.
+
 .. _BPF_kfunc_lifecycle_expectations:
 
 3. kfunc lifecycle expectations
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 920a8b244d59..0d58d667fcd8 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -9,6 +9,7 @@
 
 #include <linux/acpi.h>
 #include <linux/bitfield.h>
+#include <linux/bpf_defs.h>
 #include <linux/extable.h>
 #include <linux/kfence.h>
 #include <linux/signal.h>
@@ -416,9 +417,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
-		if (esr_fsc_is_translation_fault(esr) &&
-		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
-			return;
+		if (esr_fsc_is_translation_fault(esr)) {
+			if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
+				return;
+			if (bpf_arena_handle_page_fault(addr, esr & ESR_ELx_WNR, regs->pc))
+				return;
+		}
 
 		msg = "paging request";
 	}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f0e77e084482..b0f103ddbd23 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,6 +8,7 @@
 #include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
 #include <linux/memblock.h>		/* max_low_pfn			*/
+#include <linux/bpf_defs.h>		/* bpf_arena_handle_page_fault	*/
 #include <linux/kfence.h>		/* kfence_handle_page_fault	*/
 #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
@@ -688,10 +689,13 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 	if (IS_ENABLED(CONFIG_EFI))
 		efi_crash_gracefully_on_page_fault(address);
 
-	/* Only not-present faults should be handled by KFENCE. */
-	if (!(error_code & X86_PF_PROT) &&
-	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
-		return;
+	/* Only not-present faults should be handled by KFENCE or BPF arena. */
+	if (!(error_code & X86_PF_PROT)) {
+		if (kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
+			return;
+		if (bpf_arena_handle_page_fault(address, error_code & X86_PF_WRITE, regs->ip))
+			return;
+	}
 
 oops:
 	/*
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0136a108d083..831996c411cf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -6,6 +6,7 @@
 
 #include <uapi/linux/bpf.h>
 #include <uapi/linux/filter.h>
+#include <linux/bpf_defs.h>
 
 #include <crypto/sha2.h>
 #include <linux/workqueue.h>
diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h
new file mode 100644
index 000000000000..d98e033b8c0b
--- /dev/null
+++ b/include/linux/bpf_defs.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Subset of bpf.h declarations, split out so files that need only these
+ * declarations can avoid bpf.h's full include cost.
+ */
+#ifndef _LINUX_BPF_DEFS_H
+#define _LINUX_BPF_DEFS_H
+
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip);
+
+#endif /* _LINUX_BPF_DEFS_H */
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 08d008cc471e..1c0b87ecc817 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -53,6 +53,7 @@ struct bpf_arena {
 	u64 user_vm_start;
 	u64 user_vm_end;
 	struct vm_struct *kern_vm;
+	struct page *scratch_page;
 	struct range_tree rt;
 	/* protects rt */
 	rqspinlock_t spinlock;
@@ -118,6 +119,11 @@ struct apply_range_data {
 	int i;
 };
 
+struct clear_range_data {
+	struct llist_head *free_pages;
+	struct page *scratch_page;
+};
+
 static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
 {
 	struct apply_range_data *d = data;
@@ -144,33 +150,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
 	flush_cache_vmap(start, start + size);
 }
 
-static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
 {
+	struct clear_range_data *d = data;
 	pte_t old_pte;
 	struct page *page;
 
-	/* sanity check */
-	old_pte = ptep_get(pte);
+	/*
+	 * Pairs with ptep_try_set() in the kernel-fault scratch installer.
+	 * Both sides must be atomic.
+	 */
+	old_pte = ptep_get_and_clear(&init_mm, addr, pte);
 	if (pte_none(old_pte) || !pte_present(old_pte))
-		return 0; /* nothing to do */
+		return 0;
 
 	page = pte_page(old_pte);
 	if (WARN_ON_ONCE(!page))
 		return -EINVAL;
 
-	pte_clear(&init_mm, addr, pte);
+	/*
+	 * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr
+	 * scratches its PTE. A later bpf_arena_free_pages() over that range walks
+	 * here. Without the skip, scratch_page would be freed.
+	 */
+	if (page == d->scratch_page)
+		return 0;
+
+	__llist_add(&page->pcp_llist, d->free_pages);
+	return 0;
+}
 
-	/* Add page to the list so it is freed later */
-	if (free_pages)
-		__llist_add(&page->pcp_llist, free_pages);
+static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	struct page *scratch_page = data;
 
+	if (!pte_none(ptep_get(pte)))
+		return 0;
+	/*
+	 * Best-effort install. ptep_try_set() returns false only if another
+	 * installer (real allocation or concurrent fault) won the cmpxchg.
+	 * Their PTE is already valid, so the access retry succeeds.
+	 *
+	 * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just
+	 * cause one extra re-fault through this same path.
+	 */
+	ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL));
 	return 0;
 }
 
 static int populate_pgtable_except_pte(struct bpf_arena *arena)
 {
+	/* Populate intermediates for the recovery range (4 GiB + upper half-guard). */
 	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
-				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+				   SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL);
 }
 
 static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
@@ -221,22 +253,29 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 	init_irq_work(&arena->free_irq, arena_free_irq);
 	INIT_WORK(&arena->free_work, arena_free_worker);
 	bpf_map_init_from_attr(&arena->map, attr);
+
+	err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page);
+	if (err)
+		goto err_free_arena;
+
 	range_tree_init(&arena->rt);
 	err = range_tree_set(&arena->rt, 0, attr->max_entries);
-	if (err) {
-		bpf_map_area_free(arena);
-		goto err;
-	}
+	if (err)
+		goto err_free_scratch;
 	mutex_init(&arena->lock);
 	raw_res_spin_lock_init(&arena->spinlock);
 	err = populate_pgtable_except_pte(arena);
-	if (err) {
-		range_tree_destroy(&arena->rt);
-		bpf_map_area_free(arena);
-		goto err;
-	}
+	if (err)
+		goto err_destroy_rt;
 
 	return &arena->map;
+
+err_destroy_rt:
+	range_tree_destroy(&arena->rt);
+err_free_scratch:
+	__free_page(arena->scratch_page);
+err_free_arena:
+	bpf_map_area_free(arena);
 err:
 	free_vm_area(kern_vm);
 	return ERR_PTR(err);
@@ -244,6 +283,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 
 static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
 {
+	struct bpf_arena *arena = data;
 	struct page *page;
 	pte_t pte;
 
@@ -251,6 +291,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
 	if (!pte_present(pte)) /* sanity check */
 		return 0;
 	page = pte_page(pte);
+	/*
+	 * Skip the scratch page. The walk is page-table-driven, not range-tree-driven,
+	 * so it can visit scratch PTEs at uaddrs the BPF program never allocated.
+	 */
+	if (page == arena->scratch_page)
+		return 0;
 	/*
 	 * We do not update pte here:
 	 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
@@ -286,9 +332,10 @@ static void arena_map_free(struct bpf_map *map)
 	 * free those pages.
 	 */
 	apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
-				     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+				     SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
 	free_vm_area(arena->kern_vm);
 	range_tree_destroy(&arena->rt);
+	__free_page(arena->scratch_page);
 	bpf_map_area_free(arena);
 }
 
@@ -374,33 +421,37 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		return VM_FAULT_RETRY;
 
 	page = vmalloc_to_page((void *)kaddr);
-	if (page)
+	if (page) {
+		if (page == arena->scratch_page)
+			/* BPF triggered scratch here; don't lazy-alloc over it */
+			goto out_sigsegv;
 		/* already have a page vmap-ed */
 		goto out;
+	}
 
 	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
 
 	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
 		/* User space requested to segfault when page is not allocated by bpf prog */
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 
 	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
 	if (ret)
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 
 	struct apply_range_data data = { .pages = &page, .i = 0 };
 	/* Account into memcg of the process that created bpf_arena */
 	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 	}
 
 	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		free_pages_nolock(page, 0);
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 	}
 	flush_vmap_cache(kaddr, PAGE_SIZE);
 	bpf_map_memcg_exit(old_memcg, new_memcg);
@@ -409,8 +460,9 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	vmf->page = page;
 	return 0;
-out_unlock_sigsegv:
+out_sigsegv_memcg:
 	bpf_map_memcg_exit(old_memcg, new_memcg);
+out_sigsegv:
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	return VM_FAULT_SIGSEGV;
 }
@@ -668,6 +720,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 	struct llist_head free_pages;
 	struct llist_node *pos, *t;
 	struct arena_free_span *s;
+	struct clear_range_data cdata;
 	unsigned long flags;
 	int ret = 0;
 
@@ -696,9 +749,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 	range_tree_set(&arena->rt, pgoff, page_cnt);
 
 	init_llist_head(&free_pages);
+	cdata.free_pages = &free_pages;
+	cdata.scratch_page = arena->scratch_page;
 	/* clear ptes and collect struct pages */
 	apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
-				     apply_range_clear_cb, &free_pages);
+				     apply_range_clear_cb, &cdata);
 
 	/* drop the lock to do the tlb flush and zap pages */
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
@@ -788,6 +843,7 @@ static void arena_free_worker(struct work_struct *work)
 	struct arena_free_span *s;
 	u64 arena_vm_start, user_vm_start;
 	struct llist_head free_pages;
+	struct clear_range_data cdata;
 	struct page *page;
 	unsigned long full_uaddr;
 	long kaddr, page_cnt, pgoff;
@@ -801,6 +857,8 @@ static void arena_free_worker(struct work_struct *work)
 	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
 
 	init_llist_head(&free_pages);
+	cdata.free_pages = &free_pages;
+	cdata.scratch_page = arena->scratch_page;
 	arena_vm_start = bpf_arena_get_kern_vm_start(arena);
 	user_vm_start = bpf_arena_get_user_vm_start(arena);
 
@@ -813,7 +871,7 @@ static void arena_free_worker(struct work_struct *work)
 
 		/* clear ptes and collect pages in free_pages llist */
 		apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
-					     apply_range_clear_cb, &free_pages);
+					     apply_range_clear_cb, &cdata);
 
 		range_tree_set(&arena->rt, pgoff, page_cnt);
 	}
@@ -928,23 +986,12 @@ static int __init kfunc_init(void)
 }
 late_initcall(kfunc_init);
 
-void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write,
+					      unsigned long addr, unsigned long fault_ip)
 {
 	struct bpf_stream_stage ss;
-	struct bpf_prog *prog;
 	u64 user_vm_start;
 
-	/*
-	 * The RCU read lock is held to safely traverse the latch tree, but we
-	 * don't need its protection when accessing the prog, since it will not
-	 * disappear while we are handling the fault.
-	 */
-	rcu_read_lock();
-	prog = bpf_prog_ksym_find(fault_ip);
-	rcu_read_unlock();
-	if (!prog)
-		return;
-
 	/* Use main prog for stream access */
 	prog = prog->aux->main_prog_aux->prog;
 
@@ -957,3 +1004,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo
 		bpf_stream_dump_stack(ss);
 	}));
 }
+
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip)
+{
+	struct bpf_arena *arena;
+	struct bpf_prog *prog;
+	unsigned long kbase;
+	unsigned long page_addr = addr & PAGE_MASK;
+
+	prog = bpf_prog_find_from_stack();
+	if (!prog)
+		return false;
+
+	arena = prog->aux->arena;
+	/* a prog not using arena may be on stack, so arena can be NULL */
+	if (!arena)
+		return false;
+
+	kbase = bpf_arena_get_kern_vm_start(arena);
+
+	/*
+	 * Recovery covers the 4 GiB mappable band plus the upper half-guard.
+	 * Lower guard is unreachable from kfuncs; an address there indicates
+	 * a different bug class - leave it to the regular kernel oops path.
+	 */
+	if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2)
+		return false;
+
+	apply_to_page_range(&init_mm, page_addr, PAGE_SIZE,
+			    apply_range_set_scratch_cb, arena->scratch_page);
+	flush_vmap_cache(page_addr, PAGE_SIZE);
+	__bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip);
+	return true;
+}
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+	struct bpf_prog *prog;
+
+	/*
+	 * The RCU read lock is held to safely traverse the latch tree, but we
+	 * don't need its protection when accessing the prog, since it will not
+	 * disappear while we are handling the fault.
+	 */
+	rcu_read_lock();
+	prog = bpf_prog_ksym_find(fault_ip);
+	rcu_read_unlock();
+	if (!prog)
+		return;
+	__bpf_prog_report_arena_violation(prog, write, addr, fault_ip);
+}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 066b86e7233c..fa368d8920d9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3290,6 +3290,11 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
 {
 	return 0;
 }
+__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
+					unsigned long fault_ip)
+{
+	return false;
+}
 
 #ifdef CONFIG_BPF_SYSCALL
 static int __init bpf_global_ma_init(void)
-- 
2.54.0



^ permalink raw reply related


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox