Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed
* Re: [PATCH v8 3/5] clk: en7523: Add support for selecting the Serdes port in SCU
From: Brian Masney @ 2026-05-20 22:53 UTC (permalink / raw)
  To: Christian Marangi
  Cc: Michael Turquette, Stephen Boyd, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Vinod Koul, Neil Armstrong, Lorenzo Bianconi,
	Felix Fietkau, linux-clk, devicetree, linux-kernel,
	linux-arm-kernel, linux-phy
In-Reply-To: <20260520150912.11614-4-ansuelsmth@gmail.com>

Hi Christian,

On Wed, May 20, 2026 at 05:09:08PM +0200, Christian Marangi wrote:
> In the SCU register for clock and reset, there are also some register to
> select the Serdes port mode. The Airoha AN7581 SoC have 4 different Serdes
> that can switch between PCIe, USB or Ethernet mode.
> 
> Add a simple PHY provider that expose the .set_mode OP to toggle the
> requested mode for the Serdes port.
> 
> Signed-off-by: Christian Marangi <ansuelsmth@gmail.com>
> ---
>  drivers/clk/Kconfig      |   1 +
>  drivers/clk/clk-en7523.c | 216 ++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 214 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/clk/Kconfig b/drivers/clk/Kconfig
> index b2efbe9f6acb..e60a824b5117 100644
> --- a/drivers/clk/Kconfig
> +++ b/drivers/clk/Kconfig
> @@ -221,6 +221,7 @@ config COMMON_CLK_EN7523
>  	bool "Clock driver for Airoha/EcoNet SoC system clocks"
>  	depends on OF
>  	depends on ARCH_AIROHA || ECONET || COMPILE_TEST
> +	select GENERIC_PHY
>  	default ARCH_AIROHA
>  	help
>  	  This driver provides the fixed clocks and gates present on Airoha
> diff --git a/drivers/clk/clk-en7523.c b/drivers/clk/clk-en7523.c
> index 1ab0e2eca5d3..d4b73c5f15b9 100644
> --- a/drivers/clk/clk-en7523.c
> +++ b/drivers/clk/clk-en7523.c
> @@ -6,14 +6,18 @@
>  #include <linux/io.h>
>  #include <linux/mfd/syscon.h>
>  #include <linux/platform_device.h>
> +#include <linux/phy.h>
> +#include <linux/phy/phy.h>
>  #include <linux/property.h>
>  #include <linux/regmap.h>
>  #include <linux/reset-controller.h>
> +#include <linux/spinlock.h>
>  #include <dt-bindings/clock/en7523-clk.h>
>  #include <dt-bindings/reset/airoha,en7523-reset.h>
>  #include <dt-bindings/reset/airoha,en7581-reset.h>
>  #include <dt-bindings/clock/econet,en751221-scu.h>
>  #include <dt-bindings/reset/econet,en751221-scu.h>
> +#include <dt-bindings/soc/airoha,scu-ssr.h>
>  
>  #define RST_NR_PER_BANK			32
>  
> @@ -40,9 +44,22 @@
>  #define   REG_HIR_MASK			GENMASK(31, 16)
>  /* EN7581 */
>  #define REG_NP_SCU_PCIC			0x88
> +#define REG_NP_SCU_SSR3			0x94
> +#define REG_SSUSB_HSGMII_SEL_MASK	BIT(29)
> +#define REG_SSUSB_HSGMII_SEL_HSGMII	FIELD_PREP_CONST(REG_SSUSB_HSGMII_SEL_MASK, 0x0)
> +#define REG_SSUSB_HSGMII_SEL_USB	FIELD_PREP_CONST(REG_SSUSB_HSGMII_SEL_MASK, 0x1)
>  #define REG_NP_SCU_SSTR			0x9c
>  #define REG_PCIE_XSI0_SEL_MASK		GENMASK(14, 13)
> +#define REG_PCIE_XSI0_SEL_PCIE		FIELD_PREP_CONST(REG_PCIE_XSI0_SEL_MASK, 0x0)
> +#define REG_PCIE_XSI0_SEL_XFI		FIELD_PREP_CONST(REG_PCIE_XSI0_SEL_MASK, 0x1)
> +#define REG_PCIE_XSI0_SEL_HSGMII	FIELD_PREP_CONST(REG_PCIE_XSI0_SEL_MASK, 0x2)
>  #define REG_PCIE_XSI1_SEL_MASK		GENMASK(12, 11)
> +#define REG_PCIE_XSI1_SEL_PCIE		FIELD_PREP_CONST(REG_PCIE_XSI1_SEL_MASK, 0x0)
> +#define REG_PCIE_XSI1_SEL_XFI		FIELD_PREP_CONST(REG_PCIE_XSI1_SEL_MASK, 0x1)
> +#define REG_PCIE_XSI1_SEL_HSGMII	FIELD_PREP_CONST(REG_PCIE_XSI1_SEL_MASK, 0x2)
> +#define REG_USB_PCIE_SEL_MASK		BIT(3)
> +#define REG_USB_PCIE_SEL_PCIE		FIELD_PREP_CONST(REG_USB_PCIE_SEL_MASK, 0x0)
> +#define REG_USB_PCIE_SEL_USB		FIELD_PREP_CONST(REG_USB_PCIE_SEL_MASK, 0x1)
>  #define REG_CRYPTO_CLKSRC2		0x20c
>  /* EN751221 */
>  #define EN751221_REG_SPI_DIV		0x0cc
> @@ -81,6 +98,8 @@ enum en_hir {
>  	HIR_MAX		= 14,
>  };
>  
> +#define EN_SERDES_PHY_NUM		4
> +
>  struct en_clk_desc {
>  	int id;
>  	const char *name;
> @@ -113,6 +132,18 @@ struct en_rst_data {
>  	struct reset_controller_dev rcdev;
>  };
>  
> +struct en_serdes_phy_instance {
> +	struct phy *phy;
> +	unsigned int serdes_port;
> +};
> +
> +struct en_clk_priv {
> +	void __iomem *base;
> +	/* protect SCU register */
> +	spinlock_t lock;

This spinlock is not initialized with spin_lock_init(). You can do this in
en7523_clk_probe() after devm_kzalloc().

With that fixed:

Reviewed-by: Brian Masney <bmasney@redhat.com>



^ permalink raw reply

* Re: [PATCH 2/2] ASoC: mt8173-max98090: use standard callback to set jack
From: kernel test robot @ 2026-05-20 23:12 UTC (permalink / raw)
  To: Srinivas Kandagatla, broonie
  Cc: oe-kbuild-all, lgirdwood, perex, tiwai, matthias.bgg,
	angelogioacchino.delregno, sharq0406, kuninori.morimoto.gx,
	ckeepax, srinivas.kandagatla, linux-sound, linux-kernel,
	linux-arm-kernel, linux-mediatek
In-Reply-To: <20260520132930.54333-3-srinivas.kandagatla@oss.qualcomm.com>

Hi Srinivas,

kernel test robot noticed the following build warnings:

[auto build test WARNING on broonie-sound/for-next]
[also build test WARNING on linus/master v7.1-rc4 next-20260520]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Srinivas-Kandagatla/ASoC-codecs-max98090-use-component-set_jack-callback/20260520-213646
base:   https://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound.git for-next
patch link:    https://lore.kernel.org/r/20260520132930.54333-3-srinivas.kandagatla%40oss.qualcomm.com
patch subject: [PATCH 2/2] ASoC: mt8173-max98090: use standard callback to set jack
config: x86_64-rhel-9.4-ltp (https://download.01.org/0day-ci/archive/20260521/202605210132.USry3Haw-lkp@intel.com/config)
compiler: gcc-14 (Debian 14.2.0-19) 14.2.0
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20260521/202605210132.USry3Haw-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202605210132.USry3Haw-lkp@intel.com/

All warnings (new ones prefixed by >>):

>> Warning: sound/soc/codecs/max98090.c:2353 function parameter 'data' not described in 'max98090_set_jack'
>> Warning: sound/soc/codecs/max98090.c:2353 function parameter 'data' not described in 'max98090_set_jack'

--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


^ permalink raw reply

* Re: [PATCH] ARM: dts: bcm958625-meraki-mx6x: move pinctrl conkfig to pwm
From: Florian Fainelli @ 2026-05-20 23:22 UTC (permalink / raw)
  To: bcm-kernel-feedback-list, Rosen Penev, devicetree
  Cc: Florian Fainelli, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
	Ray Jui, Scott Branden,
	moderated list:BROADCOM IPROC ARM ARCHITECTURE, open list
In-Reply-To: <20260430051612.700050-1-rosenp@gmail.com>

From: Florian Fainelli <f.fainelli@gmail.com>

On Wed, 29 Apr 2026 22:16:12 -0700, Rosen Penev <rosenp@gmail.com> wrote:
> On boot there is this warning:
> 
> /axi@18000000/pinctrl@3f1c0: Fixed dependency cycle(s) with /axi@18000000/pinctrl@3f1c0/pwm_leds
> 
> Fix by moving the pinctrl configuration to pwm, which is the actual
> consumer.
> 
> Signed-off-by: Rosen Penev <rosenp@gmail.com>
> ---

Applied to https://github.com/Broadcom/stblinux/commits/devicetree/next, thanks!
--
Florian


^ permalink raw reply

* Re: [PATCH] ARM: dts: BCM5301X: R6300v2: fix USB3
From: Florian Fainelli @ 2026-05-20 23:26 UTC (permalink / raw)
  To: Rosen Penev, devicetree
  Cc: Hauke Mehrtens, Rafał Miłecki,
	Broadcom internal kernel review list, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley,
	moderated list:BROADCOM BCM5301X ARM ARCHITECTURE, open list
In-Reply-To: <20260406220453.101185-1-rosenp@gmail.com>

On 4/6/26 15:04, Rosen Penev wrote:
> USB3 needs GPIO to be pulled HIGH in order to function. Add vcc-gpio to
> do so.
> 
> Signed-off-by: Rosen Penev <rosenp@gmail.com>

Applied, thanks!
-- 
Florian


^ permalink raw reply

* Re: [PATCH] ARM: dts: BCM5301X: EA6500v2: fix USB3
From: Florian Fainelli @ 2026-05-20 23:27 UTC (permalink / raw)
  To: Rosen Penev, devicetree
  Cc: Hauke Mehrtens, Rafał Miłecki,
	Broadcom internal kernel review list, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley,
	moderated list:BROADCOM BCM5301X ARM ARCHITECTURE, open list
In-Reply-To: <20260406220603.101494-1-rosenp@gmail.com>

On 4/6/26 15:06, Rosen Penev wrote:
> USB3 needs to have a GPIO pulled HIGH in order to function. Add vcc-gpio
> to do so.
> 
> Signed-off-by: Rosen Penev <rosenp@gmail.com>

Applied, thanks!
-- 
Florian


^ permalink raw reply

* Re: [PATCH v3 0/3] Add RP1 PWM controller support
From: Florian Fainelli @ 2026-05-20 23:31 UTC (permalink / raw)
  To: Andrea della Porta
  Cc: Uwe Kleine-König, linux-pwm, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley,
	Broadcom internal kernel review list, devicetree,
	linux-rpi-kernel, linux-arm-kernel, linux-kernel, Naushir Patuck,
	Stanimir Varbanov, mbrugger
In-Reply-To: <af2bsEdAhYY9c4rb@apocalypse>

On 5/8/26 01:15, Andrea della Porta wrote:
> Hi Uwe,
> 
> On 10:30 Thu 23 Apr     , Andrea della Porta wrote:
>> This patchset adds support for the PWM controller found on the
>> Raspberry Pi RP1 southbridge. This is necessary to operate the
>> cooling fan connected to one of the PWM channels.
>>
>> The tachometer pin for the fan speed is managed by the firmware
>> running on the RP1's M-core. It uses the PHASE2 register
>> to report the RPM, which is then exported by this driver via
>> syscon registers. A subsequent patch will add a new device
>> and driver to read the RPM and export this value via hwmon.
>>   
>> Subsequent patches will also add the CPU thermal zone, which
>> acts as a consumer of the PWM device.
>>
>> Best regards,
>> Andrea
>>
>> CHANGES in V3:
>>
>> - Refactored all the register macros. They now have RP1_PWM_ prefix
>>    and follow the register name.
>> - Dropped the tab alignment in front of struct declarations (use a space
>>    instead).
>> - Added a check in tohw() to test (and bail out quickly) in case that
>>    period_length_ns is zero.
>> - Probing now returns an error if clk_rate > 1 GHz.
>> - Added a check on minimum period ticks. Return 1 to signal round-up.
>> - Fixed inverted polarity detection on edge cases.
>> - Fixed rounding errors (in both tohwi() and fromhw()) in inverted
>>    polarity calculations.
>> - Dropped a redundant check on period >= duty.
>> - Replaced memset by inline struct init.
>> - Disabling a channel now is faster, skipping the duty/period/polarity
>>    setup.
>> - Fixed an error string (s/Fail/Failed)
>> - Used %pe to signal error string instead of an integer.
>> - Added several new sections to the Limitations paragraph to better
>>    explain what will happen on edge cases.
>> - Maximum period is now U32_MAX-1 to allow 100% duty cycle on all
>>    selectable periods.
>> - The hw period register now takes into account for the extra tick at
>>    the end of the period (subtracted one to wfhw->period_ticks in tohw
>>    and added 1 in fromhw).
>> - Added .remove() callback to free resources even if the driver is
>>    not unbindable/unloadable, to avoid accumulating tech debt.
>>
>>
>> Naushir Patuck (2):
>>    dt-bindings: pwm: Add Raspberry Pi RP1 PWM controller
>>    pwm: rp1: Add RP1 PWM controller driver
>>
>> Stanimir Varbanov (1):
>>    arm64: dts: broadcom: rpi-5: Add RP1 PWM node
>>
>>   .../bindings/pwm/raspberrypi,rp1-pwm.yaml     |  54 +++
>>   .../boot/dts/broadcom/bcm2712-rpi-5-b.dts     |  12 +
>>   arch/arm64/boot/dts/broadcom/rp1-common.dtsi  |  10 +
>>   drivers/pwm/Kconfig                           |   9 +
>>   drivers/pwm/Makefile                          |   1 +
>>   drivers/pwm/pwm-rp1.c                         | 414 ++++++++++++++++++
>>   6 files changed, 500 insertions(+)
>>   create mode 100644 Documentation/devicetree/bindings/pwm/raspberrypi,rp1-pwm.yaml
>>   create mode 100644 drivers/pwm/pwm-rp1.c
>>
>> -- 
>> 2.35.3
>>
> 
> A gentle reminder about this patchset :)

Hi Uwe,

I would prefer to take the DTS changes through the Broadcom ARM SoC tree 
to minimize conflicts on my end, are you going to take the PWM patches 
for 7.2? Thanks!
-- 
Florian


^ permalink raw reply

* Re: [PATCH v2 0/5] mm: reduce mmap_lock contention and improve page fault performance
From: Barry Song @ 2026-05-20 23:37 UTC (permalink / raw)
  To: David Hildenbrand (Arm)
  Cc: Matthew Wilcox, Liam R. Howlett, Suren Baghdasaryan,
	Lorenzo Stoakes, akpm, linux-mm, vbabka, rppt, mhocko, jack,
	pfalcato, wanglian, chentao, lianux.mm, kunwu.chan, liyangouwen1,
	chrisl, kasong, shikemeng, nphamcs, bhe, youngjun.park,
	linux-arm-kernel, linux-kernel, loongarch, linuxppc-dev,
	linux-riscv, linux-s390, Nanzhe Zhao
In-Reply-To: <3be9475b-0e8a-4df8-a130-4262f993973d@kernel.org>

On Thu, May 21, 2026 at 5:35 AM David Hildenbrand (Arm)
<david@kernel.org> wrote:
>
> On 5/20/26 23:15, Matthew Wilcox wrote:
> > On Thu, May 21, 2026 at 05:14:20AM +0800, Barry Song wrote:
> >> My understanding is that we should not blame applications here. This is 2026:
> >> there are basically only two kinds of applications — single-threaded and
> >> multi-threaded — and single-threaded applications are nearly extinct.
> >
> > all of the applications i run are either single threaded or don't fork.
> > what multithreaded applications call fork?
>
> Traditionally the problem was random libraries using fork+execve to launch other
> programs ... instead of using alternatives like posix_spwan (some use cases
> require more work done before execve and cannot yet switch to that). I'd hope
> that that is less of a problem on Android.
>
> I assume Android zygote might be multi threaded? Maybe sshd as well? Systemd?
> But I'd be surprised if there are really performance implications.

I am trying to answer the question above:

1. zygote, multi-threaded on my phone using Android13.
/ # ls /proc/`pidof zygote64`/task/
1359  22728  22729  22730  22731  22732

/proc/1359/task # cat 22728/comm
Jit thread pool
/proc/1359/task # cat 22730/comm
ReferenceQueueD
/proc/1359/task # cat 22731/comm
FinalizerDaemon
/proc/1359/task # cat 22732/comm
FinalizerWatchd
/proc/1359/task # cat 1359/comm
main

But on another phone of mine running Android 16, zygote64 is
single-threaded.
Not sure if it is due to the Android team making some changes
related to threads from Android 13 to Android 16.

2. sshd, multi-processes instead of multi-threads:
$ ps aux | grep sshd
root        1192  0.0  0.0  15444  9032 ?        Ss   09:42   0:00
sshd: /usr/sbin/sshd -D [listener] 0 of 10-100 startups
root        2465  0.0  0.0  17164 10760 ?        Ss   09:42   0:00
sshd: barry [priv]
barry       2632  0.0  0.0  17164  7852 ?        S    09:42   0:00
sshd: barry@pts/0
root        3305  2.5  0.0  17164 10772 ?        Ss   09:44   0:00
sshd: barry [priv]
barry       3406  0.0  0.0  17164  7940 ?        S    09:44   0:00
sshd: barry@pts/1

3. systemd, also multi-processes

$ ps ax | grep systemd
    350 ?        S<s    0:00 /lib/systemd/systemd-journald
    387 ?        Ss     0:00 /lib/systemd/systemd-udevd
    666 ?        Ss     0:00 /lib/systemd/systemd-oomd
    667 ?        Ss     0:00 /lib/systemd/systemd-resolved
    728 ?        Ss     0:00 @dbus-daemon --system --address=systemd:
--nofork --nopidfile --systemd-activation --syslog-only
    751 ?        Ss     0:00 /lib/systemd/systemd-logind
    753 ?        Ssl    0:00 /usr/sbin/thermald --systemd
--dbus-enable --adaptive
   1350 ?        Ss     0:00 /lib/systemd/systemd --user
   1428 ?        Ss     0:00 /usr/bin/dbus-daemon --session
--address=systemd: --nofork --nopidfile --systemd-activation
--syslog-only
   1900 ?        Ssl    0:00 /usr/libexec/gnome-session-binary
--systemd-service --session=ubuntu
   2141 ?        Ssl    0:00 /lib/systemd/systemd-timesyncd

>
> Not sure about webbroswers .... I think most of them switched to fork servers,
> where I would assume fork servers would be single-threaded.

On my phone, Chrome is multi-process, but its parent process
chrome_zygote (10774) is single-threaded:

 ps -A | grep chrome
u0_i15        9883 10774 321066464 119452 do_epoll_wait     0 S
com.android.chrome:sandboxed_process0:org.chromium.content.app.SandboxedProcessService0:15
u0_a142      10164  1359 35110548 277640 do_epoll_wait      0 S
com.android.chrome
u0_a278      10724  1359 9779864 104988 do_epoll_wait       0 S
com.google.android.apps.chromecast.app
u0_a142      10774  1359 32803908 64076 do_sys_poll         0 S
com.android.chrome_zygote
u0_a142      11173  1359 34208592 142192 do_epoll_wait      0 S
com.android.chrome:privileged_process0

/proc/10774/task # ls
10774

>
> So, yeah, getting a clear understanding how this ends up being a problem on
> Android would be great.

I guess the real issue is that in the Android market, there
are so many applications that are out of our control?

Here are some trace examples from Nanzhe:

iQIYI plugin
vma reader thread:
PbMisc-0, pid=27183, tgid=26444

vma writer thread:
i.video:plugin1, pid=27298, tgid=26444
writer blocked: 440394938 ns (440 ms)

reader stack:
vma_start_read
lock_vma_under_rcu
do_page_fault
do_translation_fault
do_mem_abort
el0_da
el0t_64_sync_handler
el0t_64_sync

writer stack:
__vma_start_write
dup_mmap
copy_mm
copy_process
kernel_clone
__arm64_sys_clone
invoke_syscall
el0_svc_common
do_el0_svc
el0_svc


Baidu Tieba
vma reader thread:
elastic_pms_pro, pid=7731, tgid=7575

vma writer thread:
com.baidu.tieba, pid=8005, tgid=7575
writer blocked: 514975545 ns(515 ms)

reader stack:
vma_start_read
lock_vma_under_rcu
do_page_fault
do_translation_fault
do_mem_abort
el0_da
el0t_64_sync_handler
el0t_64_sync

writer stack:
__vma_start_write
dup_mmap
copy_mm
copy_process
kernel_clone
__arm64_sys_clone
invoke_syscall
el0_svc_common
do_el0_svc
el0_svc

Thanks
Barry


^ permalink raw reply

* Re: [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages
From: Tejun Heo @ 2026-05-20 23:47 UTC (permalink / raw)
  To: Alexei Starovoitov, Peter Zijlstra
  Cc: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi, Catalin Marinas, Will Deacon,
	Thomas Gleixner, Ingo Molnar, Borislav Petkov, Dave Hansen,
	Andrew Morton, David Hildenbrand, Mike Rapoport, Emil Tsalapatis,
	sched-ext, bpf, x86, linux-arm-kernel, linux-mm, linux-kernel
In-Reply-To: <DIM6W7GRZK2N.14AH43S1XTFCG@gmail.com>

Hello,

On Mon, May 18, 2026 at 04:26:11PM -0700, Alexei Starovoitov wrote:
> Well, this gen_pool based allocator of arena memory is a temporary hack.
> It's ok for rare allocation like in this at scx init time, but not suitable
> for active arena management. We don't need to expose it beyond scx.

I see. Peter, as Alexei is already prototyping a slab-based arena
allocator, how about keeping the gen_pool layer scx-local for now? Once
the proper allocator lands, scx can switch to it and the custom piece
goes away.

Thanks.

--
tejun


^ permalink raw reply

* Re: [PATCH v7 05/28] media: v4l2-common: add v4l2_fill_pixfmt_mp_aligned helper
From: Nicolas Dufresne @ 2026-05-20 23:48 UTC (permalink / raw)
  To: Sven Püschel, Jacob Chen, Ezequiel Garcia,
	Mauro Carvalho Chehab, Heiko Stuebner, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Hans Verkuil
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	devicetree, kernel, sebastian.reichel, m.tretter, p.zabel
In-Reply-To: <20260521-spu-rga3-v7-5-3f33e8c7145f@pengutronix.de>

[-- Attachment #1: Type: text/plain, Size: 7397 bytes --]

Le jeudi 21 mai 2026 à 00:44 +0200, Sven Püschel a écrit :
> Add a v4l2_fill_pixfmt_mp_aligned helper which allows the user to
> specify a custom stride alignment in bytes. This is necessary for
> hardware like the Rockchip RGA3, which requires the stride value to be
> aligned to a 16 bytes boundary.
> 
> The code makes some assumptions about the v4l2 format to simplify the
> calculation. They currently hold for all known v4l2 formats.
> 
> v4l2_format_plane_stride uses an unsigned int as argument type to avoid
> the later multiplication from overflowing the u8 value. All other places
> use u8, as no practical use cases for a larger alignment are known at
> the moment.
> 
> Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
> Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>

Thanks for the update, my Rb still hold.

Nicolas

> 
> ---
> Changes in v7:
> - Also adjust alignment when C plane is larger than the Y plane
>   Flagged by Sashiko:
>   https://sashiko.dev/#/patchset/20260515-spu-rga3-v6-0-e547152eb9c9%40pengutronix.de?part=5
> 
> Changes in v6:
> - Fixed alignment multiplication of 0 for NV24 - flagged by Sashiko:
>   https://sashiko.dev/#/patchset/20260428-spu-rga3-v5-0-eb7f5d019d86%40pengutronix.de?part=5
> - Changed v4l2_format_plane_stride alignment parameter type to
>   avoid overflow for 64/128 byte alignment by multiplication.
>   Flagged by Sashiko URL above.
> ---
>  drivers/media/v4l2-core/v4l2-common.c | 58 +++++++++++++++++++++++++++--------
>  include/media/v4l2-common.h           |  4 +++
>  2 files changed, 50 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/media/v4l2-core/v4l2-common.c b/drivers/media/v4l2-core/v4l2-common.c
> index 3cc8b04e1ea63..b771ed9b659b0 100644
> --- a/drivers/media/v4l2-core/v4l2-common.c
> +++ b/drivers/media/v4l2-core/v4l2-common.c
> @@ -432,14 +432,35 @@ static inline unsigned int v4l2_format_block_height(const struct v4l2_format_inf
>  }
>  
>  static inline unsigned int v4l2_format_plane_stride(const struct v4l2_format_info *info, int plane,
> -						    unsigned int width)
> +						    unsigned int width, unsigned int byte_alignment)
>  {
>  	unsigned int hdiv = plane ? info->hdiv : 1;
>  	unsigned int aligned_width =
>  		ALIGN(width, v4l2_format_block_width(info, plane));
>  
> -	return DIV_ROUND_UP(aligned_width, hdiv) *
> -	       info->bpp[plane] / info->bpp_div[plane];
> +	/*
> +	 * Formats with a single memory plane derive the stride of the
> +	 * other planes from the y stride. To avoid hardware or software
> +	 * deriving a different stride for the composite plane,
> +	 * multiply the alignment accordingly.
> +	 *
> +	 * It assumes the following format properties:
> +	 * - bpp_div[0] == bpp_div[1]
> +	 * - The multiplication factor doesn't differ between the non y planes
> +	 * - The multiplication factor is a power of 2
> +	 */
> +	if (info->mem_planes == 1 && info->comp_planes > 1) {
> +		if (plane == 0)
> +			byte_alignment *= DIV_ROUND_UP(
> +				info->hdiv * info->bpp[0], info->bpp[1]);
> +		else
> +			byte_alignment *= DIV_ROUND_UP(
> +				info->bpp[1], info->hdiv * info->bpp[0]);
> +	}
> +
> +	return ALIGN(DIV_ROUND_UP(aligned_width, hdiv) * info->bpp[plane] /
> +			     info->bpp_div[plane],
> +		     byte_alignment);
>  }
>  
>  static inline unsigned int v4l2_format_plane_height(const struct v4l2_format_info *info, int plane,
> @@ -453,9 +474,10 @@ static inline unsigned int v4l2_format_plane_height(const struct v4l2_format_inf
>  }
>  
>  static inline unsigned int v4l2_format_plane_size(const struct v4l2_format_info *info, int plane,
> -						  unsigned int width, unsigned int height)
> +						  unsigned int width, unsigned int height,
> +						  u8 stride_alignment)
>  {
> -	return v4l2_format_plane_stride(info, plane, width) *
> +	return v4l2_format_plane_stride(info, plane, width, stride_alignment) *
>  	       v4l2_format_plane_height(info, plane, height);
>  }
>  
> @@ -476,8 +498,9 @@ void v4l2_apply_frmsize_constraints(u32 *width, u32 *height,
>  }
>  EXPORT_SYMBOL_GPL(v4l2_apply_frmsize_constraints);
>  
> -int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt,
> -			u32 pixelformat, u32 width, u32 height)
> +int v4l2_fill_pixfmt_mp_aligned(struct v4l2_pix_format_mplane *pixfmt,
> +				u32 pixelformat, u32 width, u32 height,
> +				u8 stride_alignment)
>  {
>  	const struct v4l2_format_info *info;
>  	struct v4l2_plane_pix_format *plane;
> @@ -494,23 +517,34 @@ int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt,
>  
>  	if (info->mem_planes == 1) {
>  		plane = &pixfmt->plane_fmt[0];
> -		plane->bytesperline = v4l2_format_plane_stride(info, 0, width);
> +		plane->bytesperline = v4l2_format_plane_stride(info, 0, width,
> +							       stride_alignment);
>  		plane->sizeimage = 0;
>  
>  		for (i = 0; i < info->comp_planes; i++)
>  			plane->sizeimage +=
> -				v4l2_format_plane_size(info, i, width, height);
> +				v4l2_format_plane_size(info, i, width, height,
> +						       stride_alignment);
>  	} else {
>  		for (i = 0; i < info->comp_planes; i++) {
>  			plane = &pixfmt->plane_fmt[i];
>  			plane->bytesperline =
> -				v4l2_format_plane_stride(info, i, width);
> +				v4l2_format_plane_stride(info, i, width,
> +							 stride_alignment);
>  			plane->sizeimage = plane->bytesperline *
>  				v4l2_format_plane_height(info, i, height);
>  		}
>  	}
>  	return 0;
>  }
> +EXPORT_SYMBOL_GPL(v4l2_fill_pixfmt_mp_aligned);
> +
> +int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt,
> +			u32 pixelformat, u32 width, u32 height)
> +{
> +	return v4l2_fill_pixfmt_mp_aligned(pixfmt, pixelformat,
> +					   width, height, 1);
> +}
>  EXPORT_SYMBOL_GPL(v4l2_fill_pixfmt_mp);
>  
>  int v4l2_fill_pixfmt(struct v4l2_pix_format *pixfmt, u32 pixelformat,
> @@ -530,12 +564,12 @@ int v4l2_fill_pixfmt(struct v4l2_pix_format *pixfmt, u32 pixelformat,
>  	pixfmt->width = width;
>  	pixfmt->height = height;
>  	pixfmt->pixelformat = pixelformat;
> -	pixfmt->bytesperline = v4l2_format_plane_stride(info, 0, width);
> +	pixfmt->bytesperline = v4l2_format_plane_stride(info, 0, width, 1);
>  	pixfmt->sizeimage = 0;
>  
>  	for (i = 0; i < info->comp_planes; i++)
>  		pixfmt->sizeimage +=
> -			v4l2_format_plane_size(info, i, width, height);
> +			v4l2_format_plane_size(info, i, width, height, 1);
>  	return 0;
>  }
>  EXPORT_SYMBOL_GPL(v4l2_fill_pixfmt);
> diff --git a/include/media/v4l2-common.h b/include/media/v4l2-common.h
> index 401d8506c24b5..edd416178c333 100644
> --- a/include/media/v4l2-common.h
> +++ b/include/media/v4l2-common.h
> @@ -558,6 +558,10 @@ int v4l2_fill_pixfmt(struct v4l2_pix_format *pixfmt, u32 pixelformat,
>  		     u32 width, u32 height);
>  int v4l2_fill_pixfmt_mp(struct v4l2_pix_format_mplane *pixfmt, u32 pixelformat,
>  			u32 width, u32 height);
> +/* @stride_alignment is a power of 2 value in bytes */
> +int v4l2_fill_pixfmt_mp_aligned(struct v4l2_pix_format_mplane *pixfmt,
> +				u32 pixelformat, u32 width, u32 height,
> +				u8 stride_alignment);
>  
>  /**
>   * v4l2_get_link_freq - Get link rate from transmitter

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* [PATCH 1/8] mm: Add ptep_try_set() for lockless empty-slot installs
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

Add ptep_try_set(ptep, new_pte): atomically set *ptep to new_pte iff it is
currently pte_none(). Returns true on success, false if the slot was already
populated or the arch has no implementation.

The intended caller is the upcoming bpf_arena kernel-side fault recovery
path. The install runs from a page fault that can be nested under locks
held by the faulting kernel caller (e.g. a BPF program holding
raw_res_spin_lock_irqsave on its arena's spinlock), so trylock-and-retry
would A-A deadlock. Lock-free cmpxchg is the only viable option, which
constrains this helper to special kernel page tables where concurrent
writers cooperate via atomic accessors.

The generic version in <linux/pgtable.h> returns false. x86 and arm64
override with try_cmpxchg-based implementations on the underlying pteval.
Other architectures get the false stub - the callers there already fall
through to oops.

v2: Rename to ptep_try_set(). Tighten kerneldoc for kernel-PTE use.
    (David, Alexei)

Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
---
 arch/arm64/include/asm/pgtable.h |  8 ++++++++
 arch/x86/include/asm/pgtable.h   |  8 ++++++++
 include/linux/pgtable.h          | 26 ++++++++++++++++++++++++++
 3 files changed, 42 insertions(+)

diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h
index 9029b81ccbe8..a129be91ef2c 100644
--- a/arch/arm64/include/asm/pgtable.h
+++ b/arch/arm64/include/asm/pgtable.h
@@ -1830,6 +1830,14 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
 	return __ptep_get_and_clear(mm, addr, ptep);
 }
 
+static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
+{
+	pteval_t old = 0;
+
+	return try_cmpxchg(&pte_val(*ptep), &old, pte_val(new_pte));
+}
+#define ptep_try_set ptep_try_set
+
 #define test_and_clear_young_ptes test_and_clear_young_ptes
 static inline bool test_and_clear_young_ptes(struct vm_area_struct *vma,
 		unsigned long addr, pte_t *ptep, unsigned int nr)
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 13e3e9a054cb..047e273a4eab 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -1284,6 +1284,14 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
 	} while (!try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte));
 }
 
+static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
+{
+	pte_t old_pte = __pte(0);
+
+	return try_cmpxchg((long *)&ptep->pte, (long *)&old_pte, *(long *)&new_pte);
+}
+#define ptep_try_set ptep_try_set
+
 #define flush_tlb_fix_spurious_fault(vma, address, ptep) do { } while (0)
 
 #define  __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
index cdd68ed3ae1a..d68374f404c1 100644
--- a/include/linux/pgtable.h
+++ b/include/linux/pgtable.h
@@ -1036,6 +1036,32 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres
 }
 #endif
 
+#ifndef ptep_try_set
+/**
+ * ptep_try_set - atomically set an empty kernel PTE
+ * @ptep: page table entry
+ * @new_pte: value to install
+ *
+ * Atomically set *@ptep to @new_pte iff *@ptep is pte_none(). Return
+ * true on success, false if the slot was already populated or the
+ * arch has no implementation.
+ *
+ * For special kernel page tables only - never user page tables. The
+ * caller must prevent concurrent teardown of @ptep and must accept
+ * that other writers may race. Concurrent clearers must use
+ * ptep_get_and_clear() so racing accesses agree on the outcome.
+ *
+ * Architectures opt in by providing a cmpxchg-based override and
+ * defining ptep_try_set as an identity macro. The generic stub
+ * returns false, which is correct for callers that fall through to
+ * oops on failure.
+ */
+static inline bool ptep_try_set(pte_t *ptep, pte_t new_pte)
+{
+	return false;
+}
+#endif
+
 #ifndef wrprotect_ptes
 /**
  * wrprotect_ptes - Write-protect PTEs that map consecutive pages of the same
-- 
2.54.0



^ permalink raw reply related

* [PATCHSET v3 sched_ext/for-7.2] bpf/arena: Direct kernel-side access
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo

Hello,

This makes BPF arena memory directly dereferenceable from kernel code
(struct_ops callbacks, kfuncs). Each arena gets a per-arena scratch page
that an arch fault hook installs into empty PTEs on kernel-side faults,
after KFENCE. The faulting instruction retries and the violation is reported
through the program's BPF stream.

v3:
- Patch 1: rename ptep_try_install() to ptep_try_set(). Tighten kerneldoc
  for kernel-PTE use. (David Hildenbrand, Alexei)
- Patch 2: apply_range_clear_cb() uses ptep_get_and_clear() so the install
  and clear sides race through atomic accessors. (David)

v2: https://lore.kernel.org/r/20260517211232.1670594-1-tj@kernel.org
v1 (RFC): https://lore.kernel.org/r/20260427105109.2554518-1-tj@kernel.org

Motivation
----------

sched_ext's ops_cid.set_cmask() hands the BPF scheduler a struct scx_cmask
*. The kernel translates a kernel cpumask to a cmask, but it had no way to
write into the arena, so the cmask lived in kernel memory and was passed as
a trusted pointer. BPF cmask helpers all operate on arena cmasks though, so
the BPF side had to word-by-word probe-read the kernel cmask into an arena
cmask via cmask_copy_from_kernel() before any helper could touch it. It
works, but is clumsy.

The shape isn't unique to set_cmask. Sub-scheduler support is on the way and
more sched_ext callbacks will want to pass structured data to BPF. Anywhere
a kfunc or struct_ops callback wants to hand a struct to a BPF program,
arena residence is the natural answer.

Approach
--------

Each arena gets a per-arena scratch page. Arenas stay sparsely mapped as
today - PTEs are populated only for allocated pages. A new arch fault hook
(bpf_arena_handle_page_fault) is wired into x86 page_fault_oops() and arm64
__do_kernel_fault(), after KFENCE. When a kernel-side access faults inside
an arena's kern_vm range, the helper walks the stack to find the BPF program
responsible, range-checks the fault address against prog->aux->arena, and
atomically installs the scratch page into the empty PTE via the new
ptep_try_set() wrapper. The kernel instruction retries and reads/writes the
scratch page. Free paths and map destruction treat scratch as non-owned.
Real allocation refuses to overwrite scratch (apply_range_set_cb returns
-EBUSY). A scratched address stays dead until map destroy, since its
presence means the BPF program has already malfunctioned.

The mechanism is default behavior - no UAPI flag.

What this preserves
-------------------

All the debugging properties of today's sparse-PTE design are preserved:

* BPF programs still fault on unmapped arena accesses. The fault semantics
  (instruction retry with rdst = 0) and the violation report through
  bpf_streams are unchanged for prog-side accesses.

* The first kernel-side touch of an unmapped address is reported via
  bpf_streams the same way as a prog-side fault, with the stack walk
  attributing it to the originating prog.

* User-side fault on a never-scratched address still lazy-allocates a real
  page (or returns SIGSEGV under BPF_F_SEGV_ON_FAULT). User-side fault on a
  scratched address SIGSEGVs.

What changes for the kernel-side caller is just that an unmapped deref no
longer oopses - it retries through the scratch page and emits a violation
report. The same shape today's BPF instruction faults have.

Patches 1-2 (atomic PTE install + arena scratch-page recovery)
--------------------------------------------------------------

  mm: Add ptep_try_set() for lockless empty-slot installs
  bpf: Recover arena kernel faults with scratch page

Patches 3-5 (helpers used by struct_ops registration)
-----------------------------------------------------

  bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers
  bpf: Add bpf_struct_ops_for_each_prog()
  bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena()

Patches 6-8 (sched_ext: arena auto-discovery, allocator, set_cmask)
-------------------------------------------------------------------

  sched_ext: Require an arena for cid-form schedulers
  sched_ext: Sub-allocator over kernel-claimed BPF arena pages
  sched_ext: Convert ops.set_cmask() to arena-resident cmask

Patch 6 reads each member prog's prog->aux->arena via bpf_prog_arena() and
requires the cid-form struct_ops to reference exactly one arena. Patch 7
builds a gen_pool sub-allocator inside that arena. Patch 8 converts
set_cmask() to write into arena memory; BPF dereferences via __arena like
any other arena struct, no probe-reads.

Base
----

sched_ext/for-7.2 (1136fb1213d1) with cmask-prep-v2.3 applied:
  https://lore.kernel.org/r/20260519075838.2706712-1-tj@kernel.org

Git tree: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git arena-direct-v3

 Documentation/bpf/kfuncs.rst          |  14 +++
 arch/arm64/include/asm/pgtable.h      |   8 ++
 arch/arm64/mm/fault.c                 |  10 +-
 arch/x86/include/asm/pgtable.h        |   8 ++
 arch/x86/mm/fault.c                   |  12 +-
 include/linux/bpf.h                   |  14 +++
 include/linux/bpf_defs.h              |  11 ++
 include/linux/pgtable.h               |  26 ++++
 kernel/bpf/arena.c                    | 216 +++++++++++++++++++++++++++-------
 kernel/bpf/bpf_struct_ops.c           |  36 ++++++
 kernel/bpf/core.c                     |   5 +
 kernel/sched/build_policy.c           |   4 +
 kernel/sched/ext.c                    | 135 ++++++++++++++++++++-
 kernel/sched/ext_arena.c              | 127 ++++++++++++++++++++
 kernel/sched/ext_arena.h              |  18 +++
 kernel/sched/ext_cid.c                |  20 +---
 kernel/sched/ext_internal.h           |  23 +++-
 tools/sched_ext/include/scx/cid.bpf.h |  52 --------
 tools/sched_ext/scx_qmap.bpf.c        |   5 +-
 19 files changed, 616 insertions(+), 128 deletions(-)

Thanks.

--
tejun


^ permalink raw reply

* [PATCH 3/8] bpf: Add sleepable variant of bpf_arena_alloc_pages for kernel callers
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

The existing kernel-side export of bpf_arena_alloc_pages is _non_sleepable
only - it's used by the verifier to inline the kfunc when the call site is
non-sleepable. There is no sleepable equivalent for kernel callers; the
kfunc bpf_arena_alloc_pages itself is BPF-only.

sched_ext needs sleepable kernel-side allocs for its arena pool init/grow
paths. Add bpf_arena_alloc_pages_sleepable() mirroring the _non_sleepable
wrapper but passing sleepable=true to arena_alloc_pages().

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/bpf.h |  8 ++++++++
 kernel/bpf/arena.c  | 13 +++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 831996c411cf..64968ca6db51 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -679,6 +679,8 @@ int bpf_dynptr_from_file_sleepable(struct file *file, u32 flags,
 void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
 					  u64 flags);
 void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt);
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt, int node_id,
+				      u64 flags);
 #else
 static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
 							int node_id, u64 flags)
@@ -689,6 +691,12 @@ static inline void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr
 static inline void bpf_arena_free_pages_non_sleepable(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 }
+
+static inline void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+						    int node_id, u64 flags)
+{
+	return NULL;
+}
 #endif
 
 extern const struct bpf_map_ops bpf_map_offload_ops;
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 1c0b87ecc817..a811cf6170fa 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -934,6 +934,19 @@ void *bpf_arena_alloc_pages_non_sleepable(void *p__map, void *addr__ign, u32 pag
 
 	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, false);
 }
+
+void *bpf_arena_alloc_pages_sleepable(void *p__map, void *addr__ign, u32 page_cnt,
+				      int node_id, u64 flags)
+{
+	struct bpf_map *map = p__map;
+	struct bpf_arena *arena = container_of(map, struct bpf_arena, map);
+
+	if (map->map_type != BPF_MAP_TYPE_ARENA || flags || !page_cnt)
+		return NULL;
+
+	return (void *)arena_alloc_pages(arena, (long)addr__ign, page_cnt, node_id, true);
+}
+
 __bpf_kfunc void bpf_arena_free_pages(void *p__map, void *ptr__ign, u32 page_cnt)
 {
 	struct bpf_map *map = p__map;
-- 
2.54.0



^ permalink raw reply related

* [PATCH 4/8] bpf: Add bpf_struct_ops_for_each_prog()
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

Add a helper that walks the member progs of the struct_ops map
containing a given @kdata vmtable. struct_ops ->reg() callbacks (and
similar) sometimes need to inspect the loaded BPF programs, e.g. to
discover maps they reference via prog->aux->used_maps.

The implementation mirrors bpf_struct_ops_id(): container_of @kdata
to recover the bpf_struct_ops_map, then iterate st_map->links[i]->prog
for i in [0, funcs_cnt). Same access pattern, no new locking - by the
time ->reg() fires st_map is fully populated and stable.

A sched_ext follow-up walks the member progs of a cid-form scheduler's
struct_ops map, reads prog->aux->arena directly, and requires all member
progs to reference exactly one arena, without requiring the BPF program
to call a registration kfunc.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/bpf.h         |  3 +++
 kernel/bpf/bpf_struct_ops.c | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 64968ca6db51..5b99d786e98c 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -2129,6 +2129,9 @@ int bpf_prog_assoc_struct_ops(struct bpf_prog *prog, struct bpf_map *map);
 void bpf_prog_disassoc_struct_ops(struct bpf_prog *prog);
 void *bpf_prog_get_assoc_struct_ops(const struct bpf_prog_aux *aux);
 u32 bpf_struct_ops_id(const void *kdata);
+int bpf_struct_ops_for_each_prog(const void *kdata,
+				 int (*cb)(struct bpf_prog *prog, void *data),
+				 void *data);
 
 #ifdef CONFIG_NET
 /* Define it here to avoid the use of forward declaration */
diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c
index 05b366b821c3..16aec18ed31b 100644
--- a/kernel/bpf/bpf_struct_ops.c
+++ b/kernel/bpf/bpf_struct_ops.c
@@ -1203,6 +1203,42 @@ u32 bpf_struct_ops_id(const void *kdata)
 }
 EXPORT_SYMBOL_GPL(bpf_struct_ops_id);
 
+/**
+ * bpf_struct_ops_for_each_prog - Invoke @cb for each member prog
+ * @kdata: kernel-side struct_ops vmtable (the @kdata arg to ->reg/->update/->unreg)
+ * @cb: callback invoked once per member prog; non-zero return stops iteration
+ * @data: opaque argument passed to @cb
+ *
+ * Walks the struct_ops member progs registered on the map containing @kdata.
+ * Intended for use from struct_ops ->reg() callbacks (and similar) that need to
+ * inspect the loaded BPF programs (for example to discover maps they reference
+ * via @prog->aux->used_maps).
+ *
+ * Return 0 if iteration completed, otherwise the first non-zero @cb return.
+ */
+int bpf_struct_ops_for_each_prog(const void *kdata,
+				 int (*cb)(struct bpf_prog *prog, void *data),
+				 void *data)
+{
+	struct bpf_struct_ops_value *kvalue;
+	struct bpf_struct_ops_map *st_map;
+	u32 i;
+	int ret;
+
+	kvalue = container_of(kdata, struct bpf_struct_ops_value, data);
+	st_map = container_of(kvalue, struct bpf_struct_ops_map, kvalue);
+
+	for (i = 0; i < st_map->funcs_cnt; i++) {
+		if (!st_map->links[i])
+			continue;
+		ret = cb(st_map->links[i]->prog, data);
+		if (ret)
+			return ret;
+	}
+	return 0;
+}
+EXPORT_SYMBOL_GPL(bpf_struct_ops_for_each_prog);
+
 static bool bpf_struct_ops_valid_to_reg(struct bpf_map *map)
 {
 	struct bpf_struct_ops_map *st_map = (struct bpf_struct_ops_map *)map;
-- 
2.54.0



^ permalink raw reply related

* [PATCH 2/8] bpf: Recover arena kernel faults with scratch page
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

From: Kumar Kartikeya Dwivedi <memxor@gmail.com>

BPF arena usage is becoming more prevalent, but kernel <-> BPF communication
over arena memory is awkward today. Data has to be staged through a trusted
kernel pointer with extra code and copying on the BPF side. While reads
through arena pointers can use a fault-safe helper, writes don't have a good
solution. The in-line alternative would need instruction emulation or asm
fixup labels.

Enable direct kernel-side reads and writes within GUARD_SZ / 2 of any
handed-in arena pointer, without bounds checking. A per-arena scratch page
is installed by the arch fault path into empty arena kernel PTEs - x86 from
page_fault_oops() for not-present faults, arm64 from __do_kernel_fault() for
translation faults, both after the existing exception-table and KFENCE
handling. The faulting instruction retries and the access is also reported
through the program's BPF stream, preserving error reporting.

bpf_prog_find_from_stack() resolves the current BPF program (and its arena)
from the kernel stack - no new bpf_run_ctx state is added. Recovery covers
the 4 GiB arena plus the upper half-guard (GUARD_SZ / 2). The lower
half-guard is excluded because well-behaved kfuncs only access forward from
arena pointers. The kfunc-author contract - access at most GUARD_SZ / 2 past
a handed-in pointer - is documented in Documentation/bpf/kfuncs.rst.

The install is lock-free via ptep_try_set(). On race-loss the winning
installer's PTE is already valid, so the access retry succeeds. The arena
clear path uses ptep_get_and_clear() so installer and clearer race through
atomic accessors. No flush_tlb_kernel_range() afterwards. Stale "not mapped"
entries just cause one extra re-fault, cheaper than a global IPI on every
install.

Scratch exists only to keep the kernel from oopsing on an in-line arena
access. Its presence at a PTE means the BPF program has already
malfunctioned, and the violation is reported through the program's BPF
stream. The only requirement for behavior on a scratched PTE is that the
kernel doesn't crash. In particular, any user-side access through such a PTE
may segfault. The shared scratch page is freed once during map destruction.

BPF instruction faults continue to use the existing JIT exception-table
path. This patch changes only the kernel-text fault path. No UAPI flag is
added. The new behavior is the default.

v2: Use ptep_get_and_clear() in apply_range_clear_cb(). (David)

Suggested-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: David Hildenbrand <david@kernel.org>
---
 Documentation/bpf/kfuncs.rst |  14 +++
 arch/arm64/mm/fault.c        |  10 +-
 arch/x86/mm/fault.c          |  12 ++-
 include/linux/bpf.h          |   1 +
 include/linux/bpf_defs.h     |  11 +++
 kernel/bpf/arena.c           | 177 +++++++++++++++++++++++++++--------
 kernel/bpf/core.c            |   5 +
 7 files changed, 183 insertions(+), 47 deletions(-)
 create mode 100644 include/linux/bpf_defs.h

diff --git a/Documentation/bpf/kfuncs.rst b/Documentation/bpf/kfuncs.rst
index 75e6c078e0e7..6d497e720998 100644
--- a/Documentation/bpf/kfuncs.rst
+++ b/Documentation/bpf/kfuncs.rst
@@ -462,6 +462,20 @@ In order to accommodate such requirements, the verifier will enforce strict
 PTR_TO_BTF_ID type matching if two types have the exact same name, with one
 being suffixed with ``___init``.
 
+2.8 Accessing arena memory through kfunc arguments
+--------------------------------------------------
+
+A read or write at any address inside an arena does not oops the kernel.
+Unallocated arena pages are lazily backed by a scratch page and the
+access is reported through the program's BPF stream as an error. Only
+the BPF program's correctness is affected; the kernel itself remains
+intact.
+
+The arena is followed by a ``GUARD_SZ / 2`` (32 KiB) guard region that
+is also covered by this recovery. A kfunc handed an arena pointer may
+therefore access up to ``GUARD_SZ / 2`` past it without bounds-checking
+against the arena. Larger accesses must verify the range explicitly.
+
 .. _BPF_kfunc_lifecycle_expectations:
 
 3. kfunc lifecycle expectations
diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 920a8b244d59..0d58d667fcd8 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -9,6 +9,7 @@
 
 #include <linux/acpi.h>
 #include <linux/bitfield.h>
+#include <linux/bpf_defs.h>
 #include <linux/extable.h>
 #include <linux/kfence.h>
 #include <linux/signal.h>
@@ -416,9 +417,12 @@ static void __do_kernel_fault(unsigned long addr, unsigned long esr,
 	} else if (addr < PAGE_SIZE) {
 		msg = "NULL pointer dereference";
 	} else {
-		if (esr_fsc_is_translation_fault(esr) &&
-		    kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
-			return;
+		if (esr_fsc_is_translation_fault(esr)) {
+			if (kfence_handle_page_fault(addr, esr & ESR_ELx_WNR, regs))
+				return;
+			if (bpf_arena_handle_page_fault(addr, esr & ESR_ELx_WNR, regs->pc))
+				return;
+		}
 
 		msg = "paging request";
 	}
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f0e77e084482..b0f103ddbd23 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -8,6 +8,7 @@
 #include <linux/sched/task_stack.h>	/* task_stack_*(), ...		*/
 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
 #include <linux/memblock.h>		/* max_low_pfn			*/
+#include <linux/bpf_defs.h>		/* bpf_arena_handle_page_fault	*/
 #include <linux/kfence.h>		/* kfence_handle_page_fault	*/
 #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
@@ -688,10 +689,13 @@ page_fault_oops(struct pt_regs *regs, unsigned long error_code,
 	if (IS_ENABLED(CONFIG_EFI))
 		efi_crash_gracefully_on_page_fault(address);
 
-	/* Only not-present faults should be handled by KFENCE. */
-	if (!(error_code & X86_PF_PROT) &&
-	    kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
-		return;
+	/* Only not-present faults should be handled by KFENCE or BPF arena. */
+	if (!(error_code & X86_PF_PROT)) {
+		if (kfence_handle_page_fault(address, error_code & X86_PF_WRITE, regs))
+			return;
+		if (bpf_arena_handle_page_fault(address, error_code & X86_PF_WRITE, regs->ip))
+			return;
+	}
 
 oops:
 	/*
diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 0136a108d083..831996c411cf 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -6,6 +6,7 @@
 
 #include <uapi/linux/bpf.h>
 #include <uapi/linux/filter.h>
+#include <linux/bpf_defs.h>
 
 #include <crypto/sha2.h>
 #include <linux/workqueue.h>
diff --git a/include/linux/bpf_defs.h b/include/linux/bpf_defs.h
new file mode 100644
index 000000000000..d98e033b8c0b
--- /dev/null
+++ b/include/linux/bpf_defs.h
@@ -0,0 +1,11 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Subset of bpf.h declarations, split out so files that need only these
+ * declarations can avoid bpf.h's full include cost.
+ */
+#ifndef _LINUX_BPF_DEFS_H
+#define _LINUX_BPF_DEFS_H
+
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip);
+
+#endif /* _LINUX_BPF_DEFS_H */
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 08d008cc471e..1c0b87ecc817 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -53,6 +53,7 @@ struct bpf_arena {
 	u64 user_vm_start;
 	u64 user_vm_end;
 	struct vm_struct *kern_vm;
+	struct page *scratch_page;
 	struct range_tree rt;
 	/* protects rt */
 	rqspinlock_t spinlock;
@@ -118,6 +119,11 @@ struct apply_range_data {
 	int i;
 };
 
+struct clear_range_data {
+	struct llist_head *free_pages;
+	struct page *scratch_page;
+};
+
 static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
 {
 	struct apply_range_data *d = data;
@@ -144,33 +150,59 @@ static void flush_vmap_cache(unsigned long start, unsigned long size)
 	flush_cache_vmap(start, start + size);
 }
 
-static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
 {
+	struct clear_range_data *d = data;
 	pte_t old_pte;
 	struct page *page;
 
-	/* sanity check */
-	old_pte = ptep_get(pte);
+	/*
+	 * Pairs with ptep_try_set() in the kernel-fault scratch installer.
+	 * Both sides must be atomic.
+	 */
+	old_pte = ptep_get_and_clear(&init_mm, addr, pte);
 	if (pte_none(old_pte) || !pte_present(old_pte))
-		return 0; /* nothing to do */
+		return 0;
 
 	page = pte_page(old_pte);
 	if (WARN_ON_ONCE(!page))
 		return -EINVAL;
 
-	pte_clear(&init_mm, addr, pte);
+	/*
+	 * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr
+	 * scratches its PTE. A later bpf_arena_free_pages() over that range walks
+	 * here. Without the skip, scratch_page would be freed.
+	 */
+	if (page == d->scratch_page)
+		return 0;
+
+	__llist_add(&page->pcp_llist, d->free_pages);
+	return 0;
+}
 
-	/* Add page to the list so it is freed later */
-	if (free_pages)
-		__llist_add(&page->pcp_llist, free_pages);
+static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data)
+{
+	struct page *scratch_page = data;
 
+	if (!pte_none(ptep_get(pte)))
+		return 0;
+	/*
+	 * Best-effort install. ptep_try_set() returns false only if another
+	 * installer (real allocation or concurrent fault) won the cmpxchg.
+	 * Their PTE is already valid, so the access retry succeeds.
+	 *
+	 * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just
+	 * cause one extra re-fault through this same path.
+	 */
+	ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL));
 	return 0;
 }
 
 static int populate_pgtable_except_pte(struct bpf_arena *arena)
 {
+	/* Populate intermediates for the recovery range (4 GiB + upper half-guard). */
 	return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
-				   KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+				   SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL);
 }
 
 static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
@@ -221,22 +253,29 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 	init_irq_work(&arena->free_irq, arena_free_irq);
 	INIT_WORK(&arena->free_work, arena_free_worker);
 	bpf_map_init_from_attr(&arena->map, attr);
+
+	err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page);
+	if (err)
+		goto err_free_arena;
+
 	range_tree_init(&arena->rt);
 	err = range_tree_set(&arena->rt, 0, attr->max_entries);
-	if (err) {
-		bpf_map_area_free(arena);
-		goto err;
-	}
+	if (err)
+		goto err_free_scratch;
 	mutex_init(&arena->lock);
 	raw_res_spin_lock_init(&arena->spinlock);
 	err = populate_pgtable_except_pte(arena);
-	if (err) {
-		range_tree_destroy(&arena->rt);
-		bpf_map_area_free(arena);
-		goto err;
-	}
+	if (err)
+		goto err_destroy_rt;
 
 	return &arena->map;
+
+err_destroy_rt:
+	range_tree_destroy(&arena->rt);
+err_free_scratch:
+	__free_page(arena->scratch_page);
+err_free_arena:
+	bpf_map_area_free(arena);
 err:
 	free_vm_area(kern_vm);
 	return ERR_PTR(err);
@@ -244,6 +283,7 @@ static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
 
 static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
 {
+	struct bpf_arena *arena = data;
 	struct page *page;
 	pte_t pte;
 
@@ -251,6 +291,12 @@ static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
 	if (!pte_present(pte)) /* sanity check */
 		return 0;
 	page = pte_page(pte);
+	/*
+	 * Skip the scratch page. The walk is page-table-driven, not range-tree-driven,
+	 * so it can visit scratch PTEs at uaddrs the BPF program never allocated.
+	 */
+	if (page == arena->scratch_page)
+		return 0;
 	/*
 	 * We do not update pte here:
 	 * 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
@@ -286,9 +332,10 @@ static void arena_map_free(struct bpf_map *map)
 	 * free those pages.
 	 */
 	apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
-				     KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+				     SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
 	free_vm_area(arena->kern_vm);
 	range_tree_destroy(&arena->rt);
+	__free_page(arena->scratch_page);
 	bpf_map_area_free(arena);
 }
 
@@ -374,33 +421,37 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 		return VM_FAULT_RETRY;
 
 	page = vmalloc_to_page((void *)kaddr);
-	if (page)
+	if (page) {
+		if (page == arena->scratch_page)
+			/* BPF triggered scratch here; don't lazy-alloc over it */
+			goto out_sigsegv;
 		/* already have a page vmap-ed */
 		goto out;
+	}
 
 	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
 
 	if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
 		/* User space requested to segfault when page is not allocated by bpf prog */
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 
 	ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
 	if (ret)
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 
 	struct apply_range_data data = { .pages = &page, .i = 0 };
 	/* Account into memcg of the process that created bpf_arena */
 	ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 	}
 
 	ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
 	if (ret) {
 		range_tree_set(&arena->rt, vmf->pgoff, 1);
 		free_pages_nolock(page, 0);
-		goto out_unlock_sigsegv;
+		goto out_sigsegv_memcg;
 	}
 	flush_vmap_cache(kaddr, PAGE_SIZE);
 	bpf_map_memcg_exit(old_memcg, new_memcg);
@@ -409,8 +460,9 @@ static vm_fault_t arena_vm_fault(struct vm_fault *vmf)
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	vmf->page = page;
 	return 0;
-out_unlock_sigsegv:
+out_sigsegv_memcg:
 	bpf_map_memcg_exit(old_memcg, new_memcg);
+out_sigsegv:
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
 	return VM_FAULT_SIGSEGV;
 }
@@ -668,6 +720,7 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 	struct llist_head free_pages;
 	struct llist_node *pos, *t;
 	struct arena_free_span *s;
+	struct clear_range_data cdata;
 	unsigned long flags;
 	int ret = 0;
 
@@ -696,9 +749,11 @@ static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt,
 	range_tree_set(&arena->rt, pgoff, page_cnt);
 
 	init_llist_head(&free_pages);
+	cdata.free_pages = &free_pages;
+	cdata.scratch_page = arena->scratch_page;
 	/* clear ptes and collect struct pages */
 	apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
-				     apply_range_clear_cb, &free_pages);
+				     apply_range_clear_cb, &cdata);
 
 	/* drop the lock to do the tlb flush and zap pages */
 	raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
@@ -788,6 +843,7 @@ static void arena_free_worker(struct work_struct *work)
 	struct arena_free_span *s;
 	u64 arena_vm_start, user_vm_start;
 	struct llist_head free_pages;
+	struct clear_range_data cdata;
 	struct page *page;
 	unsigned long full_uaddr;
 	long kaddr, page_cnt, pgoff;
@@ -801,6 +857,8 @@ static void arena_free_worker(struct work_struct *work)
 	bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
 
 	init_llist_head(&free_pages);
+	cdata.free_pages = &free_pages;
+	cdata.scratch_page = arena->scratch_page;
 	arena_vm_start = bpf_arena_get_kern_vm_start(arena);
 	user_vm_start = bpf_arena_get_user_vm_start(arena);
 
@@ -813,7 +871,7 @@ static void arena_free_worker(struct work_struct *work)
 
 		/* clear ptes and collect pages in free_pages llist */
 		apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
-					     apply_range_clear_cb, &free_pages);
+					     apply_range_clear_cb, &cdata);
 
 		range_tree_set(&arena->rt, pgoff, page_cnt);
 	}
@@ -928,23 +986,12 @@ static int __init kfunc_init(void)
 }
 late_initcall(kfunc_init);
 
-void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write,
+					      unsigned long addr, unsigned long fault_ip)
 {
 	struct bpf_stream_stage ss;
-	struct bpf_prog *prog;
 	u64 user_vm_start;
 
-	/*
-	 * The RCU read lock is held to safely traverse the latch tree, but we
-	 * don't need its protection when accessing the prog, since it will not
-	 * disappear while we are handling the fault.
-	 */
-	rcu_read_lock();
-	prog = bpf_prog_ksym_find(fault_ip);
-	rcu_read_unlock();
-	if (!prog)
-		return;
-
 	/* Use main prog for stream access */
 	prog = prog->aux->main_prog_aux->prog;
 
@@ -957,3 +1004,53 @@ void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned lo
 		bpf_stream_dump_stack(ss);
 	}));
 }
+
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip)
+{
+	struct bpf_arena *arena;
+	struct bpf_prog *prog;
+	unsigned long kbase;
+	unsigned long page_addr = addr & PAGE_MASK;
+
+	prog = bpf_prog_find_from_stack();
+	if (!prog)
+		return false;
+
+	arena = prog->aux->arena;
+	/* a prog not using arena may be on stack, so arena can be NULL */
+	if (!arena)
+		return false;
+
+	kbase = bpf_arena_get_kern_vm_start(arena);
+
+	/*
+	 * Recovery covers the 4 GiB mappable band plus the upper half-guard.
+	 * Lower guard is unreachable from kfuncs; an address there indicates
+	 * a different bug class - leave it to the regular kernel oops path.
+	 */
+	if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2)
+		return false;
+
+	apply_to_page_range(&init_mm, page_addr, PAGE_SIZE,
+			    apply_range_set_scratch_cb, arena->scratch_page);
+	flush_vmap_cache(page_addr, PAGE_SIZE);
+	__bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip);
+	return true;
+}
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+	struct bpf_prog *prog;
+
+	/*
+	 * The RCU read lock is held to safely traverse the latch tree, but we
+	 * don't need its protection when accessing the prog, since it will not
+	 * disappear while we are handling the fault.
+	 */
+	rcu_read_lock();
+	prog = bpf_prog_ksym_find(fault_ip);
+	rcu_read_unlock();
+	if (!prog)
+		return;
+	__bpf_prog_report_arena_violation(prog, write, addr, fault_ip);
+}
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 066b86e7233c..fa368d8920d9 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -3290,6 +3290,11 @@ __weak u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena)
 {
 	return 0;
 }
+__weak bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write,
+					unsigned long fault_ip)
+{
+	return false;
+}
 
 #ifdef CONFIG_BPF_SYSCALL
 static int __init bpf_global_ma_init(void)
-- 
2.54.0



^ permalink raw reply related

* [PATCH 6/8] sched_ext: Require an arena for cid-form schedulers
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

Upcoming patches will let the kernel place arena-resident scratch shared
with the BPF program (e.g. per-CPU set_cmask cmask) so the BPF side can
dereference it directly via __arena pointers, replacing the current
cmask_copy_from_kernel() probe-read loop. That requires each cid-form
scheduler to expose its arena to the kernel. Kernel- side accesses are
recovered by the per-arena scratch-page mechanism.

bpf_scx_reg_cid() walks the struct_ops member progs via
bpf_struct_ops_for_each_prog() and reads each prog's arena via
bpf_prog_arena(). The verifier enforces one arena per program, so each
member prog contributes at most one arena. All non-NULL contributions must
match and at least one member prog must use an arena. The map ref is held on
scx_sched and dropped on sched destroy. cpu-form schedulers (bpf_scx_reg)
are unchanged - no arena requirement.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c          | 56 ++++++++++++++++++++++++++++++++++++-
 kernel/sched/ext_internal.h |  8 ++++++
 2 files changed, 63 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 9c458552d14f..56f94ac32ba0 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5003,6 +5003,8 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
+	if (sch->arena_map)
+		bpf_map_put(sch->arena_map);
 	kfree(sch);
 }
 
@@ -6746,6 +6748,7 @@ struct scx_enable_cmd {
 		struct sched_ext_ops_cid	*ops_cid;
 	};
 	bool			is_cid_type;
+	struct bpf_map		*arena_map;	/* arena ref to transfer to sch */
 	int			ret;
 };
 
@@ -6913,6 +6916,15 @@ static struct scx_sched *scx_alloc_and_add_sched(struct scx_enable_cmd *cmd,
 		return ERR_PTR(ret);
 	}
 #endif	/* CONFIG_EXT_SUB_SCHED */
+
+	/*
+	 * Consume the arena_map ref bpf_scx_reg_cid() took. Defer to here so
+	 * earlier failure paths leave cmd->arena_map set and bpf_scx_reg_cid
+	 * drops the ref. After this point, sch owns the ref and any cleanup
+	 * runs through scx_sched_free_rcu_work() which puts it.
+	 */
+	sch->arena_map = cmd->arena_map;
+	cmd->arena_map = NULL;
 	return sch;
 
 #ifdef CONFIG_EXT_SUB_SCHED
@@ -7898,11 +7910,53 @@ static int bpf_scx_reg(void *kdata, struct bpf_link *link)
 	return scx_enable(&cmd, link);
 }
 
+struct scx_arena_scan {
+	struct bpf_map	*arena;
+	int		err;
+};
+
+/*
+ * The verifier enforces one arena per BPF program, so each struct_ops
+ * member prog contributes at most one arena via bpf_prog_arena().
+ * Require all non-NULL contributions to match.
+ */
+static int scx_arena_scan_prog(struct bpf_prog *prog, void *data)
+{
+	struct scx_arena_scan *s = data;
+	struct bpf_map *arena = bpf_prog_arena(prog);
+
+	if (!arena)
+		return 0;
+	if (s->arena && s->arena != arena) {
+		s->err = -EINVAL;
+		return 1;
+	}
+	s->arena = arena;
+	return 0;
+}
+
 static int bpf_scx_reg_cid(void *kdata, struct bpf_link *link)
 {
 	struct scx_enable_cmd cmd = { .ops_cid = kdata, .is_cid_type = true };
+	struct scx_arena_scan scan = {};
+	int ret;
 
-	return scx_enable(&cmd, link);
+	bpf_struct_ops_for_each_prog(kdata, scx_arena_scan_prog, &scan);
+	if (scan.err) {
+		pr_err("sched_ext: cid-form scheduler uses multiple arena maps\n");
+		return scan.err;
+	}
+	if (!scan.arena) {
+		pr_err("sched_ext: cid-form scheduler must use a BPF arena map\n");
+		return -EINVAL;
+	}
+
+	bpf_map_inc(scan.arena);
+	cmd.arena_map = scan.arena;
+	ret = scx_enable(&cmd, link);
+	if (cmd.arena_map)		/* not consumed by scx_alloc_and_add_sched() */
+		bpf_map_put(cmd.arena_map);
+	return ret;
 }
 
 static void bpf_scx_unreg(void *kdata, struct bpf_link *link)
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index 7258aea94b9f..d40cfd29ddaa 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1111,6 +1111,14 @@ struct scx_sched {
 		struct sched_ext_ops_cid	ops_cid;
 	};
 	bool			is_cid_type;	/* true if registered via bpf_sched_ext_ops_cid */
+
+	/*
+	 * Arena map auto-discovered from member progs at struct_ops attach.
+	 * cid-form schedulers must use exactly one arena across all member
+	 * progs. NULL on cpu-form.
+	 */
+	struct bpf_map		*arena_map;
+
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
 	/*
-- 
2.54.0



^ permalink raw reply related

* [PATCH 8/8] sched_ext: Convert ops.set_cmask() to arena-resident cmask
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

ops_cid.set_cmask() expects a cmask. The kernel couldn't write into the
arena, so it translated cpumask -> cmask in kernel memory and passed the
result as a trusted pointer. The BPF cmask helpers all operate on arena
cmasks though, so the BPF side had to word-by-word probe-read the kernel
cmask into an arena cmask via cmask_copy_from_kernel() before any helper
could touch it. It works, but is clumsy.

With direct kernel-side arena access now in place, build the cmask in the
arena. The kernel writes to it through the kern_va side of the dual mapping;
BPF directly dereferences it via an __arena pointer like any other arena
struct.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/ext.c                    | 68 +++++++++++++++++++++++++--
 kernel/sched/ext_cid.c                | 20 +-------
 kernel/sched/ext_internal.h           | 10 +++-
 tools/sched_ext/include/scx/cid.bpf.h | 52 --------------------
 tools/sched_ext/scx_qmap.bpf.c        |  5 +-
 5 files changed, 75 insertions(+), 80 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index fb91079c1244..94562e3350c6 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -621,11 +621,16 @@ static inline void scx_call_op_set_cpumask(struct scx_sched *sch, struct rq *rq,
 		update_locked_rq(rq);
 
 	if (scx_is_cid_type()) {
-		struct scx_cmask *cmask = this_cpu_ptr(scx_set_cmask_scratch);
-
-		lockdep_assert_irqs_disabled();
-		scx_cpumask_to_cmask(cpumask, cmask);
-		sch->ops_cid.set_cmask(task, cmask);
+		struct scx_cmask *kern_va = *this_cpu_ptr(sch->set_cmask_scratch);
+		unsigned long uaddr = (unsigned long)kern_va -
+			bpf_arena_map_kern_vm_start(sch->arena_map);
+		/*
+		 * Build the per-CPU arena cmask and hand BPF the uaddr. Caller
+		 * holds the rq lock with IRQs disabled, which makes us the sole
+		 * user of the scratch area.
+		 */
+		scx_cpumask_to_cmask(cpumask, kern_va);
+		sch->ops_cid.set_cmask(task, (struct scx_cmask *)uaddr);
 	} else {
 		sch->ops.set_cpumask(task, cpumask);
 	}
@@ -4949,6 +4954,48 @@ static const struct attribute_group scx_global_attr_group = {
 static void free_pnode(struct scx_sched_pnode *pnode);
 static void free_exit_info(struct scx_exit_info *ei);
 
+static s32 scx_set_cmask_scratch_alloc(struct scx_sched *sch)
+{
+	size_t size = struct_size_t(struct scx_cmask, bits,
+				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
+	int cpu;
+
+	if (!sch->is_cid_type || !sch->arena_pool)
+		return 0;
+
+	sch->set_cmask_scratch = alloc_percpu(struct scx_cmask *);
+	if (!sch->set_cmask_scratch)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+		*slot = scx_arena_alloc(sch, size);
+		if (!*slot)
+			return -ENOMEM;
+		scx_cmask_init(*slot, 0, num_possible_cpus());
+	}
+	return 0;
+}
+
+static void scx_set_cmask_scratch_free(struct scx_sched *sch)
+{
+	size_t size = struct_size_t(struct scx_cmask, bits,
+				    SCX_CMASK_NR_WORDS(num_possible_cpus()));
+	int cpu;
+
+	if (!sch->set_cmask_scratch)
+		return;
+
+	for_each_possible_cpu(cpu) {
+		struct scx_cmask **slot = per_cpu_ptr(sch->set_cmask_scratch, cpu);
+
+		scx_arena_free(sch, *slot, size);
+	}
+	free_percpu(sch->set_cmask_scratch);
+	sch->set_cmask_scratch = NULL;
+}
+
 static void scx_sched_free_rcu_work(struct work_struct *work)
 {
 	struct rcu_work *rcu_work = to_rcu_work(work);
@@ -5003,6 +5050,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
+	scx_set_cmask_scratch_free(sch);
 	scx_arena_pool_destroy(sch);
 	if (sch->arena_map)
 		bpf_map_put(sch->arena_map);
@@ -7162,6 +7210,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 		goto err_disable;
 	}
 
+	ret = scx_set_cmask_scratch_alloc(sch);
+	if (ret) {
+		cpus_read_unlock();
+		goto err_disable;
+	}
+
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
 		if (((void (**)(void))ops)[i])
 			set_bit(i, sch->has_op);
@@ -7484,6 +7538,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 	if (ret)
 		goto err_disable;
 
+	ret = scx_set_cmask_scratch_alloc(sch);
+	if (ret)
+		goto err_disable;
+
 	if (validate_ops(sch, ops))
 		goto err_disable;
 
diff --git a/kernel/sched/ext_cid.c b/kernel/sched/ext_cid.c
index 0c91b951fd33..808c6390da5a 100644
--- a/kernel/sched/ext_cid.c
+++ b/kernel/sched/ext_cid.c
@@ -7,14 +7,6 @@
  */
 #include <linux/cacheinfo.h>
 
-/*
- * Per-cpu scratch cmask used by scx_call_op_set_cpumask() to synthesize a
- * cmask from a cpumask. Allocated alongside the cid arrays on first enable
- * and never freed. Sized to the full cid space. Caller holds rq lock so
- * this_cpu_ptr is safe.
- */
-struct scx_cmask __percpu *scx_set_cmask_scratch;
-
 /*
  * cid tables.
  *
@@ -54,8 +46,6 @@ static s32 scx_cid_arrays_alloc(void)
 	u32 npossible = num_possible_cpus();
 	s16 *cid_to_cpu, *cpu_to_cid;
 	struct scx_cid_topo *cid_topo;
-	struct scx_cmask __percpu *set_cmask_scratch;
-	s32 cpu;
 
 	if (scx_cid_to_cpu_tbl)
 		return 0;
@@ -63,25 +53,17 @@ static s32 scx_cid_arrays_alloc(void)
 	cid_to_cpu = kzalloc_objs(*scx_cid_to_cpu_tbl, npossible, GFP_KERNEL);
 	cpu_to_cid = kzalloc_objs(*scx_cpu_to_cid_tbl, nr_cpu_ids, GFP_KERNEL);
 	cid_topo = kmalloc_objs(*scx_cid_topo, npossible, GFP_KERNEL);
-	set_cmask_scratch = __alloc_percpu(struct_size(set_cmask_scratch, bits,
-						       SCX_CMASK_NR_WORDS(npossible)),
-					   sizeof(u64));
 
-	if (!cid_to_cpu || !cpu_to_cid || !cid_topo || !set_cmask_scratch) {
+	if (!cid_to_cpu || !cpu_to_cid || !cid_topo) {
 		kfree(cid_to_cpu);
 		kfree(cpu_to_cid);
 		kfree(cid_topo);
-		free_percpu(set_cmask_scratch);
 		return -ENOMEM;
 	}
 
 	WRITE_ONCE(scx_cid_to_cpu_tbl, cid_to_cpu);
 	WRITE_ONCE(scx_cpu_to_cid_tbl, cpu_to_cid);
 	WRITE_ONCE(scx_cid_topo, cid_topo);
-	for_each_possible_cpu(cpu)
-		scx_cmask_init(per_cpu_ptr(set_cmask_scratch, cpu),
-			       0, npossible);
-	WRITE_ONCE(scx_set_cmask_scratch, set_cmask_scratch);
 	return 0;
 }
 
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index ff7e882bd67a..9bb65367f510 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1124,6 +1124,14 @@ struct scx_sched {
 	struct bpf_map		*arena_map;
 	struct gen_pool		*arena_pool;
 
+	/*
+	 * Per-CPU arena cmask used by scx_call_op_set_cpumask() to hand a cmask
+	 * to ops_cid.set_cmask(). The kernel writes through the stored kern_va;
+	 * the BPF-arena uaddr handed to BPF is recovered by subtracting the
+	 * arena's kern_vm_start.
+	 */
+	struct scx_cmask * __percpu *set_cmask_scratch;
+
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
 	/*
@@ -1480,8 +1488,6 @@ enum scx_ops_state {
 extern struct scx_sched __rcu *scx_root;
 DECLARE_PER_CPU(struct rq *, scx_locked_rq_state);
 
-extern struct scx_cmask __percpu *scx_set_cmask_scratch;
-
 /*
  * True when the currently loaded scheduler hierarchy is cid-form. All scheds
  * in a hierarchy share one form, so this single key tells callsites which
diff --git a/tools/sched_ext/include/scx/cid.bpf.h b/tools/sched_ext/include/scx/cid.bpf.h
index e281c88fa824..70f2a3829af4 100644
--- a/tools/sched_ext/include/scx/cid.bpf.h
+++ b/tools/sched_ext/include/scx/cid.bpf.h
@@ -675,56 +675,4 @@ static __always_inline void cmask_from_cpumask(struct scx_cmask __arena *m,
 	}
 }
 
-/**
- * cmask_copy_from_kernel - probe-read a kernel cmask into an arena cmask
- * @dst: arena cmask to fill; must have @dst->base == 0 and be sized for @src.
- * @src: kernel-memory cmask (e.g. ops.set_cmask() arg); @src->base must be 0.
- *
- * Word-for-word copy; @src and @dst must share base 0 alignment. Triggers
- * scx_bpf_error() on probe failure or precondition violation.
- */
-static __always_inline void cmask_copy_from_kernel(struct scx_cmask __arena *dst,
-						   const struct scx_cmask *src)
-{
-	u32 base = 0, nr_cids = 0, nr_words, wi;
-
-	if (dst->base != 0) {
-		scx_bpf_error("cmask_copy_from_kernel requires dst->base == 0");
-		return;
-	}
-
-	if (bpf_probe_read_kernel(&base, sizeof(base), &src->base)) {
-		scx_bpf_error("probe-read cmask->base failed");
-		return;
-	}
-	if (base != 0) {
-		scx_bpf_error("cmask_copy_from_kernel requires src->base == 0");
-		return;
-	}
-
-	if (bpf_probe_read_kernel(&nr_cids, sizeof(nr_cids), &src->nr_cids)) {
-		scx_bpf_error("probe-read cmask->nr_cids failed");
-		return;
-	}
-
-	if (nr_cids > dst->nr_cids) {
-		scx_bpf_error("src cmask nr_cids=%u exceeds dst nr_cids=%u",
-			      nr_cids, dst->nr_cids);
-		return;
-	}
-
-	nr_words = CMASK_NR_WORDS(nr_cids);
-	cmask_zero(dst);
-	bpf_for(wi, 0, CMASK_MAX_WORDS) {
-		u64 word = 0;
-		if (wi >= nr_words)
-			break;
-		if (bpf_probe_read_kernel(&word, sizeof(u64), &src->bits[wi])) {
-			scx_bpf_error("probe-read cmask->bits[%u] failed", wi);
-			return;
-		}
-		dst->bits[wi] = word;
-	}
-}
-
 #endif /* __SCX_CID_BPF_H */
diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c
index 7e77f22674ea..8a2d6a8ebd8e 100644
--- a/tools/sched_ext/scx_qmap.bpf.c
+++ b/tools/sched_ext/scx_qmap.bpf.c
@@ -919,14 +919,15 @@ void BPF_STRUCT_OPS(qmap_update_idle, s32 cid, bool idle)
 }
 
 void BPF_STRUCT_OPS(qmap_set_cmask, struct task_struct *p,
-		    const struct scx_cmask *cmask)
+		    const struct scx_cmask *cmask_in)
 {
+	struct scx_cmask __arena *cmask = (struct scx_cmask __arena *)(long)cmask_in;
 	task_ctx_t *taskc;
 
 	taskc = lookup_task_ctx(p);
 	if (!taskc)
 		return;
-	cmask_copy_from_kernel(&taskc->cpus_allowed, cmask);
+	cmask_copy(&taskc->cpus_allowed, cmask);
 }
 
 struct monitor_timer {
-- 
2.54.0



^ permalink raw reply related

* [PATCH 5/8] bpf/arena: Add bpf_arena_map_kern_vm_start() and bpf_prog_arena()
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

struct bpf_arena is opaque to callers outside arena.c. Add two helpers
for struct_ops subsystems that need to reach into an arena:

  bpf_arena_map_kern_vm_start(struct bpf_map *map)
    returns @map's kern_vm_start. A sched_ext follow-up needs this
    to translate kern_va <-> uaddr.

  bpf_prog_arena(struct bpf_prog *prog)
    returns the bpf_map of the arena referenced by @prog (NULL if
    @prog references no arena). The verifier enforces at most one
    arena per program. Used by struct_ops callers that auto-discover
    an arena from a member prog and need to take a map reference.

Suggested-by: Kumar Kartikeya Dwivedi <memxor@gmail.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 include/linux/bpf.h |  2 ++
 kernel/bpf/arena.c  | 26 ++++++++++++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 5b99d786e98c..e1ba57c10aaa 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -618,6 +618,8 @@ void bpf_rb_root_free(const struct btf_field *field, void *rb_root,
 		      struct bpf_spin_lock *spin_lock);
 u64 bpf_arena_get_kern_vm_start(struct bpf_arena *arena);
 u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena);
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map);
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog);
 int bpf_obj_name_cpy(char *dst, const char *src, unsigned int size);
 
 struct bpf_offload_dev;
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index a811cf6170fa..51b9ae36feb6 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -84,6 +84,32 @@ u64 bpf_arena_get_user_vm_start(struct bpf_arena *arena)
 	return arena ? arena->user_vm_start : 0;
 }
 
+/**
+ * bpf_arena_map_kern_vm_start - kern_vm_start lookup by struct bpf_map *
+ * @map: a BPF_MAP_TYPE_ARENA map
+ *
+ * Return @map's kern_vm_start.
+ */
+u64 bpf_arena_map_kern_vm_start(struct bpf_map *map)
+{
+	return bpf_arena_get_kern_vm_start(container_of(map, struct bpf_arena, map));
+}
+
+/**
+ * bpf_prog_arena - return the bpf_map of the arena referenced by @prog
+ * @prog: a loaded BPF program
+ *
+ * The verifier enforces at most one arena per program and stores it in
+ * prog->aux->arena. Return that arena's underlying bpf_map, or NULL if
+ * @prog does not reference an arena.
+ */
+struct bpf_map *bpf_prog_arena(struct bpf_prog *prog)
+{
+	struct bpf_arena *arena = prog->aux->arena;
+
+	return arena ? &arena->map : NULL;
+}
+
 static long arena_map_peek_elem(struct bpf_map *map, void *value)
 {
 	return -EOPNOTSUPP;
-- 
2.54.0



^ permalink raw reply related

* [PATCH 7/8] sched_ext: Sub-allocator over kernel-claimed BPF arena pages
From: Tejun Heo @ 2026-05-20 23:50 UTC (permalink / raw)
  To: David Vernet, Andrea Righi, Changwoo Min, Alexei Starovoitov,
	Andrii Nakryiko, Daniel Borkmann, Martin KaFai Lau,
	Kumar Kartikeya Dwivedi
  Cc: Peter Zijlstra, Catalin Marinas, Will Deacon, Thomas Gleixner,
	Ingo Molnar, Borislav Petkov, Dave Hansen, Andrew Morton,
	David Hildenbrand, Mike Rapoport, Emil Tsalapatis, sched-ext, bpf,
	x86, linux-arm-kernel, linux-mm, linux-kernel, Tejun Heo
In-Reply-To: <20260520235052.4180316-1-tj@kernel.org>

Build a per-scheduler sub-allocator on top of pages claimed from the BPF
arena registered in the previous patch. Subsequent kernel-managed
arena-resident structures (e.g. per-CPU set_cmask cmask) carve their storage
from this pool.

scx_arena_pool_init() creates a gen_pool. scx_arena_alloc() returns the
kernel VA. On exhaustion, the pool grows by claiming more pages via
bpf_arena_alloc_pages_sleepable(). Chunks are added at the kernel-side
mapping address; callers translate to the BPF-arena form themselves if
needed.

Allocations sleep (GFP_KERNEL) - they may grow the pool through vzalloc and
arena page allocation. All current consumers run from the enable path (after
ops.init() and the kernel-side arena auto-discovery, before validate_ops()),
where sleeping is fine.

scx_arena_pool_destroy() walks each chunk, returns outstanding ranges to the
gen_pool with gen_pool_free() and then calls gen_pool_destroy(). The
underlying arena pages are released when the arena map itself is torn down,
so the pool destroy doesn't free them explicitly.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched/build_policy.c |   4 ++
 kernel/sched/ext.c          |  11 ++++
 kernel/sched/ext_arena.c    | 127 ++++++++++++++++++++++++++++++++++++
 kernel/sched/ext_arena.h    |  18 +++++
 kernel/sched/ext_internal.h |   5 ++
 5 files changed, 165 insertions(+)
 create mode 100644 kernel/sched/ext_arena.c
 create mode 100644 kernel/sched/ext_arena.h

diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index 5e76c9177d54..067979a7b69e 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -59,12 +59,16 @@
 
 #ifdef CONFIG_SCHED_CLASS_EXT
 # include <linux/btf_ids.h>
+# include <linux/find.h>
+# include <linux/genalloc.h>
 # include "ext_types.h"
 # include "ext_internal.h"
 # include "ext_cid.h"
+# include "ext_arena.h"
 # include "ext_idle.h"
 # include "ext.c"
 # include "ext_cid.c"
+# include "ext_arena.c"
 # include "ext_idle.c"
 #endif
 
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 56f94ac32ba0..fb91079c1244 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -5003,6 +5003,7 @@ static void scx_sched_free_rcu_work(struct work_struct *work)
 
 	rhashtable_free_and_destroy(&sch->dsq_hash, NULL, NULL);
 	free_exit_info(sch->exit_info);
+	scx_arena_pool_destroy(sch);
 	if (sch->arena_map)
 		bpf_map_put(sch->arena_map);
 	kfree(sch);
@@ -7155,6 +7156,12 @@ static void scx_root_enable_workfn(struct kthread_work *work)
 		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
 	}
 
+	ret = scx_arena_pool_init(sch);
+	if (ret) {
+		cpus_read_unlock();
+		goto err_disable;
+	}
+
 	for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++)
 		if (((void (**)(void))ops)[i])
 			set_bit(i, sch->has_op);
@@ -7473,6 +7480,10 @@ static void scx_sub_enable_workfn(struct kthread_work *work)
 		sch->exit_info->flags |= SCX_EFLAG_INITIALIZED;
 	}
 
+	ret = scx_arena_pool_init(sch);
+	if (ret)
+		goto err_disable;
+
 	if (validate_ops(sch, ops))
 		goto err_disable;
 
diff --git a/kernel/sched/ext_arena.c b/kernel/sched/ext_arena.c
new file mode 100644
index 000000000000..53174033765d
--- /dev/null
+++ b/kernel/sched/ext_arena.c
@@ -0,0 +1,127 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * scx_arena_pool: kernel-side sub-allocator over BPF-arena pages.
+ *
+ * Each chunk added to @sch->arena_pool comes from one
+ * bpf_arena_alloc_pages_sleepable() call and is registered at the
+ * kernel-side mapping address. Callers translate to the BPF-arena form
+ * themselves if needed.
+ *
+ * Allocations grow the pool on demand. Underlying arena pages are released
+ * when the arena map itself is torn down.
+ *
+ * Copyright (c) 2026 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2026 Tejun Heo <tj@kernel.org>
+ */
+
+enum scx_arena_consts {
+	SCX_ARENA_MIN_ORDER		= 3,	/* 8-byte minimum sub-allocation */
+	SCX_ARENA_GROW_PAGES		= 4,	/* per growth */
+};
+
+s32 scx_arena_pool_init(struct scx_sched *sch)
+{
+	if (!sch->arena_map)
+		return 0;
+
+	sch->arena_pool = gen_pool_create(SCX_ARENA_MIN_ORDER, NUMA_NO_NODE);
+	if (!sch->arena_pool)
+		return -ENOMEM;
+	return 0;
+}
+
+static void scx_arena_clear_chunk(struct gen_pool *pool, struct gen_pool_chunk *chunk,
+				  void *data)
+{
+	int order = pool->min_alloc_order;
+	size_t chunk_sz = chunk->end_addr - chunk->start_addr + 1;
+	unsigned long end_bit = chunk_sz >> order;
+	unsigned long b, e;
+
+	for_each_set_bitrange(b, e, chunk->bits, end_bit)
+		gen_pool_free(pool, chunk->start_addr + (b << order),
+			      (e - b) << order);
+}
+
+/*
+ * Tear down the pool. Outstanding gen_pool allocations are freed via
+ * scx_arena_clear_chunk() so gen_pool_destroy() doesn't BUG. The underlying
+ * arena pages are released when the arena map itself is torn down.
+ */
+void scx_arena_pool_destroy(struct scx_sched *sch)
+{
+	if (!sch->arena_pool)
+		return;
+	gen_pool_for_each_chunk(sch->arena_pool, scx_arena_clear_chunk, NULL);
+	gen_pool_destroy(sch->arena_pool);
+	sch->arena_pool = NULL;
+}
+
+/*
+ * Grow the pool by @page_cnt pages. bpf_arena_alloc_pages_sleepable() and
+ * gen_pool_add() (which calls vzalloc(GFP_KERNEL)) require a sleepable
+ * context.
+ */
+static int scx_arena_grow(struct scx_sched *sch, u32 page_cnt)
+{
+	u64 kern_vm_start;
+	u32 uaddr32;
+	void *p;
+	int ret;
+
+	if (!sch->arena_map || !sch->arena_pool)
+		return -EINVAL;
+
+	p = bpf_arena_alloc_pages_sleepable(sch->arena_map, NULL,
+					    page_cnt, NUMA_NO_NODE, 0);
+	if (!p)
+		return -ENOMEM;
+
+	uaddr32 = (u32)(unsigned long)p;
+	kern_vm_start = bpf_arena_map_kern_vm_start(sch->arena_map);
+
+	ret = gen_pool_add(sch->arena_pool, kern_vm_start + uaddr32,
+			   page_cnt * PAGE_SIZE, NUMA_NO_NODE);
+	if (ret) {
+		bpf_arena_free_pages_non_sleepable(sch->arena_map, p, page_cnt);
+		return ret;
+	}
+	return 0;
+}
+
+/*
+ * Allocate @size bytes from the arena pool. Returns kernel VA on success, NULL
+ * on failure. May grow the pool via scx_arena_grow() which sleeps. Caller must
+ * be in a GFP_KERNEL context.
+ */
+void *scx_arena_alloc(struct scx_sched *sch, size_t size)
+{
+	unsigned long kern_va;
+	u32 page_cnt;
+
+	might_sleep();
+
+	if (!sch->arena_pool)
+		return NULL;
+
+	kern_va = gen_pool_alloc(sch->arena_pool, size);
+	if (!kern_va) {
+		page_cnt = max_t(u32, SCX_ARENA_GROW_PAGES,
+				 (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
+		if (scx_arena_grow(sch, page_cnt))
+			return NULL;
+		kern_va = gen_pool_alloc(sch->arena_pool, size);
+		if (!kern_va)
+			return NULL;
+	}
+
+	return (void *)kern_va;
+}
+
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size)
+{
+	if (sch->arena_pool && kern_va)
+		gen_pool_free(sch->arena_pool, (unsigned long)kern_va, size);
+}
diff --git a/kernel/sched/ext_arena.h b/kernel/sched/ext_arena.h
new file mode 100644
index 000000000000..4f3610160102
--- /dev/null
+++ b/kernel/sched/ext_arena.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst
+ *
+ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates.
+ * Copyright (c) 2025 Tejun Heo <tj@kernel.org>
+ */
+#ifndef _KERNEL_SCHED_EXT_ARENA_H
+#define _KERNEL_SCHED_EXT_ARENA_H
+
+struct scx_sched;
+
+s32 scx_arena_pool_init(struct scx_sched *sch);
+void scx_arena_pool_destroy(struct scx_sched *sch);
+void *scx_arena_alloc(struct scx_sched *sch, size_t size);
+void scx_arena_free(struct scx_sched *sch, void *kern_va, size_t size);
+
+#endif /* _KERNEL_SCHED_EXT_ARENA_H */
diff --git a/kernel/sched/ext_internal.h b/kernel/sched/ext_internal.h
index d40cfd29ddaa..ff7e882bd67a 100644
--- a/kernel/sched/ext_internal.h
+++ b/kernel/sched/ext_internal.h
@@ -1116,8 +1116,13 @@ struct scx_sched {
 	 * Arena map auto-discovered from member progs at struct_ops attach.
 	 * cid-form schedulers must use exactly one arena across all member
 	 * progs. NULL on cpu-form.
+	 *
+	 * @arena_pool sub-allocates @arena_map. Each gen_pool chunk is added
+	 * at the kernel-side mapping address. Grows on demand and pages are
+	 * not released until sched destroy.
 	 */
 	struct bpf_map		*arena_map;
+	struct gen_pool		*arena_pool;
 
 	DECLARE_BITMAP(has_op, SCX_OPI_END);
 
-- 
2.54.0



^ permalink raw reply related

* Re: [PATCH v7 16/28] media: rockchip: rga: reuse cmdbuf contents
From: Nicolas Dufresne @ 2026-05-20 23:55 UTC (permalink / raw)
  To: Sven Püschel, Jacob Chen, Ezequiel Garcia,
	Mauro Carvalho Chehab, Heiko Stuebner, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Hans Verkuil
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	devicetree, kernel, sebastian.reichel, m.tretter, p.zabel
In-Reply-To: <20260521-spu-rga3-v7-16-3f33e8c7145f@pengutronix.de>

[-- Attachment #1: Type: text/plain, Size: 5751 bytes --]

Le jeudi 21 mai 2026 à 00:44 +0200, Sven Püschel a écrit :
> Reuse the command buffer contents instead of completely writing it
> for every frame. Therefore we only need to replace the source and
> destination addresses for each frame. This reduces the amount of CPU
> and memory operations done in each frame. A new cmdbuf_dirty flag notes
> if the cmdbuf has to be rewritten on the next frame.
> 
> The initial idea of initializing the cmdbuf on streamon broke the
> ability to update controls while streaming (e.g. mirroring).
> 
> Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>

Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>

> 
> ---
> Changes in v6:
> - Reworked the commit to not setup the cmdbuf on streamon but rather
>   re-initialize it on the next frame when something changed.
> - Sasahiko flagged the cmdbuf setup at streamon:
>   https://sashiko.dev/#/patchset/20260428-spu-rga3-v5-0-eb7f5d019d86%40pengutronix.de?part=17
> - Dropped Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
>   due to the reworked patch and commit message contents
> 
> Changes in v5:
> - Don't set the flipping and rotation values at streamon and preventing
>   the userspace from chainging them at runtime
> ---
>  drivers/media/platform/rockchip/rga/rga-hw.c | 13 +++++++++----
>  drivers/media/platform/rockchip/rga/rga.c    | 11 +++++++++--
>  drivers/media/platform/rockchip/rga/rga.h    |  2 ++
>  3 files changed, 20 insertions(+), 6 deletions(-)
> 
> diff --git a/drivers/media/platform/rockchip/rga/rga-hw.c b/drivers/media/platform/rockchip/rga/rga-hw.c
> index dac3cb6aa17d3..567d39e58d33f 100644
> --- a/drivers/media/platform/rockchip/rga/rga-hw.c
> +++ b/drivers/media/platform/rockchip/rga/rga-hw.c
> @@ -417,8 +417,6 @@ static void rga_cmd_set(struct rga_ctx *ctx,
>  {
>  	struct rockchip_rga *rga = ctx->rga;
>  
> -	memset(ctx->cmdbuf_virt, 0, RGA_CMDBUF_SIZE);
> -
>  	rga_cmd_set_src_addr(ctx, src->dma_desc_pa);
>  	/*
>  	 * Due to hardware bug,
> @@ -427,11 +425,9 @@ static void rga_cmd_set(struct rga_ctx *ctx,
>  	rga_cmd_set_src1_addr(ctx, dst->dma_desc_pa);
>  
>  	rga_cmd_set_dst_addr(ctx, dst->dma_desc_pa);
> -	rga_cmd_set_mode(ctx);
>  
>  	rga_cmd_set_src_info(ctx, &src->offset);
>  	rga_cmd_set_dst_info(ctx, &dst->offset);
> -	rga_cmd_set_trans_info(ctx);
>  
>  	rga_write(rga, RGA_CMD_BASE, ctx->cmdbuf_phy);
>  
> @@ -440,6 +436,14 @@ static void rga_cmd_set(struct rga_ctx *ctx,
>  				   PAGE_SIZE, DMA_BIDIRECTIONAL);
>  }
>  
> +static void rga_hw_setup_cmdbuf(struct rga_ctx *ctx)
> +{
> +	memset(ctx->cmdbuf_virt, 0, RGA_CMDBUF_SIZE);
> +
> +	rga_cmd_set_mode(ctx);
> +	rga_cmd_set_trans_info(ctx);
> +}
> +
>  static void rga_hw_start(struct rockchip_rga *rga,
>  			 struct rga_vb_buffer *src,  struct rga_vb_buffer *dst)
>  {
> @@ -582,6 +586,7 @@ const struct rga_hw rga2_hw = {
>  	.max_height = MAX_HEIGHT,
>  	.stride_alignment = 4,
>  
> +	.setup_cmdbuf = rga_hw_setup_cmdbuf,
>  	.start = rga_hw_start,
>  	.handle_irq = rga_handle_irq,
>  	.get_version = rga_get_version,
> diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
> index d080cb672740b..394b14b9469df 100644
> --- a/drivers/media/platform/rockchip/rga/rga.c
> +++ b/drivers/media/platform/rockchip/rga/rga.c
> @@ -38,6 +38,11 @@ static void device_run(void *prv)
>  	unsigned long flags;
>  
>  	spin_lock_irqsave(&rga->ctrl_lock, flags);
> +	if (ctx->cmdbuf_dirty) {
> +		ctx->cmdbuf_dirty = false;
> +		rga->hw->setup_cmdbuf(ctx);
> +	}
> +	spin_unlock_irqrestore(&rga->ctrl_lock, flags);
>  
>  	rga->curr = ctx;
>  
> @@ -47,8 +52,6 @@ static void device_run(void *prv)
>  	dst = v4l2_m2m_next_dst_buf(ctx->fh.m2m_ctx);
>  
>  	rga->hw->start(rga, vb_to_rga(src), vb_to_rga(dst));
> -
> -	spin_unlock_irqrestore(&rga->ctrl_lock, flags);
>  }
>  
>  static irqreturn_t rga_isr(int irq, void *prv)
> @@ -141,6 +144,7 @@ static int rga_s_ctrl(struct v4l2_ctrl *ctrl)
>  		ctx->fill_color = ctrl->val;
>  		break;
>  	}
> +	ctx->cmdbuf_dirty = true;
>  	spin_unlock_irqrestore(&ctx->rga->ctrl_lock, flags);
>  	return 0;
>  }
> @@ -228,6 +232,7 @@ static int rga_open(struct file *file)
>  		ret = -ENOMEM;
>  		goto rel_ctx;
>  	}
> +	ctx->cmdbuf_dirty = true;
>  
>  	ctx->rga = rga;
>  	/* Set default formats */
> @@ -448,6 +453,7 @@ static int vidioc_s_fmt(struct file *file, void *priv, struct v4l2_format *f)
>  	frm->crop.height = pix_fmt->height;
>  
>  	frm->pix = *pix_fmt;
> +	ctx->cmdbuf_dirty = true;
>  
>  	v4l2_dbg(debug, 1, &rga->v4l2_dev,
>  		 "[%s] fmt - %p4cc %dx%d (stride %d, sizeimage %d)\n",
> @@ -564,6 +570,7 @@ static int vidioc_s_selection(struct file *file, void *priv,
>  	}
>  
>  	f->crop = s->r;
> +	ctx->cmdbuf_dirty = true;
>  
>  	return ret;
>  }
> diff --git a/drivers/media/platform/rockchip/rga/rga.h b/drivers/media/platform/rockchip/rga/rga.h
> index 38518146910a6..5360f092fecf0 100644
> --- a/drivers/media/platform/rockchip/rga/rga.h
> +++ b/drivers/media/platform/rockchip/rga/rga.h
> @@ -55,6 +55,7 @@ struct rga_ctx {
>  
>  	void *cmdbuf_virt;
>  	dma_addr_t cmdbuf_phy;
> +	bool cmdbuf_dirty;
>  
>  	int osequence;
>  	int csequence;
> @@ -152,6 +153,7 @@ struct rga_hw {
>  	u32 max_width, max_height;
>  	u8 stride_alignment;
>  
> +	void (*setup_cmdbuf)(struct rga_ctx *ctx);
>  	void (*start)(struct rockchip_rga *rga,
>  		      struct rga_vb_buffer *src, struct rga_vb_buffer *dst);
>  	bool (*handle_irq)(struct rockchip_rga *rga);

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v7 17/28] media: rockchip: rga: check scaling factor
From: Nicolas Dufresne @ 2026-05-20 23:58 UTC (permalink / raw)
  To: Sven Püschel, Jacob Chen, Ezequiel Garcia,
	Mauro Carvalho Chehab, Heiko Stuebner, Rob Herring,
	Krzysztof Kozlowski, Conor Dooley, Hans Verkuil
  Cc: linux-media, linux-rockchip, linux-arm-kernel, linux-kernel,
	devicetree, kernel, sebastian.reichel, m.tretter, p.zabel
In-Reply-To: <20260521-spu-rga3-v7-17-3f33e8c7145f@pengutronix.de>

[-- Attachment #1: Type: text/plain, Size: 8851 bytes --]

Le jeudi 21 mai 2026 à 00:44 +0200, Sven Püschel a écrit :
> Check the scaling factor to avoid potential problems. This is relevant
> for the upcoming RGA3 support, as it can hang when the scaling factor
> is exceeded.
> 
> The check is done at streamon when the other side is already streaming
> to avoid incorrectly failing if the application configures the other
> side after calling streamon. As try_fmt shouldn't be state aware,
> it cannot be used to limit the format based on the scaling factor.
> Therefore the check is done just before the actual streaming would be
> started.
> 
> As the driver allows changing the rotation and selection while
> streaming, add additional checks to ensure these changes
> don't exceed the scaling factor.
> 
> Signed-off-by: Sven Püschel <s.pueschel@pengutronix.de>

Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>

> 
> ---
> Changes in v6:
> - Dropped scaling adjustment in s_fmt, as this didn't match the try_fmt
>   result (which shouldn't have it to avoid making it stateful)
> - Moved scaling check to the prepare_streaming callback instead of
>   overwriting the ioctl directly
> - Consider rotation when checking the scaling
> - Check scaling factor when adjusting rotation and selection while
>   streaming
> ---
>  drivers/media/platform/rockchip/rga/rga-buf.c | 28 ++++++++++++
>  drivers/media/platform/rockchip/rga/rga-hw.c  |  1 +
>  drivers/media/platform/rockchip/rga/rga-hw.h  |  1 +
>  drivers/media/platform/rockchip/rga/rga.c     | 63 +++++++++++++++++++++++++--
>  drivers/media/platform/rockchip/rga/rga.h     |  4 ++
>  5 files changed, 94 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/media/platform/rockchip/rga/rga-buf.c b/drivers/media/platform/rockchip/rga/rga-buf.c
> index ffc6162b2e681..dcaba66f5c1fc 100644
> --- a/drivers/media/platform/rockchip/rga/rga-buf.c
> +++ b/drivers/media/platform/rockchip/rga/rga-buf.c
> @@ -197,6 +197,33 @@ static void rga_buf_return_buffers(struct vb2_queue *q,
>  	}
>  }
>  
> +static int rga_buf_prepare_streaming(struct vb2_queue *q)
> +{
> +	struct rga_ctx *ctx = vb2_get_drv_priv(q);
> +	const struct rga_hw *hw = ctx->rga->hw;
> +	int ret;
> +
> +	/* It's safe to check the streaming state of the other queue,
> +	 * as the streamon ioctl's can't race due to the lock set in
> +	 * the queue_init function.
> +	 */
> +	if ((V4L2_TYPE_IS_OUTPUT(q->type) &&
> +	     vb2_is_streaming(v4l2_m2m_get_dst_vq(ctx->fh.m2m_ctx))) ||
> +	    (V4L2_TYPE_IS_CAPTURE(q->type) &&
> +	     vb2_is_streaming(v4l2_m2m_get_src_vq(ctx->fh.m2m_ctx)))) {
> +		/*
> +		 * As the other side is already streaming,
> +		 * check that the max scaling factor isn't exceeded.
> +		 */
> +		ret = rga_check_scaling(hw, &ctx->in.crop, &ctx->out.crop,
> +					ctx->rotate);
> +		if (ret < 0)
> +			return ret;
> +	}
> +
> +	return 0;
> +}
> +
>  static int rga_buf_start_streaming(struct vb2_queue *q, unsigned int count)
>  {
>  	struct rga_ctx *ctx = vb2_get_drv_priv(q);
> @@ -232,6 +259,7 @@ const struct vb2_ops rga_qops = {
>  	.buf_prepare = rga_buf_prepare,
>  	.buf_queue = rga_buf_queue,
>  	.buf_cleanup = rga_buf_cleanup,
> +	.prepare_streaming = rga_buf_prepare_streaming,
>  	.start_streaming = rga_buf_start_streaming,
>  	.stop_streaming = rga_buf_stop_streaming,
>  };
> diff --git a/drivers/media/platform/rockchip/rga/rga-hw.c b/drivers/media/platform/rockchip/rga/rga-hw.c
> index 567d39e58d33f..f2900812ba76f 100644
> --- a/drivers/media/platform/rockchip/rga/rga-hw.c
> +++ b/drivers/media/platform/rockchip/rga/rga-hw.c
> @@ -584,6 +584,7 @@ const struct rga_hw rga2_hw = {
>  	.max_width = MAX_WIDTH,
>  	.min_height = MIN_HEIGHT,
>  	.max_height = MAX_HEIGHT,
> +	.max_scaling_factor = MAX_SCALING_FACTOR,
>  	.stride_alignment = 4,
>  
>  	.setup_cmdbuf = rga_hw_setup_cmdbuf,
> diff --git a/drivers/media/platform/rockchip/rga/rga-hw.h b/drivers/media/platform/rockchip/rga/rga-hw.h
> index c2e34be751939..805ec23e5e3f4 100644
> --- a/drivers/media/platform/rockchip/rga/rga-hw.h
> +++ b/drivers/media/platform/rockchip/rga/rga-hw.h
> @@ -14,6 +14,7 @@
>  
>  #define MIN_WIDTH 34
>  #define MIN_HEIGHT 34
> +#define MAX_SCALING_FACTOR 16
>  
>  #define RGA_TIMEOUT 500
>  
> diff --git a/drivers/media/platform/rockchip/rga/rga.c b/drivers/media/platform/rockchip/rga/rga.c
> index 394b14b9469df..22954bbae55fc 100644
> --- a/drivers/media/platform/rockchip/rga/rga.c
> +++ b/drivers/media/platform/rockchip/rga/rga.c
> @@ -127,7 +127,9 @@ static int rga_s_ctrl(struct v4l2_ctrl *ctrl)
>  {
>  	struct rga_ctx *ctx = container_of(ctrl->handler, struct rga_ctx,
>  					   ctrl_handler);
> +	const struct rga_hw *hw = ctx->rga->hw;
>  	unsigned long flags;
> +	int ret = 0;
>  
>  	spin_lock_irqsave(&ctx->rga->ctrl_lock, flags);
>  	switch (ctrl->id) {
> @@ -138,6 +140,13 @@ static int rga_s_ctrl(struct v4l2_ctrl *ctrl)
>  		ctx->vflip = ctrl->val;
>  		break;
>  	case V4L2_CID_ROTATE:
> +		if (vb2_is_streaming(v4l2_m2m_get_dst_vq(ctx->fh.m2m_ctx)) &&
> +		    vb2_is_streaming(v4l2_m2m_get_src_vq(ctx->fh.m2m_ctx))) {
> +			ret = rga_check_scaling(hw, &ctx->in.crop,
> +						&ctx->out.crop, ctrl->val);
> +			if (ret < 0)
> +				goto s_ctrl_done;
> +		}
>  		ctx->rotate = ctrl->val;
>  		break;
>  	case V4L2_CID_BG_COLOR:
> @@ -145,8 +154,10 @@ static int rga_s_ctrl(struct v4l2_ctrl *ctrl)
>  		break;
>  	}
>  	ctx->cmdbuf_dirty = true;
> +
> +s_ctrl_done:
>  	spin_unlock_irqrestore(&ctx->rga->ctrl_lock, flags);
> -	return 0;
> +	return ret;
>  }
>  
>  static const struct v4l2_ctrl_ops rga_ctrl_ops = {
> @@ -182,6 +193,38 @@ static int rga_setup_ctrls(struct rga_ctx *ctx)
>  	return 0;
>  }
>  
> +static bool check_scaling_factor(const struct rga_hw *hw, u32 src_size,
> +				 u32 dst_size)
> +{
> +	if (src_size < dst_size)
> +		return src_size * hw->max_scaling_factor >= dst_size;
> +	else
> +		return dst_size * hw->max_scaling_factor >= src_size;
> +}
> +
> +int rga_check_scaling(const struct rga_hw *hw, const struct v4l2_rect *crop_in,
> +		      const struct v4l2_rect *crop_out, u32 rotate)
> +{
> +	u32 scaled_width;
> +	u32 scaled_height;
> +
> +	if (rotate == 90 || rotate == 270) {
> +		scaled_width = crop_out->height;
> +		scaled_height = crop_out->width;
> +	} else {
> +		scaled_width = crop_out->width;
> +		scaled_height = crop_out->height;
> +	}
> +
> +	if (!check_scaling_factor(hw, crop_in->width, scaled_width))
> +		return -EINVAL;
> +
> +	if (!check_scaling_factor(hw, crop_in->height, scaled_height))
> +		return -EINVAL;
> +
> +	return 0;
> +}
> +
>  static struct rga_fmt *rga_fmt_find(struct rockchip_rga *rga, u32 pixelformat)
>  {
>  	unsigned int i;
> @@ -525,7 +568,6 @@ static int vidioc_s_selection(struct file *file, void *priv,
>  	struct rga_ctx *ctx = file_to_rga_ctx(file);
>  	struct rockchip_rga *rga = ctx->rga;
>  	struct rga_frame *f;
> -	int ret = 0;
>  
>  	f = rga_get_frame(ctx, s->type);
>  	if (IS_ERR(f))
> @@ -569,10 +611,25 @@ static int vidioc_s_selection(struct file *file, void *priv,
>  		return -EINVAL;
>  	}
>  
> +	if (vb2_is_streaming(v4l2_m2m_get_dst_vq(ctx->fh.m2m_ctx)) &&
> +	    vb2_is_streaming(v4l2_m2m_get_src_vq(ctx->fh.m2m_ctx))) {
> +		int ret = 0;
> +
> +		if (V4L2_TYPE_IS_OUTPUT(s->type))
> +			ret = rga_check_scaling(rga->hw, &s->r, &ctx->out.crop,
> +						ctx->rotate);
> +		else
> +			ret = rga_check_scaling(rga->hw, &ctx->in.crop, &s->r,
> +						ctx->rotate);
> +
> +		if (ret < 0)
> +			return ret;
> +	}
> +
>  	f->crop = s->r;
>  	ctx->cmdbuf_dirty = true;
>  
> -	return ret;
> +	return 0;
>  }
>  
>  static const struct v4l2_ioctl_ops rga_ioctl_ops = {
> diff --git a/drivers/media/platform/rockchip/rga/rga.h b/drivers/media/platform/rockchip/rga/rga.h
> index 5360f092fecf0..df525c6aea8b6 100644
> --- a/drivers/media/platform/rockchip/rga/rga.h
> +++ b/drivers/media/platform/rockchip/rga/rga.h
> @@ -123,6 +123,9 @@ static inline struct rga_vb_buffer *vb_to_rga(struct vb2_v4l2_buffer *vb)
>  
>  struct rga_frame *rga_get_frame(struct rga_ctx *ctx, enum v4l2_buf_type type);
>  
> +int rga_check_scaling(const struct rga_hw *hw, const struct v4l2_rect *crop_in,
> +		      const struct v4l2_rect *crop_out, u32 rotate);
> +
>  /* RGA Buffers Manage */
>  extern const struct vb2_ops rga_qops;
>  
> @@ -151,6 +154,7 @@ struct rga_hw {
>  	size_t cmdbuf_size;
>  	u32 min_width, min_height;
>  	u32 max_width, max_height;
> +	u8 max_scaling_factor;
>  	u8 stride_alignment;
>  
>  	void (*setup_cmdbuf)(struct rga_ctx *ctx);

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* [PATCH] pwm: imx27: Fix variable truncation in .apply()
From: Ronaldo Nunez @ 2026-05-21  0:00 UTC (permalink / raw)
  To: linux-pwm
  Cc: Uwe Kleine-König, Frank Li, Sascha Hauer,
	Pengutronix Kernel Team, Fabio Estevam, imx, linux-arm-kernel,
	linux-kernel, Ronaldo Nunez

This patch fixes a variable truncation when calculating period in
microseconds as part of the solution for the ERR051198 in .apply()
callback.

The problem was identified when reducing the duty cycle through sysfs,
with enable set to 1. The condition to fix errata ERR051198 for period
smaller than 2us is always being met, due to a truncation on tmp,
variable from .apply() callback, caused by the multiplication of
NSEC_PER_SEC, PWMPR (period register) and the prescaler which can easily
overflow u32. Declaring tmp as u64 makes it large enough to accommodate
larger multiplication results.

Testing:
- Hardware: Udoo Neo Extended with iMX6SoloX SoC
- Tools: Verified with a logic analyzer

Signed-off-by: Ronaldo Nunez <rnunez@baylibre.com>
---
 drivers/pwm/pwm-imx27.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/drivers/pwm/pwm-imx27.c b/drivers/pwm/pwm-imx27.c
index 3d34cdc4a3a5..c8b801fcb525 100644
--- a/drivers/pwm/pwm-imx27.c
+++ b/drivers/pwm/pwm-imx27.c
@@ -200,7 +200,7 @@ static void pwm_imx27_wait_fifo_slot(struct pwm_chip *chip,
 static int pwm_imx27_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 			   const struct pwm_state *state)
 {
-	unsigned long period_cycles, duty_cycles, prescale, period_us, tmp;
+	unsigned long period_cycles, duty_cycles, prescale, period_us;
 	struct pwm_imx27_chip *imx = to_pwm_imx27_chip(chip);
 	unsigned long long c;
 	unsigned long long clkrate;
@@ -208,6 +208,7 @@ static int pwm_imx27_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 	int val;
 	int ret;
 	u32 cr;
+	u64 tmp;
 
 	clkrate = clk_get_rate(imx->clks[PWM_IMX27_PER].clk);
 	c = clkrate * state->period;
@@ -249,6 +250,11 @@ static int pwm_imx27_apply(struct pwm_chip *chip, struct pwm_device *pwm,
 	val = readl(imx->mmio_base + MX3_PWMPR);
 	val = val >= MX3_PWMPR_MAX ? MX3_PWMPR_MAX : val;
 	cr = readl(imx->mmio_base + MX3_PWMCR);
+
+	/*
+	 * tmp stores period in nanoseconds. Result fits in u64 since
+	 * val <= 0xfffe and prescaler in [1, 0x1000].
+	 */
 	tmp = NSEC_PER_SEC * (u64)(val + 2) * MX3_PWMCR_PRESCALER_GET(cr);
 	tmp = DIV_ROUND_UP_ULL(tmp, clkrate);
 	period_us = DIV_ROUND_UP_ULL(tmp, 1000);
-- 
2.53.0



^ permalink raw reply related

* Re: [PATCH v14 05/44] arm64: RMI: Add wrappers for RMI calls
From: Gavin Shan @ 2026-05-21  0:21 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-6-steven.price@arm.com>

Hi Steven,

On 5/13/26 11:17 PM, Steven Price wrote:
> The wrappers make the call sites easier to read and deal with the
> boiler plate of handling the error codes from the RMM.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes from v13:
>   * Update to RMM v2.0-bet1 spec including some SRO support (there still
>     some FIXMEs where SRO support is incomplete).
> Changes from v12:
>   * Update to RMM v2.0 specification
> Changes from v8:
>   * Switch from arm_smccc_1_2_smc() to arm_smccc_1_2_invoke() in
>     rmi_rtt_read_entry() for consistency.
> Changes from v7:
>   * Minor renaming of parameters and updated comments
> Changes from v5:
>   * Further improve comments
> Changes from v4:
>   * Improve comments
> Changes from v2:
>   * Make output arguments optional.
>   * Mask RIPAS value rmi_rtt_read_entry()
>   * Drop unused rmi_rtt_get_phys()
> ---
>   arch/arm64/include/asm/rmi_cmds.h | 661 ++++++++++++++++++++++++++++++
>   1 file changed, 661 insertions(+)
>   create mode 100644 arch/arm64/include/asm/rmi_cmds.h
> 
> diff --git a/arch/arm64/include/asm/rmi_cmds.h b/arch/arm64/include/asm/rmi_cmds.h
> new file mode 100644
> index 000000000000..04f7066894e9
> --- /dev/null
> +++ b/arch/arm64/include/asm/rmi_cmds.h
> @@ -0,0 +1,661 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +/*
> + * Copyright (C) 2023 ARM Ltd.
> + */
> +
> +#ifndef __ASM_RMI_CMDS_H
> +#define __ASM_RMI_CMDS_H
> +
> +#include <linux/arm-smccc.h>
> +

[...]

> +
> +/**
> + * rmi_rtt_destroy() - Destroy an RTT
> + * @rd: PA of the RD
> + * @ipa: Base of the IPA range described by the RTT
> + * @level: Depth of the RTT within the tree
> + * @out_rtt: Pointer to write the PA of the RTT which was destroyed
> + * @out_top: Pointer to write the top IPA of non-live RTT entries
> + *

In most cases, the parameters are well explained in RMM-v2.0-bet1 spec, I think
it's nice to keep the code and the spec synchronized. For those specific parameters
of this function, they're well explained in RMM-v2.0-bet1 spec as below.

    @rd: PA of the RD for the target realm
    @ipa: Base of the IPA range described by the RTT
    @level: RTT level
    @out_rtt: PA of the RTT which was destroyed
    @out_top: Top IPA of non-live RTT entries, from entry at which the RTT walk terminated

> + * Destroys an RTT. The RTT must be non-live, i.e. none of the entries in the
> + * table are in ASSIGNED or TABLE state.
> + *
> + * Return: RMI return code.
> + */
> +static inline int rmi_rtt_destroy(unsigned long rd,
> +				  unsigned long ipa,
> +				  long level,
> +				  unsigned long *out_rtt,
> +				  unsigned long *out_top)
> +{
> +	struct arm_smccc_res res;
> +
> +	arm_smccc_1_1_invoke(SMC_RMI_RTT_DESTROY, rd, ipa, level, &res);
> +
> +	if (out_rtt)
> +		*out_rtt = res.a1;
> +	if (out_top)
> +		*out_top = res.a2;
> +
> +	return res.a0;
> +}
> +

[...]

Thanks,
Gavin



^ permalink raw reply

* Re: [PATCH v14 06/44] arm64: RMI: Check for RMI support at init
From: Gavin Shan @ 2026-05-21  0:39 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-7-steven.price@arm.com>

Hi Steven,

On 5/13/26 11:17 PM, Steven Price wrote:
> Query the RMI version number and check if it is a compatible version.
> The first two feature registers are read and exposed for future code to
> use.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> v14:
>   * This moves the basic RMI setup into the 'kernel' directory. This is
>     because RMI will be used for some features outside of KVM so should
>     be available even if KVM isn't compiled in.
> ---
>   arch/arm64/include/asm/rmi_cmds.h |  3 ++
>   arch/arm64/kernel/Makefile        |  2 +-
>   arch/arm64/kernel/cpufeature.c    |  1 +
>   arch/arm64/kernel/rmi.c           | 65 +++++++++++++++++++++++++++++++
>   4 files changed, 70 insertions(+), 1 deletion(-)
>   create mode 100644 arch/arm64/kernel/rmi.c
> 

[...]

> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
> new file mode 100644
> index 000000000000..99c1ccc35c11
> --- /dev/null
> +++ b/arch/arm64/kernel/rmi.c
> @@ -0,0 +1,65 @@
> +// SPDX-License-Identifier: GPL-2.0
> +/*
> + * Copyright (C) 2023-2025 ARM Ltd.
> + */
> +
> +#include <linux/memblock.h>
> +
> +#include <asm/rmi_cmds.h>
> +
> +unsigned long rmm_feat_reg0;
> +unsigned long rmm_feat_reg1;
> +
> +static int rmi_check_version(void)
> +{
> +	struct arm_smccc_res res;
> +	unsigned short version_major, version_minor;
> +	unsigned long host_version = RMI_ABI_VERSION(RMI_ABI_MAJOR_VERSION,
> +						     RMI_ABI_MINOR_VERSION);
> +	unsigned long aa64pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1);
> +
> +	/* If RME isn't supported, then RMI can't be */
> +	if (cpuid_feature_extract_unsigned_field(aa64pfr0, ID_AA64PFR0_EL1_RME_SHIFT) == 0)
> +		return -ENXIO;
> +
> +	arm_smccc_1_1_invoke(SMC_RMI_VERSION, host_version, &res);
> +
> +	if (res.a0 == SMCCC_RET_NOT_SUPPORTED)
> +		return -ENXIO;
> +
> +	version_major = RMI_ABI_VERSION_GET_MAJOR(res.a1);
> +	version_minor = RMI_ABI_VERSION_GET_MINOR(res.a1);
> +
> +	if (res.a0 != RMI_SUCCESS) {
> +		unsigned short high_version_major, high_version_minor;
> +
> +		high_version_major = RMI_ABI_VERSION_GET_MAJOR(res.a2);
> +		high_version_minor = RMI_ABI_VERSION_GET_MINOR(res.a2);
> +
> +		pr_err("Unsupported RMI ABI (v%d.%d - v%d.%d) we want v%d.%d\n",
> +		       version_major, version_minor,
> +		       high_version_major, high_version_minor,
> +		       RMI_ABI_MAJOR_VERSION,
> +		       RMI_ABI_MINOR_VERSION);
> +		return -ENXIO;
> +	}
> +
> +	pr_info("RMI ABI version %d.%d\n", version_major, version_minor);
> +
> +	return 0;
> +}
> +
> +static int __init arm64_init_rmi(void)
> +{
> +	/* Continue without realm support if we can't agree on a version */
> +	if (rmi_check_version())
> +		return 0;

Is this still a valid point that we have to return zero on errors returned
from rmi_check_version() or other other function calls like rmi_features()?
arm64_init_rmi() is triggered by subsys_initcall() where the return value
needs to indicate success or failure. It's fine to return error code from
arm64_init_rmi() in the path.

> +
> +	if (WARN_ON(rmi_features(0, &rmm_feat_reg0)))
> +		return 0;
> +	if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
> +		return 0;
> +
> +	return 0;
> +}
> +subsys_initcall(arm64_init_rmi);

Thanks,
Gavin



^ permalink raw reply

* Re: [PATCH v2] media: rkvdec: fix PM runtime teardown ordering in remove
From: Nicolas Dufresne @ 2026-05-21  0:51 UTC (permalink / raw)
  To: Francesco Saverio Pavone, jonas, detlev.casanova, hverkuil,
	mchehab
  Cc: ezequiel, heiko, stable, linux-media, linux-rockchip,
	linux-arm-kernel, linux-kernel
In-Reply-To: <20260518145414.64514-1-pavone.lawyer@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 3054 bytes --]

Le lundi 18 mai 2026 à 16:54 +0200, Francesco Saverio Pavone a écrit :
> From: Jonas Karlman <jonas@kwiboo.se>
> 
> The current remove() path calls rkvdec_v4l2_cleanup() and
> pm_runtime_disable() before pm_runtime_dont_use_autosuspend(), and
> frees the empty IOMMU domain after that. With autosuspend still
> armed when the domain goes away, the VDPU381 can be left in a dirty
> state across module reload and suspend/resume cycles.
> 
> On RK3588 this surfaces as a VP9 inter-prediction bug: from the
> second ALTREF frame onward, motion blocks decode with U=V=0 (BT.709
> green), while intra and static blocks stay correct. Reordering the
> teardown to dont_use_autosuspend() -> iommu_domain_free() ->
> pm_runtime_disable() -> v4l2_cleanup() makes the symptom go away.
> 
> Tested on a Radxa Rock 5B+ (RK3588, 8 GB LPDDR5) with both the
> libva-v4l2-request mpv pipeline and Chromium's V4L2 stateless
> decoder. With the fix, 300 random pixel samples on VP9 Profile 0
> clips at 1080p and 1440p match a libvpx software reference exactly
> (worst delta 0). Without it, the same 1080p sample at frame 4,
> pixel (960, 270) reads HW=(0,112,0) vs SW=(204,147,116). HEVC and
> H.264 stateless decoding via mpv keep running on hardware with no
> fallback.
> 
> Fixes: ff8c5622f9f7 ("media: rkvdec: Restore iommu addresses on errors")
> Cc: <stable@vger.kernel.org>
> Signed-off-by: Jonas Karlman <jonas@kwiboo.se>
> Tested-by: Francesco Saverio Pavone <pavone.lawyer@gmail.com>
> Signed-off-by: Francesco Saverio Pavone <pavone.lawyer@gmail.com>

Tested-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>
Reviewed-by: Nicolas Dufresne <nicolas.dufresne@collabora.com>

cheers,
Nicolas

> ---
> Changes in v2:
>  - Add Cc: <stable@vger.kernel.org>; media-CI flagged that the
>    Fixes: target (ff8c5622f9f7) is present in the 6.17, 6.18, 6.19
>    and 7.0 stable branches, so the fix should reach them too.
>    Link to v1:
> https://lore.kernel.org/all/20260518105413.42147-1-pavone.lawyer@gmail.com/
>    Media-CI report:
> https://linux-media.pages.freedesktop.org/-/users/patchwork/-/jobs/100124849/artifacts/report.htm
> 
>  drivers/media/platform/rockchip/rkvdec/rkvdec.c | 5 +++--
>  1 file changed, 3 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> b/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> index 6f5f0422d317..bb95b090a25b 100644
> --- a/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> +++ b/drivers/media/platform/rockchip/rkvdec/rkvdec.c
> @@ -2066,12 +2066,13 @@ static void rkvdec_remove(struct platform_device
> *pdev)
>  
>  	cancel_delayed_work_sync(&rkvdec->watchdog_work);
>  
> -	rkvdec_v4l2_cleanup(rkvdec);
> -	pm_runtime_disable(&pdev->dev);
>  	pm_runtime_dont_use_autosuspend(&pdev->dev);
>  
>  	if (rkvdec->empty_domain)
>  		iommu_domain_free(rkvdec->empty_domain);
> +
> +	pm_runtime_disable(&pdev->dev);
> +	rkvdec_v4l2_cleanup(rkvdec);
>  }
>  
>  #ifdef CONFIG_PM

[-- Attachment #2: This is a digitally signed message part --]
[-- Type: application/pgp-signature, Size: 228 bytes --]

^ permalink raw reply

* Re: [PATCH v14 07/44] arm64: RMI: Configure the RMM with the host's page size
From: Gavin Shan @ 2026-05-21  0:51 UTC (permalink / raw)
  To: Steven Price, kvm, kvmarm
  Cc: Catalin Marinas, Marc Zyngier, Will Deacon, James Morse,
	Oliver Upton, Suzuki K Poulose, Zenghui Yu, linux-arm-kernel,
	linux-kernel, Joey Gouly, Alexandru Elisei, Christoffer Dall,
	Fuad Tabba, linux-coco, Ganapatrao Kulkarni, Shanker Donthineni,
	Alper Gun, Aneesh Kumar K . V, Emi Kisanuki, Vishal Annapurve,
	WeiLin.Chang, Lorenzo.Pieralisi2
In-Reply-To: <20260513131757.116630-8-steven.price@arm.com>

Hi Steven,

On 5/13/26 11:17 PM, Steven Price wrote:
> RMM v2.0 brings the ability to set the RMM's granule size. Check the
> feature registers and configure the RMM so that it matches the host's
> page size. This means that operations can be done with a granulatity
> equal to PAGE_SIZE.
> 
> Signed-off-by: Steven Price <steven.price@arm.com>
> ---
> Changes since v13:
>   * Moved out of KVM.
> ---
>   arch/arm64/kernel/rmi.c | 42 +++++++++++++++++++++++++++++++++++++++++
>   1 file changed, 42 insertions(+)
> 
> diff --git a/arch/arm64/kernel/rmi.c b/arch/arm64/kernel/rmi.c
> index 99c1ccc35c11..a14ead5dedda 100644
> --- a/arch/arm64/kernel/rmi.c
> +++ b/arch/arm64/kernel/rmi.c
> @@ -49,6 +49,45 @@ static int rmi_check_version(void)
>   	return 0;
>   }
>   
> +static int rmi_configure(void)
> +{
> +	struct rmm_config *config __free(free_page) = NULL;
> +	unsigned long ret;
> +
> +	config = (struct rmm_config *)get_zeroed_page(GFP_KERNEL);
> +	if (!config)
> +		return -ENOMEM;
> +
> +	switch (PAGE_SIZE) {
> +	case SZ_4K:
> +		config->rmi_granule_size = RMI_GRANULE_SIZE_4KB;
> +		break;
> +	case SZ_16K:
> +		config->rmi_granule_size = RMI_GRANULE_SIZE_16KB;
> +		break;
> +	case SZ_64K:
> +		config->rmi_granule_size = RMI_GRANULE_SIZE_64KB;
> +		break;
> +	default:
> +		pr_err("Unsupported PAGE_SIZE for RMM\n");
> +		return -EINVAL;
> +	}
> +
> +	ret = rmi_rmm_config_set(virt_to_phys(config));
> +	if (ret) {
> +		pr_err("RMM config set failed\n");
> +		return -EINVAL;
> +	}
> +

Looking at branch 'topics/rmm-v2.0-poc_2' of RMM implementation, the granule size
is fixed to be 4KB at present. I'm not sure if I have looked into correct RMM
implementation, but 'topics/rmm-v2.0-poc_2' is recommended one in the cover
letter.

Besides, there has checks in the handler of the RMI command to make sure that
struct rmm_config::tracking_region_size to be 1GB, indicated by zero. It maybe
worthy to set it before call to rmi_rmm_config_set().

	config.tracking_region_size = 0; /* 1GB */
	ret = rmi_rmm_config_set(virt_to_phys(config));


> +	ret = rmi_rmm_activate();
> +	if (ret) {
> +		pr_err("RMM activate failed\n");
> +		return -ENXIO;
> +	}
> +
> +	return 0;
> +}
> +
>   static int __init arm64_init_rmi(void)
>   {
>   	/* Continue without realm support if we can't agree on a version */
> @@ -60,6 +99,9 @@ static int __init arm64_init_rmi(void)
>   	if (WARN_ON(rmi_features(1, &rmm_feat_reg1)))
>   		return 0;
>   
> +	if (rmi_configure())
> +		return 0;
> +
>   	return 0;
>   }
>   subsys_initcall(arm64_init_rmi);

Thanks,
Gavin



^ permalink raw reply


This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox