* [PATCH v2 1/2] arm64/cpufeature: Define hwcaps for 2025 dpISA features
From: Mark Brown @ 2026-05-18 15:07 UTC (permalink / raw)
To: Catalin Marinas, Will Deacon, Jonathan Corbet, Shuah Khan
Cc: linux-arm-kernel, linux-kernel, linux-doc, linux-kselftest,
Mark Brown
In-Reply-To: <20260518-arm64-dpisa-2025-v2-0-b3367b73bd00@kernel.org>
The features added by the 2025 dpISA are all straightforward instruction
only features so there is no state to manage, we can just expose hwcaps to
let userspace know they are available.
F16MM is slightly odd in that the feature is FEAT_F16MM but it is discovered
via ID_AA64FPFR0_EL1.F16MM2. We follow the feature name.
Signed-off-by: Mark Brown <broonie@kernel.org>
---
Documentation/arch/arm64/elf_hwcaps.rst | 24 ++++++++++++++++++++++++
arch/arm64/include/uapi/asm/hwcap.h | 8 ++++++++
arch/arm64/kernel/cpufeature.c | 11 +++++++++++
arch/arm64/kernel/cpuinfo.c | 8 ++++++++
4 files changed, 51 insertions(+)
diff --git a/Documentation/arch/arm64/elf_hwcaps.rst b/Documentation/arch/arm64/elf_hwcaps.rst
index 97315ae6c0da..07ff9ea1d605 100644
--- a/Documentation/arch/arm64/elf_hwcaps.rst
+++ b/Documentation/arch/arm64/elf_hwcaps.rst
@@ -451,6 +451,30 @@ HWCAP3_LS64
of CPU. User should only use ld64b/st64b on supported target (device)
memory location, otherwise fallback to the non-atomic alternatives.
+HWCAP3_SVE_B16MM
+ Functionality implied by ID_AA64ZFR0_EL1.B16B16 == 0b0011
+
+HWCAP3_SVE2P3
+ Functionality implied by ID_AA64ZFR0_EL1.SVEver == 0b0100
+
+HWCAP3_SME_LUT6
+ Functionality implied by ID_AA64SMFR0_EL1.LUT6 == 0b1
+
+HWCAP3_SME2P3
+ Functionality implied by ID_AA64SMFR0_EL1.SMEver == 0b0100
+
+HWCAP3_F16MM
+ Functionality implied by ID_AA64FPFR0_EL1.F16MM2 == 0b1
+
+HWCAP3_F16F32DOT
+ Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0010
+
+HWCAP3_F16F32MM
+ Functionality implied by ID_AA64ISAR0_EL1.FHM == 0b0011
+
+HWCAP3_SVE_LUT6
+ Functionality implied by ID_AA64ISAR2_EL1.LUT == 0b0010 and
+ ID_AA64PFR0_EL1.SVE == 0b0001.
4. Unused AT_HWCAP bits
-----------------------
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 06f83ca8de56..10272ddb4d6f 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -147,5 +147,13 @@
#define HWCAP3_MTE_STORE_ONLY (1UL << 1)
#define HWCAP3_LSFE (1UL << 2)
#define HWCAP3_LS64 (1UL << 3)
+#define HWCAP3_SVE_B16MM (1UL << 4)
+#define HWCAP3_SVE2P3 (1UL << 5)
+#define HWCAP3_SME_LUT6 (1UL << 6)
+#define HWCAP3_SME2P3 (1UL << 7)
+#define HWCAP3_F16MM (1UL << 8)
+#define HWCAP3_F16F32DOT (1UL << 9)
+#define HWCAP3_F16F32MM (1UL << 10)
+#define HWCAP3_SVE_LUT6 (1UL << 11)
#endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6d53bb15cf7b..96de16582fca 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -365,6 +365,8 @@ static const struct arm64_ftr_bits ftr_id_aa64zfr0[] = {
static const struct arm64_ftr_bits ftr_id_aa64smfr0[] = {
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_FA64_SHIFT, 1, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
+ FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_LUT6_SHIFT, 1, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
FTR_STRICT, FTR_EXACT, ID_AA64SMFR0_EL1_LUTv2_SHIFT, 1, 0),
ARM64_FTR_BITS(FTR_VISIBLE_IF_IS_ENABLED(CONFIG_ARM64_SME),
@@ -419,6 +421,7 @@ static const struct arm64_ftr_bits ftr_id_aa64fpfr0[] = {
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8DP2_SHIFT, 1, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM8_SHIFT, 1, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8MM4_SHIFT, 1, 0),
+ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F16MM2_SHIFT, 1, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E4M3_SHIFT, 1, 0),
ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64FPFR0_EL1_F8E5M2_SHIFT, 1, 0),
ARM64_FTR_END,
@@ -3284,6 +3287,8 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(ID_AA64ISAR0_EL1, SM4, IMP, CAP_HWCAP, KERNEL_HWCAP_SM4),
HWCAP_CAP(ID_AA64ISAR0_EL1, DP, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDDP),
HWCAP_CAP(ID_AA64ISAR0_EL1, FHM, IMP, CAP_HWCAP, KERNEL_HWCAP_ASIMDFHM),
+ HWCAP_CAP(ID_AA64ISAR0_EL1, FHM, F16F32DOT, CAP_HWCAP, KERNEL_HWCAP_F16F32DOT),
+ HWCAP_CAP(ID_AA64ISAR0_EL1, FHM, F16F32MM, CAP_HWCAP, KERNEL_HWCAP_F16F32MM),
HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM, CAP_HWCAP, KERNEL_HWCAP_FLAGM),
HWCAP_CAP(ID_AA64ISAR0_EL1, TS, FLAGM2, CAP_HWCAP, KERNEL_HWCAP_FLAGM2),
HWCAP_CAP(ID_AA64ISAR0_EL1, RNDR, IMP, CAP_HWCAP, KERNEL_HWCAP_RNG),
@@ -3313,7 +3318,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(ID_AA64ISAR3_EL1, LSFE, IMP, CAP_HWCAP, KERNEL_HWCAP_LSFE),
HWCAP_CAP(ID_AA64MMFR2_EL1, AT, IMP, CAP_HWCAP, KERNEL_HWCAP_USCAT),
#ifdef CONFIG_ARM64_SVE
+ HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ISAR2_EL1, LUT, LUT6, CAP_HWCAP, KERNEL_HWCAP_SVE_LUT6),
HWCAP_CAP(ID_AA64PFR0_EL1, SVE, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE),
+ HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p3, CAP_HWCAP, KERNEL_HWCAP_SVE2P3),
HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p2, CAP_HWCAP, KERNEL_HWCAP_SVE2P2),
HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2p1, CAP_HWCAP, KERNEL_HWCAP_SVE2P1),
HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SVEver, SVE2, CAP_HWCAP, KERNEL_HWCAP_SVE2),
@@ -3323,6 +3330,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BitPerm, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBITPERM),
HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVE_B16B16),
HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, BFSCALE, CAP_HWCAP, KERNEL_HWCAP_SVE_BFSCALE),
+ HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, B16B16, B16MM, CAP_HWCAP, KERNEL_HWCAP_SVE_B16MM),
HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, IMP, CAP_HWCAP, KERNEL_HWCAP_SVEBF16),
HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, BF16, EBF16, CAP_HWCAP, KERNEL_HWCAP_SVE_EBF16),
HWCAP_CAP_MATCH_ID(has_sve_feature, ID_AA64ZFR0_EL1, SHA3, IMP, CAP_HWCAP, KERNEL_HWCAP_SVESHA3),
@@ -3362,7 +3370,9 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
#ifdef CONFIG_ARM64_SME
HWCAP_CAP(ID_AA64PFR1_EL1, SME, IMP, CAP_HWCAP, KERNEL_HWCAP_SME),
HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, FA64, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_FA64),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, LUT6, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUT6),
HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, LUTv2, IMP, CAP_HWCAP, KERNEL_HWCAP_SME_LUTV2),
+ HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p3, CAP_HWCAP, KERNEL_HWCAP_SME2P3),
HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p2, CAP_HWCAP, KERNEL_HWCAP_SME2P2),
HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2p1, CAP_HWCAP, KERNEL_HWCAP_SME2P1),
HWCAP_CAP_MATCH_ID(has_sme_feature, ID_AA64SMFR0_EL1, SMEver, SME2, CAP_HWCAP, KERNEL_HWCAP_SME2),
@@ -3393,6 +3403,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
HWCAP_CAP(ID_AA64FPFR0_EL1, F8DP2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8DP2),
HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM8, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM8),
HWCAP_CAP(ID_AA64FPFR0_EL1, F8MM4, IMP, CAP_HWCAP, KERNEL_HWCAP_F8MM4),
+ HWCAP_CAP(ID_AA64FPFR0_EL1, F16MM2, IMP, CAP_HWCAP, KERNEL_HWCAP_F16MM),
HWCAP_CAP(ID_AA64FPFR0_EL1, F8E4M3, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E4M3),
HWCAP_CAP(ID_AA64FPFR0_EL1, F8E5M2, IMP, CAP_HWCAP, KERNEL_HWCAP_F8E5M2),
#ifdef CONFIG_ARM64_POE
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 6149bc91251d..d50e2a9b066b 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -164,6 +164,14 @@ static const char *const hwcap_str[] = {
[KERNEL_HWCAP_MTE_FAR] = "mtefar",
[KERNEL_HWCAP_MTE_STORE_ONLY] = "mtestoreonly",
[KERNEL_HWCAP_LSFE] = "lsfe",
+ [KERNEL_HWCAP_SVE_B16MM] = "sveb16mm",
+ [KERNEL_HWCAP_SVE2P3] = "sve2p3",
+ [KERNEL_HWCAP_SME_LUT6] = "smelut6",
+ [KERNEL_HWCAP_SME2P3] = "sme2p3",
+ [KERNEL_HWCAP_F16MM] = "f16mm",
+ [KERNEL_HWCAP_F16F32DOT] = "f16f32dot",
+ [KERNEL_HWCAP_F16F32MM] = "f16f32mm",
+ [KERNEL_HWCAP_SVE_LUT6] = "svelut6",
};
#ifdef CONFIG_COMPAT
--
2.47.3
^ permalink raw reply related
* [PATCH v2 0/2] arm64: Implement support for 2025 dpISA extensions
From: Mark Brown @ 2026-05-18 15:07 UTC (permalink / raw)
To: Catalin Marinas, Will Deacon, Jonathan Corbet, Shuah Khan
Cc: linux-arm-kernel, linux-kernel, linux-doc, linux-kselftest,
Mark Brown
The 2025 dpISA extensions introduce a number of architecture features
all of which are fairly straightforward from a kernel point of view
since they only introduce new instructions, not any architecture state.
Signed-off-by: Mark Brown <broonie@kernel.org>
---
Changes in v2:
- Rename HWCAP3_LUT6 to HWCAP3_SVE_LUT6 and make it depend on SVE.
- Rebase onto v7.1-rc3.
- Link to v1: https://patch.msgid.link/20260302-arm64-dpisa-2025-v1-0-0855e7f41689@kernel.org
---
Mark Brown (2):
arm64/cpufeature: Define hwcaps for 2025 dpISA features
kselftest/arm64: Add 2025 dpISA coverage to hwcaps
Documentation/arch/arm64/elf_hwcaps.rst | 24 +++++++
arch/arm64/include/uapi/asm/hwcap.h | 8 +++
arch/arm64/kernel/cpufeature.c | 11 +++
arch/arm64/kernel/cpuinfo.c | 8 +++
tools/testing/selftests/arm64/abi/hwcap.c | 116 ++++++++++++++++++++++++++++++
5 files changed, 167 insertions(+)
---
base-commit: 5d6919055dec134de3c40167a490f33c74c12581
change-id: 20260106-arm64-dpisa-2025-d6673ae6acee
Best regards,
--
Mark Brown <broonie@kernel.org>
^ permalink raw reply
* Re: [PATCH 00/12] misc/syncobj: add /dev/syncobj device
From: Christian König @ 2026-05-18 15:06 UTC (permalink / raw)
To: Michel Dänzer, Julian Orth
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Sumit Semwal, Jonathan Corbet, Shuah Khan,
Arnd Bergmann, Greg Kroah-Hartman, dri-devel, linux-kernel,
linux-media, linaro-mm-sig, linux-doc, wayland-devel
In-Reply-To: <1162f62e-9c65-446b-9788-bb289a202e6e@mailbox.org>
On 5/18/26 16:59, Michel Dänzer wrote:
> On 5/18/26 14:41, Christian König wrote:
>> On 5/18/26 14:02, Julian Orth wrote:
>>> On Mon, May 18, 2026 at 1:58 PM Christian König
>>> <christian.koenig@amd.com> wrote:
>>>> On 5/16/26 13:06, Julian Orth wrote:
>>>>> This series adds a new device /dev/syncobj that can be used to create
>>>>> and manipulate DRM syncobjs. Previously, these operations required the
>>>>> use of a DRM device and the device needed to support the DRIVER_SYNCOBJ
>>>>> and DRIVER_SYNCOBJ_TIMELINE features.
>>>>>
>>>>> There are several issues with the existing API:
>>>>>
>>>>> - Syncobjs are the only explicit sync mechanism available on wayland.
>>>>> Most compositors do not use GPU waits. Instead, they use the
>>>>> DRM_IOCTL_SYNCOBJ_EVENTFD ioctl to perform a CPU wait. Being tied to
>>>>> DRM devices means that compositors cannot consistently offer this
>>>>> feature even though no device-specific logic is involved.
>>>>
>>>> Well the drm_syncobj is a container for device specific dma fences.
>>>
>>> Not necessarily. The DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL ioctl attaches
>>> some kind of dummy fence that is already signaled. I don't believe
>>> this is device specific. That is also the path that llvmpipe would
>>> use.
>>
>> Yeah I feared that.
>>
>> This is the wait before signal path and if I'm not completely mistaken that one is not supported by a lot of compositors.
>
> Where did you get that impression from?
Kernel space seems to not handle that support very well. We added the flag at some point for drivers, but only a fraction actually implemented it.
I wasn't aware that the general eventfd implementation can handle it, but yeah when compositors use that one then that actually makes sense.
> It's arguably the main point of the syncobj Wayland protocol extension, which is supported by all major compositors (except Weston, where it's still a pending MR).
>
>
>> So as far as I can see using drm_syncobj for software rendering really doesn't make sense, eventfd is a much better fit for that use case.
>
> I agree with Julian's rebuttal to that.
That eventfd is missing the timeline functionality is a pretty good argument, but I'm still not sure if that justifies the extra kernel complexity.
Regards,
Christian.
^ permalink raw reply
* RE: [PATCH v11 2/6] iio: adc: ad4691: add initial driver for AD4691 family
From: Sabau, Radu bogdan @ 2026-05-18 15:05 UTC (permalink / raw)
To: Jonathan Cameron, Radu Sabau via B4 Relay
Cc: Lars-Peter Clausen, Hennerich, Michael, David Lechner, Sa, Nuno,
Andy Shevchenko, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Uwe Kleine-König, Liam Girdwood, Mark Brown, Linus Walleij,
Bartosz Golaszewski, Philipp Zabel, Jonathan Corbet, Shuah Khan,
linux-iio@vger.kernel.org, devicetree@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-pwm@vger.kernel.org,
linux-gpio@vger.kernel.org, linux-doc@vger.kernel.org
In-Reply-To: <20260517125241.7dbfb964@jic23-huawei>
> -----Original Message-----
> From: Jonathan Cameron <jic23@kernel.org>
> Sent: Sunday, May 17, 2026 2:53 PM
...
> > +static int ad4691_reset(struct ad4691_state *st)
> > +{
> > + struct device *dev = regmap_get_device(st->regmap);
> > + struct reset_control *rst;
> > +
> > + rst = devm_reset_control_get_optional_exclusive(dev, NULL);
> > + if (IS_ERR(rst))
> > + return dev_err_probe(dev, PTR_ERR(rst), "Failed to get
> reset\n");
> > +
> > + if (rst) {
> > + /*
> > + * Assert the reset line before sleeping to guarantee a proper
> > + * reset pulse on every probe, including driver reloads where
> > + * the line may already be deasserted (reset_control_put()
> does
> > + * not re-assert on release).
> > + * devm_reset_control_get_optional_exclusive_deasserted()
> cannot
> > + * be used because it deasserts immediately without delay; the
> > + * datasheet (Table 5) requires a ≥300 µs reset pulse width
> > + * before deassertion.
> > + */
> > + reset_control_assert(rst);
> > + fsleep(300);
> > + return reset_control_deassert(rst);
> Sashiko makes the reasonable point that we'd kind of expect some time
> between
> that pin dropping the device out of reset and it being able to respond. If it
> really is that quick - then add a comment.
>
> https://urldefense.com/v3/__https://sashiko.dev/*/patchset/20260515-
> ad4692-multichannel-sar-adc-driver-v11-0-
> eab27d852ac2*40analog.com__;IyU!!A3Ni8CS0y2Y!5I5rXG5US4TFIZ_cAbgcy
> gH-_Cbt6wLDZ5jVeBiPSDg5KuzEZT-CMN4Z3aFYYVXH6Kx4f2ClJcbr9w$
> > + }
> > +
> > + /* No hardware reset available, fall back to software reset. */
> > + return regmap_write(st->regmap, AD4691_SPI_CONFIG_A_REG,
> > + AD4691_SW_RESET);
> Same applies here.
>
He is right. And also I think I misread the datasheet, the 300us should happen
after deassertion and actual RESETL time is 10ns minimum which would be
covered by the overhead itself.
I think the reason to why this was never a problem while testing is because
300us is the recommended time and the minimum one could be much lower
than this, so successful probe was nothing but luck on my end.
I will make sure to update the function accordingly.
^ permalink raw reply
* Re: [PATCH v11 2/6] iio: adc: ad4691: add initial driver for AD4691 family
From: David Lechner @ 2026-05-18 15:05 UTC (permalink / raw)
To: Sabau, Radu bogdan, Lars-Peter Clausen, Hennerich, Michael,
Jonathan Cameron, Sa, Nuno, Andy Shevchenko, Rob Herring,
Krzysztof Kozlowski, Conor Dooley, Uwe Kleine-König,
Liam Girdwood, Mark Brown, Linus Walleij, Bartosz Golaszewski,
Philipp Zabel, Jonathan Corbet, Shuah Khan
Cc: linux-iio@vger.kernel.org, devicetree@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-pwm@vger.kernel.org,
linux-gpio@vger.kernel.org, linux-doc@vger.kernel.org
In-Reply-To: <LV9PR03MB841445D5BD1087FB3204EBD9F7032@LV9PR03MB8414.namprd03.prod.outlook.com>
On 5/18/26 9:59 AM, Sabau, Radu bogdan wrote:
>> -----Original Message-----
>> From: David Lechner <dlechner@baylibre.com>
>> Sent: Saturday, May 16, 2026 8:11 PM
>
> ...
>
>>> +static int ad4691_reg_read(void *context, unsigned int reg, unsigned int
>> *val)
>>> +{
>>> + struct spi_device *spi = context;
>>> + u8 tx[2], rx[4];
>>> + int ret;
>>> +
>>> + /* Set bit 15 to mark the operation as READ. */
>>
>> Can't we just set read_flag_mask in the regmap config?
>>
>
> As far as I can tell read_flag_mask is applied by the standard SPI regmap bus
> backend, which constructs and sends the address byte itself before reading
> the response. When using devm_regmap_init() with custom reg_read/reg_write
> callbacks, the regmap core calls those callbacks directly with the raw register
> address - it never touches read_flag_mask.
>
>>> + put_unaligned_be16(0x8000 | reg, tx);
>>> +
>>> + switch (reg) {
>>> + case 0 ... AD4691_OSC_FREQ_REG:
>>> + case AD4691_SPARE_CONTROL ... AD4691_ACC_MASK_REG - 1:
>
> ...
>
>>> +static int ad4691_write_raw(struct iio_dev *indio_dev,
>>> + struct iio_chan_spec const *chan,
>>> + int val, int val2, long mask)
>>> +{
>>> + switch (mask) {
>>> + case IIO_CHAN_INFO_SAMP_FREQ:
>>
>> Should we aquire direct mode so that we can't change the rate during
>> buffered read?
>>
>
> It is in set_sampling_freq already. Do you think it would make more sense
> to move it here in order to help readability?
>
IIRC, I think it was resolved in a later patch in the series. So
could just be a problem of it not getting added in the right patch.
In general though, yes it would make it easier review if the
direct mode claim was made here.
^ permalink raw reply
* Re: [PATCH v13 3/4] gpio: rpmsg: add generic rpmsg GPIO driver
From: Shah, Tanmay @ 2026-05-18 15:01 UTC (permalink / raw)
To: Shenwei Wang, Mathieu Poirier
Cc: Arnaud POULIQUEN, Beleswar Prasad Padhi, Andrew Lunn,
Linus Walleij, Bartosz Golaszewski, Jonathan Corbet, Rob Herring,
Krzysztof Kozlowski, Conor Dooley, Bjorn Andersson, Frank Li,
Sascha Hauer, Shuah Khan, linux-gpio@vger.kernel.org,
linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
Pengutronix Kernel Team, Fabio Estevam, Peng Fan,
devicetree@vger.kernel.org, linux-remoteproc@vger.kernel.org,
imx@lists.linux.dev, linux-arm-kernel@lists.infradead.org,
dl-linux-imx, Bartosz Golaszewski
In-Reply-To: <PAXPR04MB918587A8812B51BBB2A46A2C89032@PAXPR04MB9185.eurprd04.prod.outlook.com>
On 5/18/2026 9:24 AM, Shenwei Wang wrote:
>
>
>
>>
>> On Thu, May 07, 2026 at 07:43:33PM +0000, Shenwei Wang wrote:
>>>
>>>
>>>> That was my initial approach. We don't even need an additional
>>>> "rpmsg-io-*" in rpmsg_gpio_channel_id_table[]. All we need is:
>>>>
>>>> /* rpmsg devices and drivers are matched using the service name */
>>>> static inline int rpmsg_id_match(const struct rpmsg_device *rpdev,
>>>> const struct rpmsg_device_id *id) {
>>>> + size_t len = strnlen(id->name, RPMSG_NAME_SIZE);
>>>>
>>>> - return strncmp(id->name, rpdev->id.name, RPMSG_NAME_SIZE) == 0;
>>>> + return strncmp(id->name, rpdev->id.name, len) == 0;
>>>> }
>>>>
>>>
>>> If we encode the port index directly into ept->src, for example:
>>>
>>> ept->src = (baseaddr << 8) | port_index;
>>>
>>
>> There is no rpmsg_endpoint::src. You likely meant ept->addr. This would work
>> but not optimal on two front:
>>
>> (1) rpms_endpoint::addr is a u32 and idr_alloc() returns an 'int'. As such there is a
>> possibility of conflict. I concede the possibility is marginal, but it still exists.
>>
>
> I think there may be a misunderstanding in the implementation. In this case, we do not
> need the return value from idr_alloc.
>
> When the driver calls rpmsg_create_ept, it can pass an rpmsg_channel_info structure as an
> input parameter. This allows you to specify the source address you want to bind.
> Please refer to the definitions below:
>
> struct rpmsg_endpoint *rpmsg_create_ept(struct rpmsg_device *rpdev,
> rpmsg_rx_cb_t cb, void *priv,
> struct rpmsg_channel_info chinfo)
>
> struct rpmsg_channel_info {
> char name[RPMSG_NAME_SIZE];
> u32 src;
> u32 dst;
> };
>
>> (2) By proceeding this way, the kernel exposes the GPIO controller it knows
>> about. It is preferrable to have the remote processor tell the kernel about the
>> GPIO controller it wants.
>>
>
> If everyone agrees with this namespace announcement approach, I will prepare the
> next revision based on it, even though it is not as clean as the source address encoding solution.
>
I have ack on the namespace announcement approach. To me it is very
simple contract between the firmware and the Linux which allows dynamic
endpoint allocation without giving up security concerns. Also this
approach can be easily extended for any other rpmsg devices like i2c,
spi etc.. With the fix in the rpmsg_core.c, this will work. I will have
to see the actual implementation in the next rev to provide further
feedback.
Thanks,
Tanmay
> Shenwei
>
>> I am done reviewing this revision. Given the amount of refactoring needed, I will
>> not look at the code. Please refer to this reply [1] for what I am expecting in the
>> next revision.
>>
>
^ permalink raw reply
* RE: [PATCH v11 2/6] iio: adc: ad4691: add initial driver for AD4691 family
From: Sabau, Radu bogdan @ 2026-05-18 14:59 UTC (permalink / raw)
To: David Lechner, Lars-Peter Clausen, Hennerich, Michael,
Jonathan Cameron, Sa, Nuno, Andy Shevchenko, Rob Herring,
Krzysztof Kozlowski, Conor Dooley, Uwe Kleine-König,
Liam Girdwood, Mark Brown, Linus Walleij, Bartosz Golaszewski,
Philipp Zabel, Jonathan Corbet, Shuah Khan
Cc: linux-iio@vger.kernel.org, devicetree@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-pwm@vger.kernel.org,
linux-gpio@vger.kernel.org, linux-doc@vger.kernel.org
In-Reply-To: <0696b662-f478-4d1a-95e0-0338bbdb719e@baylibre.com>
> -----Original Message-----
> From: David Lechner <dlechner@baylibre.com>
> Sent: Saturday, May 16, 2026 8:11 PM
...
> > +static int ad4691_reg_read(void *context, unsigned int reg, unsigned int
> *val)
> > +{
> > + struct spi_device *spi = context;
> > + u8 tx[2], rx[4];
> > + int ret;
> > +
> > + /* Set bit 15 to mark the operation as READ. */
>
> Can't we just set read_flag_mask in the regmap config?
>
As far as I can tell read_flag_mask is applied by the standard SPI regmap bus
backend, which constructs and sends the address byte itself before reading
the response. When using devm_regmap_init() with custom reg_read/reg_write
callbacks, the regmap core calls those callbacks directly with the raw register
address - it never touches read_flag_mask.
> > + put_unaligned_be16(0x8000 | reg, tx);
> > +
> > + switch (reg) {
> > + case 0 ... AD4691_OSC_FREQ_REG:
> > + case AD4691_SPARE_CONTROL ... AD4691_ACC_MASK_REG - 1:
...
> > +static int ad4691_write_raw(struct iio_dev *indio_dev,
> > + struct iio_chan_spec const *chan,
> > + int val, int val2, long mask)
> > +{
> > + switch (mask) {
> > + case IIO_CHAN_INFO_SAMP_FREQ:
>
> Should we aquire direct mode so that we can't change the rate during
> buffered read?
>
It is in set_sampling_freq already. Do you think it would make more sense
to move it here in order to help readability?
^ permalink raw reply
* Re: [PATCH 00/12] misc/syncobj: add /dev/syncobj device
From: Michel Dänzer @ 2026-05-18 14:59 UTC (permalink / raw)
To: Christian König, Julian Orth
Cc: Maarten Lankhorst, Maxime Ripard, Thomas Zimmermann, David Airlie,
Simona Vetter, Sumit Semwal, Jonathan Corbet, Shuah Khan,
Arnd Bergmann, Greg Kroah-Hartman, dri-devel, linux-kernel,
linux-media, linaro-mm-sig, linux-doc, wayland-devel
In-Reply-To: <69dcbcc1-da58-4d34-bfb0-5c8d33b75d59@amd.com>
On 5/18/26 14:41, Christian König wrote:
> On 5/18/26 14:02, Julian Orth wrote:
>> On Mon, May 18, 2026 at 1:58 PM Christian König
>> <christian.koenig@amd.com> wrote:
>>> On 5/16/26 13:06, Julian Orth wrote:
>>>> This series adds a new device /dev/syncobj that can be used to create
>>>> and manipulate DRM syncobjs. Previously, these operations required the
>>>> use of a DRM device and the device needed to support the DRIVER_SYNCOBJ
>>>> and DRIVER_SYNCOBJ_TIMELINE features.
>>>>
>>>> There are several issues with the existing API:
>>>>
>>>> - Syncobjs are the only explicit sync mechanism available on wayland.
>>>> Most compositors do not use GPU waits. Instead, they use the
>>>> DRM_IOCTL_SYNCOBJ_EVENTFD ioctl to perform a CPU wait. Being tied to
>>>> DRM devices means that compositors cannot consistently offer this
>>>> feature even though no device-specific logic is involved.
>>>
>>> Well the drm_syncobj is a container for device specific dma fences.
>>
>> Not necessarily. The DRM_IOCTL_SYNCOBJ_TIMELINE_SIGNAL ioctl attaches
>> some kind of dummy fence that is already signaled. I don't believe
>> this is device specific. That is also the path that llvmpipe would
>> use.
>
> Yeah I feared that.
>
> This is the wait before signal path and if I'm not completely mistaken that one is not supported by a lot of compositors.
Where did you get that impression from?
It's arguably the main point of the syncobj Wayland protocol extension, which is supported by all major compositors (except Weston, where it's still a pending MR).
> So as far as I can see using drm_syncobj for software rendering really doesn't make sense, eventfd is a much better fit for that use case.
I agree with Julian's rebuttal to that.
--
Earthling Michel Dänzer \ GNOME / Xwayland / Mesa developer
https://redhat.com \ Libre software enthusiast
^ permalink raw reply
* Re: [PATCH mm-unstable v17 02/14] mm/khugepaged: generalize alloc_charge_folio()
From: Lance Yang @ 2026-05-18 14:49 UTC (permalink / raw)
To: Usama Arif, Nico Pache
Cc: linux-doc, linux-kernel, linux-mm, linux-trace-kernel, akpm,
anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat, mhocko,
peterx, pfalcato, rakie.kim, raquini, rdunlap, richard.weiyang,
rientjes, rostedt, rppt, ryan.roberts, shivankg, sunnanyong,
surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
zokeefe
In-Reply-To: <20260518115553.3513034-1-usama.arif@linux.dev>
On 2026/5/18 19:55, Usama Arif wrote:
[...]
>> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
>> index 979885694351..f0e29d5c7b1f 100644
>> --- a/mm/khugepaged.c
>> +++ b/mm/khugepaged.c
>> @@ -1068,21 +1068,26 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
>> }
>>
>> static enum scan_result alloc_charge_folio(struct folio **foliop, struct mm_struct *mm,
>> - struct collapse_control *cc)
>> + struct collapse_control *cc, unsigned int order)
>> {
>> gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() :
>> GFP_TRANSHUGE);
>> int node = collapse_find_target_node(cc);
>> struct folio *folio;
>>
>> - folio = __folio_alloc(gfp, HPAGE_PMD_ORDER, node, &cc->alloc_nmask);
>> + folio = __folio_alloc(gfp, order, node, &cc->alloc_nmask);
>> if (!folio) {
>> *foliop = NULL;
>> - count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
>> + if (is_pmd_order(order))
>> + count_vm_event(THP_COLLAPSE_ALLOC_FAILED);
>> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC_FAILED);
>> return SCAN_ALLOC_HUGE_PAGE_FAIL;
>> }
>>
>> - count_vm_event(THP_COLLAPSE_ALLOC);
>> + if (is_pmd_order(order))
>> + count_vm_event(THP_COLLAPSE_ALLOC);
>> + count_mthp_stat(order, MTHP_STAT_COLLAPSE_ALLOC);
>> +
>
> The vmstat THP_COLLAPSE_ALLOC counter is pmd order only.
> But after this we have
>
> count_memcg_folio_events(folio, THP_COLLAPSE_ALLOC, 1);
>
> which is not being guarded with is_pmd_order().
Good catch!
>
> I think we want this to be pmd order only as well so that
> the meaning of the vmstat and cgroup counter remains the same?
Agreed. THP_COLLAPSE_ALLOC should remain PMD order only for
vmstat and memcg events.
So this should be guarded with is_pmd_order() as well :)
Cheers, Lance
^ permalink raw reply
* Re: [PATCH v11 3/6] iio: adc: ad4691: add triggered buffer support
From: David Lechner @ 2026-05-18 14:36 UTC (permalink / raw)
To: Jonathan Cameron
Cc: radu.sabau, Lars-Peter Clausen, Michael Hennerich, Nuno Sá,
Andy Shevchenko, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Uwe Kleine-König, Liam Girdwood, Mark Brown, Linus Walleij,
Bartosz Golaszewski, Philipp Zabel, Jonathan Corbet, Shuah Khan,
linux-iio, devicetree, linux-kernel, linux-pwm, linux-gpio,
linux-doc
In-Reply-To: <20260518152103.4d428c1e@jic23-huawei>
On 5/18/26 9:21 AM, Jonathan Cameron wrote:
> On Sun, 17 May 2026 14:21:30 -0500
> David Lechner <dlechner@baylibre.com> wrote:
>
>> On 5/17/26 7:25 AM, Jonathan Cameron wrote:
>>> On Sat, 16 May 2026 12:32:51 -0500
>>> David Lechner <dlechner@baylibre.com> wrote:
>>>
>>>> On 5/15/26 8:31 AM, Radu Sabau via B4 Relay wrote:
>>>>> From: Radu Sabau <radu.sabau@analog.com>
>>>>>
>>>>> Add buffered capture support using the IIO triggered buffer framework.
>>>>>
>>>>> CNV Burst Mode: the GP pin identified by interrupt-names in the device
>>>>> tree is configured as DATA_READY output. The IRQ handler stops
>>>>> conversions and fires the IIO trigger; the trigger handler executes a
>>>>> pre-built SPI message that reads all active channels from the AVG_IN
>>>>> accumulator registers and then resets accumulator state and restarts
>>>>> conversions for the next cycle.
>>>>>
>>>>> Manual Mode: CNV is tied to SPI CS so each transfer simultaneously
>>>>> reads the previous result and starts the next conversion (pipelined
>>>>> N+1 scheme). At preenable time a pre-built, optimised SPI message of
>>>>> N+1 transfers is constructed (N channel reads plus one NOOP to drain
>>>>> the pipeline). The trigger handler executes the message in a single
>>>>> spi_sync() call and collects the results. An external trigger (e.g.
>>>>> iio-trig-hrtimer) is required to drive the trigger at the desired
>>>>> sample rate.
>>>>>
>>>>> Both modes share the same trigger handler and push a complete scan —
>>>>> one big-endian 16-bit (__be16) slot per active channel, densely packed
>>>>> in scan_index order, followed by a timestamp.
>>>>>
>>>>> The CNV Burst Mode sampling frequency (PWM period) is exposed as a
>>>>> buffer-level attribute via IIO_DEVICE_ATTR.
>>>>>
>>>>> Signed-off-by: Radu Sabau <radu.sabau@analog.com>
>>>
>>>>> +
>>>>> +static int ad4691_manual_buffer_preenable(struct iio_dev *indio_dev)
>>>>> +{
>>>>> + struct ad4691_state *st = iio_priv(indio_dev);
>>>>> + unsigned int k, i;
>>>>> + int ret;
>>>>> +
>>>>> + memset(st->scan_xfers, 0, sizeof(st->scan_xfers));
>>>>> + memset(st->scan_tx, 0, sizeof(st->scan_tx));
>>>>> +
>>>>> + spi_message_init(&st->scan_msg);
>>>>> +
>>>>> + k = 0;
>>>>> + iio_for_each_active_channel(indio_dev, i) {
>>>>> + if (i >= indio_dev->num_channels - 1)
>>>>> + break; /* skip soft timestamp */
>>>>
>>>> I don't think timestamp gets set in the scan mask. It is handled separately.
>>>
>>> FWIW that is a sashiko false postive (I believe anyway!)
>>> If we do hit this please shout as we have a core bug.
>>>
>>> If anyone has time to look at how hard it would be to tweak
>>> iio_for_each_active_channel to skip a last element timestamp that
>>> would be great.
>>>
>>> I think that iterates one too far which is what sashiko is tripping over.
>>>
>>> I'm only keen to fix that if we can make it low cost and hid it entirely
>>> from drivers.
>>>
>>> Jonathan
>>>
>> This is what I came up with (totally untested).
>>
>> Since timestamp can never be set in scan_mask/active_scan_mask, it should
>> be safe to exclude it from masklength without breaking existing code.
> Probably...
>>
>> I didn't check all callers of masklength/iio_get_masklength() though.
>
> That was the bit that made me nervous. Particularly if there is an off
> by one that is working by luck today - or someone who understood this
> oddity and did it deliberately.
>
> At one point we also had a few other timestamps - the ones come from hardware.
> I can't remember how we handled those wrt to the scan mask. I took a quick
> look and thing they are all fine.
> FWIW a nice precursor would be to make sure all timestamp channels are assigned
> using the macro. There are a few that are hand crafted. I tested a few, but obviously
> needs turning in to a proper set and cleaning up.
>
> diff --git a/drivers/iio/adc/ad4170-4.c b/drivers/iio/adc/ad4170-4.c
> index 627cbf5a37b0..890e25294baa 100644
> --- a/drivers/iio/adc/ad4170-4.c
> +++ b/drivers/iio/adc/ad4170-4.c
> @@ -2385,9 +2385,7 @@ static int ad4170_parse_channels(struct iio_dev *indio_dev)
> }
>
> /* Add timestamp channel */
> - struct iio_chan_spec ts_chan = IIO_CHAN_SOFT_TIMESTAMP(chan_num);
> -
> - st->chans[chan_num] = ts_chan;
> + st->chans[chan_num] = IIO_CHAN_SOFT_TIMESTAMP(chan_num);
> num_channels = num_channels + 1;
>
> indio_dev->num_channels = num_channels;
> diff --git a/drivers/iio/adc/at91_adc.c b/drivers/iio/adc/at91_adc.c
> index 6e1930f7c65d..56baca1f5026 100644
> --- a/drivers/iio/adc/at91_adc.c
> +++ b/drivers/iio/adc/at91_adc.c
> @@ -521,13 +521,7 @@ static int at91_adc_channel_init(struct iio_dev *idev)
> }
> timestamp = chan_array + idx;
>
> - timestamp->type = IIO_TIMESTAMP;
> - timestamp->channel = -1;
> - timestamp->scan_index = idx;
> - timestamp->scan_type.sign = 's';
> - timestamp->scan_type.realbits = 64;
> - timestamp->scan_type.storagebits = 64;
> -
> + *timestamp = IIO_CHAN_SOFT_TIMESTAMP(idx);
> idev->channels = chan_array;
> return idev->num_channels;
> }
> diff --git a/drivers/iio/adc/cc10001_adc.c b/drivers/iio/adc/cc10001_adc.c
> index 2c51b90b7101..d42b747325aa 100644
> --- a/drivers/iio/adc/cc10001_adc.c
> +++ b/drivers/iio/adc/cc10001_adc.c
> @@ -262,7 +262,7 @@ static const struct iio_info cc10001_adc_info = {
> static int cc10001_adc_channel_init(struct iio_dev *indio_dev,
> unsigned long channel_map)
> {
> - struct iio_chan_spec *chan_array, *timestamp;
> + struct iio_chan_spec *chan_array;
> unsigned int bit, idx = 0;
>
> indio_dev->num_channels = bitmap_weight(&channel_map,
> @@ -289,13 +289,7 @@ static int cc10001_adc_channel_init(struct iio_dev *indio_dev,
> idx++;
> }
>
> - timestamp = &chan_array[idx];
> - timestamp->type = IIO_TIMESTAMP;
> - timestamp->channel = -1;
> - timestamp->scan_index = idx;
> - timestamp->scan_type.sign = 's';
> - timestamp->scan_type.realbits = 64;
> - timestamp->scan_type.storagebits = 64;
> + chan_array[idx] = IIO_CHAN_SOFT_TIMESTAMP(idx);
>
> indio_dev->channels = chan_array;
>
> diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
> index 96b05c86c325..702b2fc66326 100644
> --- a/include/linux/iio/iio.h
> +++ b/include/linux/iio/iio.h
> @@ -353,7 +353,7 @@ static inline bool iio_channel_has_available(const struct iio_chan_spec *chan,
> (chan->info_mask_shared_by_all_available & BIT(type));
> }
>
> -#define IIO_CHAN_SOFT_TIMESTAMP(_si) { \
> +#define IIO_CHAN_SOFT_TIMESTAMP(_si) (struct iio_chan_spec) { \
> .type = IIO_TIMESTAMP, \
> .channel = -1, \
> .scan_index = _si, \
>
> Doing that will mean we can spot any unusual use of IIO_TIMESTAMP much more
> easily.
>
> Anyhow, basic approach looks good to me.
I guess you didn't see the other series cleaning up IIO_TIMESTAMP I already
sent yet.
>
> Jonathan
>
>
>
>>
>> ---
>> diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
>> index 9d66510a1d49..17f539fc23e2 100644
>> --- a/drivers/iio/industrialio-buffer.c
>> +++ b/drivers/iio/industrialio-buffer.c
>> @@ -2300,8 +2300,10 @@ int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
>> if (channels) {
>> int ml = 0;
>>
>> - for (i = 0; i < indio_dev->num_channels; i++)
>> - ml = max(ml, channels[i].scan_index + 1);
>> + for (i = 0; i < indio_dev->num_channels; i++) {
>> + if (channels[i].type != IIO_TIMESTAMP)
>> + ml = max(ml, channels[i].scan_index + 1);
>> + }
>> ACCESS_PRIVATE(indio_dev, masklength) = ml;
>> }
>>
>>
>>
>>
>
^ permalink raw reply
* Re: [PATCH v13 3/4] gpio: rpmsg: add generic rpmsg GPIO driver
From: Shenwei Wang @ 2026-05-18 14:24 UTC (permalink / raw)
To: Mathieu Poirier
Cc: Arnaud POULIQUEN, Beleswar Prasad Padhi, Andrew Lunn,
Linus Walleij, Bartosz Golaszewski, Jonathan Corbet, Rob Herring,
Krzysztof Kozlowski, Conor Dooley, Bjorn Andersson, Frank Li,
Sascha Hauer, Shuah Khan, linux-gpio@vger.kernel.org,
linux-doc@vger.kernel.org, linux-kernel@vger.kernel.org,
Pengutronix Kernel Team, Fabio Estevam, Peng Fan,
devicetree@vger.kernel.org, linux-remoteproc@vger.kernel.org,
imx@lists.linux.dev, linux-arm-kernel@lists.infradead.org,
dl-linux-imx, Bartosz Golaszewski
In-Reply-To: <agYHzH-nJLl1HFIn@p14s>
>
> On Thu, May 07, 2026 at 07:43:33PM +0000, Shenwei Wang wrote:
> >
> >
> > > That was my initial approach. We don't even need an additional
> > > "rpmsg-io-*" in rpmsg_gpio_channel_id_table[]. All we need is:
> > >
> > > /* rpmsg devices and drivers are matched using the service name */
> > > static inline int rpmsg_id_match(const struct rpmsg_device *rpdev,
> > > const struct rpmsg_device_id *id) {
> > > + size_t len = strnlen(id->name, RPMSG_NAME_SIZE);
> > >
> > > - return strncmp(id->name, rpdev->id.name, RPMSG_NAME_SIZE) == 0;
> > > + return strncmp(id->name, rpdev->id.name, len) == 0;
> > > }
> > >
> >
> > If we encode the port index directly into ept->src, for example:
> >
> > ept->src = (baseaddr << 8) | port_index;
> >
>
> There is no rpmsg_endpoint::src. You likely meant ept->addr. This would work
> but not optimal on two front:
>
> (1) rpms_endpoint::addr is a u32 and idr_alloc() returns an 'int'. As such there is a
> possibility of conflict. I concede the possibility is marginal, but it still exists.
>
I think there may be a misunderstanding in the implementation. In this case, we do not
need the return value from idr_alloc.
When the driver calls rpmsg_create_ept, it can pass an rpmsg_channel_info structure as an
input parameter. This allows you to specify the source address you want to bind.
Please refer to the definitions below:
struct rpmsg_endpoint *rpmsg_create_ept(struct rpmsg_device *rpdev,
rpmsg_rx_cb_t cb, void *priv,
struct rpmsg_channel_info chinfo)
struct rpmsg_channel_info {
char name[RPMSG_NAME_SIZE];
u32 src;
u32 dst;
};
> (2) By proceeding this way, the kernel exposes the GPIO controller it knows
> about. It is preferrable to have the remote processor tell the kernel about the
> GPIO controller it wants.
>
If everyone agrees with this namespace announcement approach, I will prepare the
next revision based on it, even though it is not as clean as the source address encoding solution.
Shenwei
> I am done reviewing this revision. Given the amount of refactoring needed, I will
> not look at the code. Please refer to this reply [1] for what I am expecting in the
> next revision.
>
^ permalink raw reply
* Re: [PATCH v11 3/6] iio: adc: ad4691: add triggered buffer support
From: Jonathan Cameron @ 2026-05-18 14:21 UTC (permalink / raw)
To: David Lechner
Cc: radu.sabau, Lars-Peter Clausen, Michael Hennerich, Nuno Sá,
Andy Shevchenko, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Uwe Kleine-König, Liam Girdwood, Mark Brown, Linus Walleij,
Bartosz Golaszewski, Philipp Zabel, Jonathan Corbet, Shuah Khan,
linux-iio, devicetree, linux-kernel, linux-pwm, linux-gpio,
linux-doc
In-Reply-To: <58a66855-9fb3-48ca-8cae-ff9277f745df@baylibre.com>
On Sun, 17 May 2026 14:21:30 -0500
David Lechner <dlechner@baylibre.com> wrote:
> On 5/17/26 7:25 AM, Jonathan Cameron wrote:
> > On Sat, 16 May 2026 12:32:51 -0500
> > David Lechner <dlechner@baylibre.com> wrote:
> >
> >> On 5/15/26 8:31 AM, Radu Sabau via B4 Relay wrote:
> >>> From: Radu Sabau <radu.sabau@analog.com>
> >>>
> >>> Add buffered capture support using the IIO triggered buffer framework.
> >>>
> >>> CNV Burst Mode: the GP pin identified by interrupt-names in the device
> >>> tree is configured as DATA_READY output. The IRQ handler stops
> >>> conversions and fires the IIO trigger; the trigger handler executes a
> >>> pre-built SPI message that reads all active channels from the AVG_IN
> >>> accumulator registers and then resets accumulator state and restarts
> >>> conversions for the next cycle.
> >>>
> >>> Manual Mode: CNV is tied to SPI CS so each transfer simultaneously
> >>> reads the previous result and starts the next conversion (pipelined
> >>> N+1 scheme). At preenable time a pre-built, optimised SPI message of
> >>> N+1 transfers is constructed (N channel reads plus one NOOP to drain
> >>> the pipeline). The trigger handler executes the message in a single
> >>> spi_sync() call and collects the results. An external trigger (e.g.
> >>> iio-trig-hrtimer) is required to drive the trigger at the desired
> >>> sample rate.
> >>>
> >>> Both modes share the same trigger handler and push a complete scan —
> >>> one big-endian 16-bit (__be16) slot per active channel, densely packed
> >>> in scan_index order, followed by a timestamp.
> >>>
> >>> The CNV Burst Mode sampling frequency (PWM period) is exposed as a
> >>> buffer-level attribute via IIO_DEVICE_ATTR.
> >>>
> >>> Signed-off-by: Radu Sabau <radu.sabau@analog.com>
> >
> >>> +
> >>> +static int ad4691_manual_buffer_preenable(struct iio_dev *indio_dev)
> >>> +{
> >>> + struct ad4691_state *st = iio_priv(indio_dev);
> >>> + unsigned int k, i;
> >>> + int ret;
> >>> +
> >>> + memset(st->scan_xfers, 0, sizeof(st->scan_xfers));
> >>> + memset(st->scan_tx, 0, sizeof(st->scan_tx));
> >>> +
> >>> + spi_message_init(&st->scan_msg);
> >>> +
> >>> + k = 0;
> >>> + iio_for_each_active_channel(indio_dev, i) {
> >>> + if (i >= indio_dev->num_channels - 1)
> >>> + break; /* skip soft timestamp */
> >>
> >> I don't think timestamp gets set in the scan mask. It is handled separately.
> >
> > FWIW that is a sashiko false postive (I believe anyway!)
> > If we do hit this please shout as we have a core bug.
> >
> > If anyone has time to look at how hard it would be to tweak
> > iio_for_each_active_channel to skip a last element timestamp that
> > would be great.
> >
> > I think that iterates one too far which is what sashiko is tripping over.
> >
> > I'm only keen to fix that if we can make it low cost and hid it entirely
> > from drivers.
> >
> > Jonathan
> >
> This is what I came up with (totally untested).
>
> Since timestamp can never be set in scan_mask/active_scan_mask, it should
> be safe to exclude it from masklength without breaking existing code.
Probably...
>
> I didn't check all callers of masklength/iio_get_masklength() though.
That was the bit that made me nervous. Particularly if there is an off
by one that is working by luck today - or someone who understood this
oddity and did it deliberately.
At one point we also had a few other timestamps - the ones come from hardware.
I can't remember how we handled those wrt to the scan mask. I took a quick
look and thing they are all fine.
FWIW a nice precursor would be to make sure all timestamp channels are assigned
using the macro. There are a few that are hand crafted. I tested a few, but obviously
needs turning in to a proper set and cleaning up.
diff --git a/drivers/iio/adc/ad4170-4.c b/drivers/iio/adc/ad4170-4.c
index 627cbf5a37b0..890e25294baa 100644
--- a/drivers/iio/adc/ad4170-4.c
+++ b/drivers/iio/adc/ad4170-4.c
@@ -2385,9 +2385,7 @@ static int ad4170_parse_channels(struct iio_dev *indio_dev)
}
/* Add timestamp channel */
- struct iio_chan_spec ts_chan = IIO_CHAN_SOFT_TIMESTAMP(chan_num);
-
- st->chans[chan_num] = ts_chan;
+ st->chans[chan_num] = IIO_CHAN_SOFT_TIMESTAMP(chan_num);
num_channels = num_channels + 1;
indio_dev->num_channels = num_channels;
diff --git a/drivers/iio/adc/at91_adc.c b/drivers/iio/adc/at91_adc.c
index 6e1930f7c65d..56baca1f5026 100644
--- a/drivers/iio/adc/at91_adc.c
+++ b/drivers/iio/adc/at91_adc.c
@@ -521,13 +521,7 @@ static int at91_adc_channel_init(struct iio_dev *idev)
}
timestamp = chan_array + idx;
- timestamp->type = IIO_TIMESTAMP;
- timestamp->channel = -1;
- timestamp->scan_index = idx;
- timestamp->scan_type.sign = 's';
- timestamp->scan_type.realbits = 64;
- timestamp->scan_type.storagebits = 64;
-
+ *timestamp = IIO_CHAN_SOFT_TIMESTAMP(idx);
idev->channels = chan_array;
return idev->num_channels;
}
diff --git a/drivers/iio/adc/cc10001_adc.c b/drivers/iio/adc/cc10001_adc.c
index 2c51b90b7101..d42b747325aa 100644
--- a/drivers/iio/adc/cc10001_adc.c
+++ b/drivers/iio/adc/cc10001_adc.c
@@ -262,7 +262,7 @@ static const struct iio_info cc10001_adc_info = {
static int cc10001_adc_channel_init(struct iio_dev *indio_dev,
unsigned long channel_map)
{
- struct iio_chan_spec *chan_array, *timestamp;
+ struct iio_chan_spec *chan_array;
unsigned int bit, idx = 0;
indio_dev->num_channels = bitmap_weight(&channel_map,
@@ -289,13 +289,7 @@ static int cc10001_adc_channel_init(struct iio_dev *indio_dev,
idx++;
}
- timestamp = &chan_array[idx];
- timestamp->type = IIO_TIMESTAMP;
- timestamp->channel = -1;
- timestamp->scan_index = idx;
- timestamp->scan_type.sign = 's';
- timestamp->scan_type.realbits = 64;
- timestamp->scan_type.storagebits = 64;
+ chan_array[idx] = IIO_CHAN_SOFT_TIMESTAMP(idx);
indio_dev->channels = chan_array;
diff --git a/include/linux/iio/iio.h b/include/linux/iio/iio.h
index 96b05c86c325..702b2fc66326 100644
--- a/include/linux/iio/iio.h
+++ b/include/linux/iio/iio.h
@@ -353,7 +353,7 @@ static inline bool iio_channel_has_available(const struct iio_chan_spec *chan,
(chan->info_mask_shared_by_all_available & BIT(type));
}
-#define IIO_CHAN_SOFT_TIMESTAMP(_si) { \
+#define IIO_CHAN_SOFT_TIMESTAMP(_si) (struct iio_chan_spec) { \
.type = IIO_TIMESTAMP, \
.channel = -1, \
.scan_index = _si, \
Doing that will mean we can spot any unusual use of IIO_TIMESTAMP much more
easily.
Anyhow, basic approach looks good to me.
Jonathan
>
> ---
> diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c
> index 9d66510a1d49..17f539fc23e2 100644
> --- a/drivers/iio/industrialio-buffer.c
> +++ b/drivers/iio/industrialio-buffer.c
> @@ -2300,8 +2300,10 @@ int iio_buffers_alloc_sysfs_and_mask(struct iio_dev *indio_dev)
> if (channels) {
> int ml = 0;
>
> - for (i = 0; i < indio_dev->num_channels; i++)
> - ml = max(ml, channels[i].scan_index + 1);
> + for (i = 0; i < indio_dev->num_channels; i++) {
> + if (channels[i].type != IIO_TIMESTAMP)
> + ml = max(ml, channels[i].scan_index + 1);
> + }
> ACCESS_PRIVATE(indio_dev, masklength) = ml;
> }
>
>
>
>
^ permalink raw reply related
* Re: [PATCH v3 2/2] cpufreq: CPPC: add autonomous mode boot parameter support
From: Mario Limonciello @ 2026-05-18 14:21 UTC (permalink / raw)
To: Sumit Gupta, rafael, viresh.kumar, pierre.gondois,
ionela.voinescu, zhenglifeng1, zhanjie9, corbet, skhan, rdunlap,
linux-pm, linux-doc, linux-kernel
Cc: linux-tegra, treding, jonathanh, vsethi, ksitaraman, sanjayc,
mochs, bbasu
In-Reply-To: <e1a546f2-6e7e-4236-97bb-f72bea0137f7@nvidia.com>
On 5/18/26 09:15, Sumit Gupta wrote:
>
> On 18/05/26 19:20, Mario Limonciello wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> On 5/18/26 08:44, Sumit Gupta wrote:
>>> Hi Mario,
>>>
>>>
>>> On 16/05/26 02:43, Mario Limonciello wrote:
>>>> External email: Use caution opening links or attachments
>>>>
>>>>
>>>> On 5/15/26 07:26, Sumit Gupta wrote:
>>>>> Add a kernel boot parameter 'cppc_cpufreq.auto_sel_mode' to enable
>>>>> CPPC autonomous performance selection on all CPUs at system startup.
>>>>> When autonomous mode is enabled, the hardware automatically adjusts
>>>>> CPU performance based on workload demands using Energy Performance
>>>>> Preference (EPP) hints.
>>>>>
>>>>> When the parameter is set:
>>>>> - Configure all CPUs for autonomous operation on first init
>>>>> - Use HW min/max_perf when available; otherwise initialize from caps
>>>>> - Initialize desired_perf to max_perf as a starting hint
>>>>> - Hardware controls frequency instead of the OS governor
>>>>> - EPP behavior depends on parameter value:
>>>>> - performance (or 1): override EPP to performance preference (0x0)
>>>>> - default_epp (or 2): preserve EPP value programmed by BIOS/
>>>>> firmware
>>>>>
>>>>> The boot parameter is applied only during first policy initialization.
>>>>> Skip applying it on CPU hotplug to preserve runtime sysfs
>>>>> configuration.
>>>>>
>>>>> This patch depends on patch series [1] ("cpufreq: Set policy->min and
>>>>> max as real QoS constraints") so that the policy->min/max set in
>>>>> cppc_cpufreq_cpu_init() are not overridden by cpufreq_set_policy()
>>>>> during init.
>>>>>
>>>>> Signed-off-by: Sumit Gupta <sumitg@nvidia.com>
>>>>> ---
>>>>> [1] https://lore.kernel.org/lkml/20260511135538.522653-1-
>>>>> pierre.gondois@arm.com/
>>>>> ---
>>>>> .../admin-guide/kernel-parameters.txt | 16 +++
>>>>> drivers/cpufreq/cppc_cpufreq.c | 122 +++++++++++++
>>>>> ++++-
>>>>> 2 files changed, 133 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/
>>>>> Documentation/admin-guide/kernel-parameters.txt
>>>>> index 0eb64aab3685..7e4b3a8fd76f 100644
>>>>> --- a/Documentation/admin-guide/kernel-parameters.txt
>>>>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>>>>> @@ -1048,6 +1048,22 @@ Kernel parameters
>>>>> policy to use. This governor must be registered
>>>>> in the
>>>>> kernel before the cpufreq driver probes.
>>>>>
>>>>> + cppc_cpufreq.auto_sel_mode=
>>>>> + [CPU_FREQ] Enable ACPI CPPC autonomous
>>>>> performance
>>>>> + selection. When enabled, hardware automatically
>>>>> adjusts
>>>>> + CPU frequency on all CPUs based on workload
>>>>> demands.
>>>>> + In Autonomous mode, Energy Performance
>>>>> Preference (EPP)
>>>>> + hints guide hardware toward performance (0x0)
>>>>> or energy
>>>>> + efficiency (0xff).
>>>>> + Requires ACPI CPPC autonomous selection register
>>>>> + support.
>>>>> + Accepts:
>>>>> + performance, 1: enable auto_sel + set EPP to
>>>>> + performance (0x0)
>>>>> + default_epp, 2: enable auto_sel, preserve EPP
>>>>> value
>>>>> + programmed by BIOS/firmware
>>>>> + Unset: cpufreq governors are used (auto_sel
>>>>> disabled).
>>>>
>>>> Rather than unset doing nothing, have you considered having it take a
>>>> midpoint like 128? That's what we do in amd-pstate (default to
>>>> balance_performance). I think it turns into a reasonable balance.
>>>
>>> Thanks for the suggestion.
>>> I can add balance_performance that enables auto_sel with EPP=128 in v4.
>>>
>>> On changing the driver default (no param behavior) to auto enable
>>> balance_performance, it would be good to keep the current behavior for
>>> now since cppc_cpufreq is generic across ARM64/RISC-V platforms where
>>> EPP and Autonomous Selection registers are optional.
>>> A default change would affect existing users relying on governors.
>>>
>>> Thank you,
>>> Sumit Gupta
>>
>> But couldn't you make the "no module parameter set" follow the behavior
>> to only set the registers if they're available?
>>
>> So the systems that support it start using it, the ones that don't it's
>> a NOP.
>>
>
> Would it work to add balance_performance as a new mode in v4,
> and discuss changing the default separately as a follow-up?
>
Sure.
> Runtime detection helps for unsupported platforms. But platforms which
> support the registers use OS governors today, and silently switching
> them to autonomous mode on a kernel update is a behavior change for
> existing users. They would also have no way to boot into sw governor.
>
But hopefully it should be better battery life/responsiveness for those
scenarios too, right?
>
>
>>>
>>>
>>>>
>>>>> +
>>>>> cpu_init_udelay=N
>>>>> [X86,EARLY] Delay for N microsec between assert
>>>>> and de-assert
>>>>> of APIC INIT to start processors. This delay
>>>>> occurs
>>>>> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/
>>>>> cppc_cpufreq.c
>>>>> index 6b54427b52e1..5f4d735e7c7d 100644
>>>>> --- a/drivers/cpufreq/cppc_cpufreq.c
>>>>> +++ b/drivers/cpufreq/cppc_cpufreq.c
>>>>> @@ -28,6 +28,43 @@
>>>>>
>>>>> static struct cpufreq_driver cppc_cpufreq_driver;
>>>>>
>>>>> +/* Autonomous Selection boot parameter modes */
>>>>> +enum {
>>>>> + AUTO_SEL_PERFORMANCE = 1,
>>>>> + AUTO_SEL_DEFAULT_EPP = 2,
>>>>> +};
>>>>> +
>>>>> +static int auto_sel_mode;
>>>>> +
>>>>> +static int auto_sel_mode_set(const char *val, const struct
>>>>> kernel_param *kp)
>>>>> +{
>>>>> + if (sysfs_streq(val, "performance") || sysfs_streq(val, "1"))
>>>>> + *(int *)kp->arg = AUTO_SEL_PERFORMANCE;
>>>>> + else if (sysfs_streq(val, "default_epp") || sysfs_streq(val,
>>>>> "2"))
>>>>> + *(int *)kp->arg = AUTO_SEL_DEFAULT_EPP;
>>>>> + else
>>>>> + return -EINVAL;
>>>>> +
>>>>> + return 0;
>>>>> +}
>>>>> +
>>>>> +static int auto_sel_mode_get(char *buffer, const struct kernel_param
>>>>> *kp)
>>>>> +{
>>>>> + switch (*(int *)kp->arg) {
>>>>> + case AUTO_SEL_PERFORMANCE:
>>>>> + return sysfs_emit(buffer, "performance\n");
>>>>> + case AUTO_SEL_DEFAULT_EPP:
>>>>> + return sysfs_emit(buffer, "default_epp\n");
>>>>> + default:
>>>>> + return sysfs_emit(buffer, "disabled\n");
>>>>> + }
>>>>> +}
>>>>> +
>>>>> +static const struct kernel_param_ops auto_sel_mode_ops = {
>>>>> + .set = auto_sel_mode_set,
>>>>> + .get = auto_sel_mode_get,
>>>>> +};
>>>>> +
>>>>> #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
>>>>> static enum {
>>>>> FIE_UNSET = -1,
>>>>> @@ -715,11 +752,75 @@ static int cppc_cpufreq_cpu_init(struct
>>>>> cpufreq_policy *policy)
>>>>> policy->cur = cppc_perf_to_khz(caps, caps->highest_perf);
>>>>> cpu_data->perf_ctrls.desired_perf = caps->highest_perf;
>>>>>
>>>>> - ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>>>>> - if (ret) {
>>>>> - pr_debug("Err setting perf value:%d on CPU:%d. ret:
>>>>> %d\n",
>>>>> - caps->highest_perf, cpu, ret);
>>>>> - goto out;
>>>>> + /*
>>>>> + * Enable autonomous mode on first init if boot param is set.
>>>>> + * Check last_governor to detect first init and skip if auto_sel
>>>>> + * is already enabled.
>>>>> + */
>>>>> + if (auto_sel_mode && policy->last_governor[0] == '\0' &&
>>>>> + !cpu_data->perf_ctrls.auto_sel) {
>>>>> + /* Init min/max_perf from caps if not already set by
>>>>> HW. */
>>>>> + if (!cpu_data->perf_ctrls.min_perf)
>>>>> + cpu_data->perf_ctrls.min_perf = caps-
>>>>> >lowest_nonlinear_perf;
>>>>> + if (!cpu_data->perf_ctrls.max_perf)
>>>>> + cpu_data->perf_ctrls.max_perf = policy-
>>>>> >boost_enabled ?
>>>>> + caps->highest_perf : caps->nominal_perf;
>>>>> +
>>>>> + /*
>>>>> + * In autonomous mode desired_perf is only a hint;
>>>>> EPP and
>>>>> + * the platform drive actual selection within [min,
>>>>> max].
>>>>> + * Initialize it to max_perf so HW starts at the upper
>>>>> bound.
>>>>> + */
>>>>> + cpu_data->perf_ctrls.desired_perf = cpu_data-
>>>>> >perf_ctrls.max_perf;
>>>>> +
>>>>> + policy->cur = cppc_perf_to_khz(caps,
>>>>> + cpu_data->perf_ctrls.desired_perf);
>>>>> +
>>>>> + /*
>>>>> + * Override EPP only in 'performance' mode;
>>>>> 'default_epp' mode
>>>>> + * preserves the BIOS/firmware programmed EPP value.
>>>>> + * EPP is optional - some platforms may not support it.
>>>>> + */
>>>>> + if (auto_sel_mode == AUTO_SEL_PERFORMANCE) {
>>>>> + ret = cppc_set_epp(cpu,
>>>>> CPPC_EPP_PERFORMANCE_PREF);
>>>>> + if (ret && ret != -EOPNOTSUPP)
>>>>> + pr_warn("Failed to set EPP for CPU%d
>>>>> (%d)\n", cpu, ret);
>>>>> + else if (!ret)
>>>>> + cpu_data->perf_ctrls.energy_perf = CPPC_EPP_PERFORMANCE_PREF;
>>>>> + }
>>>>> +
>>>>> + /* Program min/max/desired into CPPC regs (non-fatal on
>>>>> failure). */
>>>>> + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>>>>> + if (ret)
>>>>> + pr_warn("set_perf failed CPU%d (%d); using HW
>>>>> values\n",
>>>>> + cpu, ret);
>>>>> +
>>>>> + ret = cppc_set_auto_sel(cpu, true);
>>>>> + if (ret && ret != -EOPNOTSUPP)
>>>>> + pr_warn("auto_sel CPU%d failed (%d); using OS
>>>>> mode\n",
>>>>> + cpu, ret);
>>>>> + else if (!ret)
>>>>> + cpu_data->perf_ctrls.auto_sel = true;
>>>>> + }
>>>>> +
>>>>> + if (cpu_data->perf_ctrls.auto_sel) {
>>>>> + /* Sync policy limits from HW when autonomous mode is
>>>>> active */
>>>>> + policy->min = cppc_perf_to_khz(caps,
>>>>> + cpu_data->perf_ctrls.min_perf ?:
>>>>> + caps->lowest_nonlinear_perf);
>>>>> + policy->max = cppc_perf_to_khz(caps,
>>>>> + cpu_data->perf_ctrls.max_perf ?:
>>>>> + (policy->boost_enabled ?
>>>>> + caps->highest_perf :
>>>>> + caps->nominal_perf));
>>>>> + } else {
>>>>> + /* Normal mode: governors control frequency */
>>>>> + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>>>>> + if (ret) {
>>>>> + pr_debug("Err setting perf value:%d on CPU:%d.
>>>>> ret:%d\n",
>>>>> + caps->highest_perf, cpu, ret);
>>>>> + goto out;
>>>>> + }
>>>>> }
>>>>>
>>>>> cppc_cpufreq_cpu_fie_init(policy);
>>>>> @@ -1079,10 +1180,21 @@ static int __init cppc_cpufreq_init(void)
>>>>>
>>>>> static void __exit cppc_cpufreq_exit(void)
>>>>> {
>>>>> + unsigned int cpu;
>>>>> +
>>>>> + for_each_present_cpu(cpu)
>>>>> + cppc_set_auto_sel(cpu, false);
>>>>> +
>>>>> cpufreq_unregister_driver(&cppc_cpufreq_driver);
>>>>> cppc_freq_invariance_exit();
>>>>> }
>>>>>
>>>>> +module_param_cb(auto_sel_mode, &auto_sel_mode_ops, &auto_sel_mode,
>>>>> 0444);
>>>>> +MODULE_PARM_DESC(auto_sel_mode,
>>>>> + "Enable CPPC autonomous performance selection at
>>>>> boot: "
>>>>> + "performance or 1 (EPP=performance), "
>>>>> + "default_epp or 2 (preserve BIOS/firmware EPP)");
>>>>> +
>>>>> module_exit(cppc_cpufreq_exit);
>>>>> MODULE_AUTHOR("Ashwin Chaugule");
>>>>> MODULE_DESCRIPTION("CPUFreq driver based on the ACPI CPPC v5.0+
>>>>> spec");
>>>>
>>
^ permalink raw reply
* Re: [PATCH v3 2/2] cpufreq: CPPC: add autonomous mode boot parameter support
From: Sumit Gupta @ 2026-05-18 14:15 UTC (permalink / raw)
To: Mario Limonciello, rafael, viresh.kumar, pierre.gondois,
ionela.voinescu, zhenglifeng1, zhanjie9, corbet, skhan, rdunlap,
linux-pm, linux-doc, linux-kernel
Cc: linux-tegra, treding, jonathanh, vsethi, ksitaraman, sanjayc,
mochs, bbasu
In-Reply-To: <72fd2fcc-6303-4980-beb7-e4b711ad6406@amd.com>
On 18/05/26 19:20, Mario Limonciello wrote:
> External email: Use caution opening links or attachments
>
>
> On 5/18/26 08:44, Sumit Gupta wrote:
>> Hi Mario,
>>
>>
>> On 16/05/26 02:43, Mario Limonciello wrote:
>>> External email: Use caution opening links or attachments
>>>
>>>
>>> On 5/15/26 07:26, Sumit Gupta wrote:
>>>> Add a kernel boot parameter 'cppc_cpufreq.auto_sel_mode' to enable
>>>> CPPC autonomous performance selection on all CPUs at system startup.
>>>> When autonomous mode is enabled, the hardware automatically adjusts
>>>> CPU performance based on workload demands using Energy Performance
>>>> Preference (EPP) hints.
>>>>
>>>> When the parameter is set:
>>>> - Configure all CPUs for autonomous operation on first init
>>>> - Use HW min/max_perf when available; otherwise initialize from caps
>>>> - Initialize desired_perf to max_perf as a starting hint
>>>> - Hardware controls frequency instead of the OS governor
>>>> - EPP behavior depends on parameter value:
>>>> - performance (or 1): override EPP to performance preference (0x0)
>>>> - default_epp (or 2): preserve EPP value programmed by
>>>> BIOS/firmware
>>>>
>>>> The boot parameter is applied only during first policy initialization.
>>>> Skip applying it on CPU hotplug to preserve runtime sysfs
>>>> configuration.
>>>>
>>>> This patch depends on patch series [1] ("cpufreq: Set policy->min and
>>>> max as real QoS constraints") so that the policy->min/max set in
>>>> cppc_cpufreq_cpu_init() are not overridden by cpufreq_set_policy()
>>>> during init.
>>>>
>>>> Signed-off-by: Sumit Gupta <sumitg@nvidia.com>
>>>> ---
>>>> [1] https://lore.kernel.org/lkml/20260511135538.522653-1-
>>>> pierre.gondois@arm.com/
>>>> ---
>>>> .../admin-guide/kernel-parameters.txt | 16 +++
>>>> drivers/cpufreq/cppc_cpufreq.c | 122
>>>> +++++++++++++++++-
>>>> 2 files changed, 133 insertions(+), 5 deletions(-)
>>>>
>>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/
>>>> Documentation/admin-guide/kernel-parameters.txt
>>>> index 0eb64aab3685..7e4b3a8fd76f 100644
>>>> --- a/Documentation/admin-guide/kernel-parameters.txt
>>>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>>>> @@ -1048,6 +1048,22 @@ Kernel parameters
>>>> policy to use. This governor must be registered
>>>> in the
>>>> kernel before the cpufreq driver probes.
>>>>
>>>> + cppc_cpufreq.auto_sel_mode=
>>>> + [CPU_FREQ] Enable ACPI CPPC autonomous
>>>> performance
>>>> + selection. When enabled, hardware automatically
>>>> adjusts
>>>> + CPU frequency on all CPUs based on workload
>>>> demands.
>>>> + In Autonomous mode, Energy Performance
>>>> Preference (EPP)
>>>> + hints guide hardware toward performance (0x0)
>>>> or energy
>>>> + efficiency (0xff).
>>>> + Requires ACPI CPPC autonomous selection register
>>>> + support.
>>>> + Accepts:
>>>> + performance, 1: enable auto_sel + set EPP to
>>>> + performance (0x0)
>>>> + default_epp, 2: enable auto_sel, preserve EPP
>>>> value
>>>> + programmed by BIOS/firmware
>>>> + Unset: cpufreq governors are used (auto_sel
>>>> disabled).
>>>
>>> Rather than unset doing nothing, have you considered having it take a
>>> midpoint like 128? That's what we do in amd-pstate (default to
>>> balance_performance). I think it turns into a reasonable balance.
>>
>> Thanks for the suggestion.
>> I can add balance_performance that enables auto_sel with EPP=128 in v4.
>>
>> On changing the driver default (no param behavior) to auto enable
>> balance_performance, it would be good to keep the current behavior for
>> now since cppc_cpufreq is generic across ARM64/RISC-V platforms where
>> EPP and Autonomous Selection registers are optional.
>> A default change would affect existing users relying on governors.
>>
>> Thank you,
>> Sumit Gupta
>
> But couldn't you make the "no module parameter set" follow the behavior
> to only set the registers if they're available?
>
> So the systems that support it start using it, the ones that don't it's
> a NOP.
>
Would it work to add balance_performance as a new mode in v4,
and discuss changing the default separately as a follow-up?
Runtime detection helps for unsupported platforms. But platforms which
support the registers use OS governors today, and silently switching
them to autonomous mode on a kernel update is a behavior change for
existing users. They would also have no way to boot into sw governor.
Thank you,
Sumit Gupta
>>
>>
>>>
>>>> +
>>>> cpu_init_udelay=N
>>>> [X86,EARLY] Delay for N microsec between assert
>>>> and de-assert
>>>> of APIC INIT to start processors. This delay
>>>> occurs
>>>> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/
>>>> cppc_cpufreq.c
>>>> index 6b54427b52e1..5f4d735e7c7d 100644
>>>> --- a/drivers/cpufreq/cppc_cpufreq.c
>>>> +++ b/drivers/cpufreq/cppc_cpufreq.c
>>>> @@ -28,6 +28,43 @@
>>>>
>>>> static struct cpufreq_driver cppc_cpufreq_driver;
>>>>
>>>> +/* Autonomous Selection boot parameter modes */
>>>> +enum {
>>>> + AUTO_SEL_PERFORMANCE = 1,
>>>> + AUTO_SEL_DEFAULT_EPP = 2,
>>>> +};
>>>> +
>>>> +static int auto_sel_mode;
>>>> +
>>>> +static int auto_sel_mode_set(const char *val, const struct
>>>> kernel_param *kp)
>>>> +{
>>>> + if (sysfs_streq(val, "performance") || sysfs_streq(val, "1"))
>>>> + *(int *)kp->arg = AUTO_SEL_PERFORMANCE;
>>>> + else if (sysfs_streq(val, "default_epp") || sysfs_streq(val,
>>>> "2"))
>>>> + *(int *)kp->arg = AUTO_SEL_DEFAULT_EPP;
>>>> + else
>>>> + return -EINVAL;
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static int auto_sel_mode_get(char *buffer, const struct kernel_param
>>>> *kp)
>>>> +{
>>>> + switch (*(int *)kp->arg) {
>>>> + case AUTO_SEL_PERFORMANCE:
>>>> + return sysfs_emit(buffer, "performance\n");
>>>> + case AUTO_SEL_DEFAULT_EPP:
>>>> + return sysfs_emit(buffer, "default_epp\n");
>>>> + default:
>>>> + return sysfs_emit(buffer, "disabled\n");
>>>> + }
>>>> +}
>>>> +
>>>> +static const struct kernel_param_ops auto_sel_mode_ops = {
>>>> + .set = auto_sel_mode_set,
>>>> + .get = auto_sel_mode_get,
>>>> +};
>>>> +
>>>> #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
>>>> static enum {
>>>> FIE_UNSET = -1,
>>>> @@ -715,11 +752,75 @@ static int cppc_cpufreq_cpu_init(struct
>>>> cpufreq_policy *policy)
>>>> policy->cur = cppc_perf_to_khz(caps, caps->highest_perf);
>>>> cpu_data->perf_ctrls.desired_perf = caps->highest_perf;
>>>>
>>>> - ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>>>> - if (ret) {
>>>> - pr_debug("Err setting perf value:%d on CPU:%d.
>>>> ret:%d\n",
>>>> - caps->highest_perf, cpu, ret);
>>>> - goto out;
>>>> + /*
>>>> + * Enable autonomous mode on first init if boot param is set.
>>>> + * Check last_governor to detect first init and skip if auto_sel
>>>> + * is already enabled.
>>>> + */
>>>> + if (auto_sel_mode && policy->last_governor[0] == '\0' &&
>>>> + !cpu_data->perf_ctrls.auto_sel) {
>>>> + /* Init min/max_perf from caps if not already set by
>>>> HW. */
>>>> + if (!cpu_data->perf_ctrls.min_perf)
>>>> + cpu_data->perf_ctrls.min_perf = caps-
>>>> >lowest_nonlinear_perf;
>>>> + if (!cpu_data->perf_ctrls.max_perf)
>>>> + cpu_data->perf_ctrls.max_perf = policy-
>>>> >boost_enabled ?
>>>> + caps->highest_perf : caps->nominal_perf;
>>>> +
>>>> + /*
>>>> + * In autonomous mode desired_perf is only a hint;
>>>> EPP and
>>>> + * the platform drive actual selection within [min,
>>>> max].
>>>> + * Initialize it to max_perf so HW starts at the upper
>>>> bound.
>>>> + */
>>>> + cpu_data->perf_ctrls.desired_perf = cpu_data-
>>>> >perf_ctrls.max_perf;
>>>> +
>>>> + policy->cur = cppc_perf_to_khz(caps,
>>>> + cpu_data->perf_ctrls.desired_perf);
>>>> +
>>>> + /*
>>>> + * Override EPP only in 'performance' mode;
>>>> 'default_epp' mode
>>>> + * preserves the BIOS/firmware programmed EPP value.
>>>> + * EPP is optional - some platforms may not support it.
>>>> + */
>>>> + if (auto_sel_mode == AUTO_SEL_PERFORMANCE) {
>>>> + ret = cppc_set_epp(cpu,
>>>> CPPC_EPP_PERFORMANCE_PREF);
>>>> + if (ret && ret != -EOPNOTSUPP)
>>>> + pr_warn("Failed to set EPP for CPU%d
>>>> (%d)\n", cpu, ret);
>>>> + else if (!ret)
>>>> + cpu_data->perf_ctrls.energy_perf = CPPC_EPP_PERFORMANCE_PREF;
>>>> + }
>>>> +
>>>> + /* Program min/max/desired into CPPC regs (non-fatal on
>>>> failure). */
>>>> + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>>>> + if (ret)
>>>> + pr_warn("set_perf failed CPU%d (%d); using HW
>>>> values\n",
>>>> + cpu, ret);
>>>> +
>>>> + ret = cppc_set_auto_sel(cpu, true);
>>>> + if (ret && ret != -EOPNOTSUPP)
>>>> + pr_warn("auto_sel CPU%d failed (%d); using OS
>>>> mode\n",
>>>> + cpu, ret);
>>>> + else if (!ret)
>>>> + cpu_data->perf_ctrls.auto_sel = true;
>>>> + }
>>>> +
>>>> + if (cpu_data->perf_ctrls.auto_sel) {
>>>> + /* Sync policy limits from HW when autonomous mode is
>>>> active */
>>>> + policy->min = cppc_perf_to_khz(caps,
>>>> + cpu_data->perf_ctrls.min_perf ?:
>>>> + caps->lowest_nonlinear_perf);
>>>> + policy->max = cppc_perf_to_khz(caps,
>>>> + cpu_data->perf_ctrls.max_perf ?:
>>>> + (policy->boost_enabled ?
>>>> + caps->highest_perf :
>>>> + caps->nominal_perf));
>>>> + } else {
>>>> + /* Normal mode: governors control frequency */
>>>> + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>>>> + if (ret) {
>>>> + pr_debug("Err setting perf value:%d on CPU:%d.
>>>> ret:%d\n",
>>>> + caps->highest_perf, cpu, ret);
>>>> + goto out;
>>>> + }
>>>> }
>>>>
>>>> cppc_cpufreq_cpu_fie_init(policy);
>>>> @@ -1079,10 +1180,21 @@ static int __init cppc_cpufreq_init(void)
>>>>
>>>> static void __exit cppc_cpufreq_exit(void)
>>>> {
>>>> + unsigned int cpu;
>>>> +
>>>> + for_each_present_cpu(cpu)
>>>> + cppc_set_auto_sel(cpu, false);
>>>> +
>>>> cpufreq_unregister_driver(&cppc_cpufreq_driver);
>>>> cppc_freq_invariance_exit();
>>>> }
>>>>
>>>> +module_param_cb(auto_sel_mode, &auto_sel_mode_ops, &auto_sel_mode,
>>>> 0444);
>>>> +MODULE_PARM_DESC(auto_sel_mode,
>>>> + "Enable CPPC autonomous performance selection at
>>>> boot: "
>>>> + "performance or 1 (EPP=performance), "
>>>> + "default_epp or 2 (preserve BIOS/firmware EPP)");
>>>> +
>>>> module_exit(cppc_cpufreq_exit);
>>>> MODULE_AUTHOR("Ashwin Chaugule");
>>>> MODULE_DESCRIPTION("CPUFreq driver based on the ACPI CPPC v5.0+
>>>> spec");
>>>
>
^ permalink raw reply
* Re: [PATCH RFC 2/5] dma-heap: charge dma-buf memory via explicit memcg
From: Christian König @ 2026-05-18 14:06 UTC (permalink / raw)
To: Albert Esteve
Cc: T.J. Mercier, Christian Brauner, Tejun Heo, Johannes Weiner,
Michal Koutný, Jonathan Corbet, Shuah Khan, Sumit Semwal,
Michal Hocko, Roman Gushchin, Shakeel Butt, Muchun Song,
Andrew Morton, Benjamin Gaignard, Brian Starkey, John Stultz,
Paul Moore, James Morris, Serge E. Hallyn, Stephen Smalley,
Ondrej Mosnacek, Shuah Khan, cgroups, linux-doc, linux-kernel,
linux-media, dri-devel, linaro-mm-sig, linux-mm,
linux-security-module, selinux, linux-kselftest, mripard,
echanude
In-Reply-To: <CADSE00Lh95ygoXGKJGsYvQGEsFV8sVmwEC3uvh8M6r3ERzaJwg@mail.gmail.com>
On 5/18/26 14:50, Albert Esteve wrote:
> On Mon, May 18, 2026 at 9:20 AM Christian König
> <christian.koenig@amd.com> wrote:
>>
>> On 5/15/26 19:06, T.J. Mercier wrote:
>>> On Fri, May 15, 2026 at 6:53 AM Christian Brauner <brauner@kernel.org> wrote:
>>>>
>>>> On Tue, May 12, 2026 at 11:10:44AM +0200, Albert Esteve wrote:
>>>>> On embedded platforms a central process often allocates dma-buf
>>>>> memory on behalf of client applications. Without a way to
>>>>> attribute the charge to the requesting client's cgroup, the
>>>>> cost lands on the allocator, making per-cgroup memory limits
>>>>> ineffective for the actual consumers.
>>>>>
>>>>> Add charge_pid_fd to struct dma_heap_allocation_data. When set to
>>>>
>>>> Please be aware that pidfds come in two flavors:
>>>>
>>>> thread-group pidfds and thread-specific pidfds. Make sure that your API
>>>> doesn't implicitly depend on this distinction not existing.
>>>
>>> Hi Christian,
>>>
>>> Memcg is not a controller that supports "thread mode" so all threads
>>> in a group should belong to the same memcg.
>>
>> BTW: Exactly that is the requirement automotive has with their native context use case.
>>
>> The use case is that you have a deamon which has multiple threads were each one is acting on behalve of some other process.
>>
>> At the moment we basically say they are simply not using cgroups for that use case, but it would be really nice if we could handle that as well.
>>
>> Summarizing the requirement of that use case: You need a different cgroup for each thread of a process.
>
> Hi Christian,
>
> Thanks for sharing this atuomotive usecase. If I understand correctly,
> the actual requirement is attributing dma-buf charges to the right
> client, not putting each daemon thread in a different cgroup?
Nope, exactly that's the difference.
The thread acts as a filtering agent for both memory allocation and command submission for somebody else, the process on which behalve the daemon does things can even be in a client VM, completely remote over some network or even something like a microcontroller.
Everything the thread does regarding CPU time, GPU driver memory allocation as well as resources like GPU processing and I/O time etc.. needs to be accounted to one client which can be different for each thread of the process.
The only thing which is shared with the main process thread is CPU memory resources, e.g. malloc() because that is basically just needed for housekeeping and pretty much irrelevant for this kind of use case.
The problem is now you can't do that with cgroups at the moment but unfortunately only the kernel has the information you need to know to do this.
So what you end up with is to define tons of interfaces just to get the necessary information from the kernel into userspace and then essentially duplicate the same infrastructure cgroup provides in the kernel in userspace again.
> If so,
> the `charge_pid_fd` approach achieves this directly by passing the
> client's `pid_fd`, without needing to add per-thread cgroup
> infrastructure.
Well it's already a massive improvemt, we could basically stop doing the whole duplication part for the GPU driver stack and just use cgroups for this part.
Doing that automatically for CPU and I/O time would just be nice to have additionally.
Regards,
Christian.
>
>>
>> Regards,
>> Christian.
>>
>>>
>>> Checking the flags from pidfd_get_pid would be the best way for an
>>> explicit check of the pidfd type?
>>>
>>>>> a valid pidfd, DMA_HEAP_IOCTL_ALLOC resolves the target task's
>>>>> memcg and charges the buffer there via mem_cgroup_charge_dmabuf()
>>>>> inside dma_heap_buffer_alloc(). Without charge_pid_fd, and with
>>>>> the mem_accounting module parameter enabled, the buffer is charged
>>>>> to the allocator's own cgroup.
>>>>>
>>>>> Additionally, commit 3c227be90659 ("dma-buf: system_heap: account for
>>>>> system heap allocation in memcg") adds __GFP_ACCOUNT to system-heap
>>>>> page allocations. Keeping __GFP_ACCOUNT would charge the same pages
>>>>> twice (once to kmem, once to MEMCG_DMABUF), thus remove it and route
>>>>> all accounting through a single MEMCG_DMABUF path.
>>>>>
>>>>> Usage examples:
>>>>>
>>>>> 1. Central allocator charging to a client at allocation time.
>>>>> The allocator knows the client's PID (e.g., from binder's
>>>>> sender_pid) and uses pidfd to attribute the charge:
>>>>>
>>>>> pid_t client_pid = txn->sender_pid;
>>>>> int pidfd = pidfd_open(client_pid, 0);
>>>>>
>>>>> struct dma_heap_allocation_data alloc = {
>>>>> .len = buffer_size,
>>>>> .fd_flags = O_RDWR | O_CLOEXEC,
>>>>> .charge_pid_fd = pidfd,
>>>>> };
>>>>> ioctl(heap_fd, DMA_HEAP_IOCTL_ALLOC, &alloc);
>>>>> close(pidfd);
>>>>> /* alloc.fd is now charged to client's cgroup */
>>>>>
>>>>> 2. Default allocation (no pidfd, mem_accounting=1).
>>>>> When charge_pid_fd is not set and the mem_accounting module
>>>>> parameter is enabled, the buffer is charged to the allocator's
>>>>> own cgroup:
>>>>>
>>>>> struct dma_heap_allocation_data alloc = {
>>>>> .len = buffer_size,
>>>>> .fd_flags = O_RDWR | O_CLOEXEC,
>>>>> };
>>>>> ioctl(heap_fd, DMA_HEAP_IOCTL_ALLOC, &alloc);
>>>>> /* charged to current process's cgroup */
>>>>>
>>>>> Current limitations:
>>>>>
>>>>> - Single-owner model: a dma-buf carries one memcg charge regardless of
>>>>> how many processes share it. Means only the first owner (and exporter)
>>>>> of the shared buffer bears the charge.
>>>>> - Only memcg accounting supported. While this makes sense for system
>>>>> heap buffers, other heaps (e.g., CMA heaps) will require selectively
>>>>> charging also for the dmem controller.
>>>>>
>>>>> Signed-off-by: Albert Esteve <aesteve@redhat.com>
>>>>> ---
>>>>> Documentation/admin-guide/cgroup-v2.rst | 5 ++--
>>>>> drivers/dma-buf/dma-buf.c | 16 ++++---------
>>>>> drivers/dma-buf/dma-heap.c | 42 ++++++++++++++++++++++++++++++---
>>>>> drivers/dma-buf/heaps/system_heap.c | 2 --
>>>>> include/uapi/linux/dma-heap.h | 6 +++++
>>>>> 5 files changed, 53 insertions(+), 18 deletions(-)
>>>>>
>>>>> diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst
>>>>> index 8bdbc2e866430..824d269531eb1 100644
>>>>> --- a/Documentation/admin-guide/cgroup-v2.rst
>>>>> +++ b/Documentation/admin-guide/cgroup-v2.rst
>>>>> @@ -1636,8 +1636,9 @@ The following nested keys are defined.
>>>>> structures.
>>>>>
>>>>> dmabuf (npn)
>>>>> - Amount of memory used for exported DMA buffers allocated by the cgroup.
>>>>> - Stays with the allocating cgroup regardless of how the buffer is shared.
>>>>> + Amount of memory used for exported DMA buffers allocated by or on
>>>>> + behalf of the cgroup. Stays with the allocating cgroup regardless
>>>>> + of how the buffer is shared.
>>>>>
>>>>> workingset_refault_anon
>>>>> Number of refaults of previously evicted anonymous pages.
>>>>> diff --git a/drivers/dma-buf/dma-buf.c b/drivers/dma-buf/dma-buf.c
>>>>> index ce02377f48908..23fb758b78297 100644
>>>>> --- a/drivers/dma-buf/dma-buf.c
>>>>> +++ b/drivers/dma-buf/dma-buf.c
>>>>> @@ -181,8 +181,11 @@ static void dma_buf_release(struct dentry *dentry)
>>>>> */
>>>>> BUG_ON(dmabuf->cb_in.active || dmabuf->cb_out.active);
>>>>>
>>>>> - mem_cgroup_uncharge_dmabuf(dmabuf->memcg, PAGE_ALIGN(dmabuf->size) / PAGE_SIZE);
>>>>> - mem_cgroup_put(dmabuf->memcg);
>>>>> + if (dmabuf->memcg) {
>>>>> + mem_cgroup_uncharge_dmabuf(dmabuf->memcg,
>>>>> + PAGE_ALIGN(dmabuf->size) / PAGE_SIZE);
>>>>> + mem_cgroup_put(dmabuf->memcg);
>>>>> + }
>>>>>
>>>>> dmabuf->ops->release(dmabuf);
>>>>>
>>>>> @@ -764,13 +767,6 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info)
>>>>> dmabuf->resv = resv;
>>>>> }
>>>>>
>>>>> - dmabuf->memcg = get_mem_cgroup_from_mm(current->mm);
>>>>> - if (!mem_cgroup_charge_dmabuf(dmabuf->memcg, PAGE_ALIGN(dmabuf->size) / PAGE_SIZE,
>>>>> - GFP_KERNEL)) {
>>>>> - ret = -ENOMEM;
>>>>> - goto err_memcg;
>>>>> - }
>>>>> -
>>>>> file->private_data = dmabuf;
>>>>> file->f_path.dentry->d_fsdata = dmabuf;
>>>>> dmabuf->file = file;
>>>>> @@ -781,8 +777,6 @@ struct dma_buf *dma_buf_export(const struct dma_buf_export_info *exp_info)
>>>>>
>>>>> return dmabuf;
>>>>>
>>>>> -err_memcg:
>>>>> - mem_cgroup_put(dmabuf->memcg);
>>>>> err_file:
>>>>> fput(file);
>>>>> err_module:
>>>>> diff --git a/drivers/dma-buf/dma-heap.c b/drivers/dma-buf/dma-heap.c
>>>>> index ac5f8685a6494..ff6e259afcdc0 100644
>>>>> --- a/drivers/dma-buf/dma-heap.c
>>>>> +++ b/drivers/dma-buf/dma-heap.c
>>>>> @@ -7,13 +7,17 @@
>>>>> */
>>>>>
>>>>> #include <linux/cdev.h>
>>>>> +#include <linux/cgroup.h>
>>>>> #include <linux/device.h>
>>>>> #include <linux/dma-buf.h>
>>>>> #include <linux/dma-heap.h>
>>>>> +#include <linux/memcontrol.h>
>>>>> +#include <linux/sched/mm.h>
>>>>> #include <linux/err.h>
>>>>> #include <linux/export.h>
>>>>> #include <linux/list.h>
>>>>> #include <linux/nospec.h>
>>>>> +#include <linux/pidfd.h>
>>>>> #include <linux/syscalls.h>
>>>>> #include <linux/uaccess.h>
>>>>> #include <linux/xarray.h>
>>>>> @@ -55,10 +59,12 @@ MODULE_PARM_DESC(mem_accounting,
>>>>> "Enable cgroup-based memory accounting for dma-buf heap allocations (default=false).");
>>>>>
>>>>> static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
>>>>> - u32 fd_flags,
>>>>> - u64 heap_flags)
>>>>> + u32 fd_flags, u64 heap_flags,
>>>>> + struct mem_cgroup *charge_to)
>>>>> {
>>>>> struct dma_buf *dmabuf;
>>>>> + unsigned int nr_pages;
>>>>> + struct mem_cgroup *memcg = charge_to;
>>>>> int fd;
>>>>>
>>>>> /*
>>>>> @@ -73,6 +79,22 @@ static int dma_heap_buffer_alloc(struct dma_heap *heap, size_t len,
>>>>> if (IS_ERR(dmabuf))
>>>>> return PTR_ERR(dmabuf);
>>>>>
>>>>> + nr_pages = len / PAGE_SIZE;
>>>>> +
>>>>> + if (memcg)
>>>>> + css_get(&memcg->css);
>>>>> + else if (mem_accounting)
>>>>> + memcg = get_mem_cgroup_from_mm(current->mm);
>>>>> +
>>>>> + if (memcg) {
>>>>> + if (!mem_cgroup_charge_dmabuf(memcg, nr_pages, GFP_KERNEL)) {
>>>>> + mem_cgroup_put(memcg);
>>>>> + dma_buf_put(dmabuf);
>>>>> + return -ENOMEM;
>>>>> + }
>>>>> + dmabuf->memcg = memcg;
>>>>> + }
>>>>> +
>>>>> fd = dma_buf_fd(dmabuf, fd_flags);
>>>>> if (fd < 0) {
>>>>> dma_buf_put(dmabuf);
>>>>> @@ -102,6 +124,9 @@ static long dma_heap_ioctl_allocate(struct file *file, void *data)
>>>>> {
>>>>> struct dma_heap_allocation_data *heap_allocation = data;
>>>>> struct dma_heap *heap = file->private_data;
>>>>> + struct mem_cgroup *memcg = NULL;
>>>>> + struct task_struct *task;
>>>>> + unsigned int pidfd_flags;
>>>>> int fd;
>>>>>
>>>>> if (heap_allocation->fd)
>>>>> @@ -113,9 +138,20 @@ static long dma_heap_ioctl_allocate(struct file *file, void *data)
>>>>> if (heap_allocation->heap_flags & ~DMA_HEAP_VALID_HEAP_FLAGS)
>>>>> return -EINVAL;
>>>>>
>>>>> + if (heap_allocation->charge_pid_fd) {
>>>>> + task = pidfd_get_task(heap_allocation->charge_pid_fd, &pidfd_flags);
>>>>
>>>> Will always get a thread-group leader pidfd and will fail if this is a
>>>> thread-specific pidfd. pidfd_open(1234, PIDFD_THREAD) can be used to
>>>> open a thread-specific pidfd.
>>>>
>>>>> + if (IS_ERR(task))
>>>>> + return PTR_ERR(task);
>>>>> +
>>>>> + memcg = get_mem_cgroup_from_mm(task->mm);
>>>>> + put_task_struct(task);
>>>>> + }
>>>>> +
>>>>> fd = dma_heap_buffer_alloc(heap, heap_allocation->len,
>>>>> heap_allocation->fd_flags,
>>>>> - heap_allocation->heap_flags);
>>>>> + heap_allocation->heap_flags,
>>>>> + memcg);
>>>>> + mem_cgroup_put(memcg);
>>>>> if (fd < 0)
>>>>> return fd;
>>>>>
>>>>> diff --git a/drivers/dma-buf/heaps/system_heap.c b/drivers/dma-buf/heaps/system_heap.c
>>>>> index 03c2b87cb1112..95d7688167b93 100644
>>>>> --- a/drivers/dma-buf/heaps/system_heap.c
>>>>> +++ b/drivers/dma-buf/heaps/system_heap.c
>>>>> @@ -385,8 +385,6 @@ static struct page *alloc_largest_available(unsigned long size,
>>>>> if (max_order < orders[i])
>>>>> continue;
>>>>> flags = order_flags[i];
>>>>> - if (mem_accounting)
>>>>> - flags |= __GFP_ACCOUNT;
>>>>> page = alloc_pages(flags, orders[i]);
>>>>> if (!page)
>>>>> continue;
>>>>> diff --git a/include/uapi/linux/dma-heap.h b/include/uapi/linux/dma-heap.h
>>>>> index a4cf716a49fa6..e02b0f8cbc6a1 100644
>>>>> --- a/include/uapi/linux/dma-heap.h
>>>>> +++ b/include/uapi/linux/dma-heap.h
>>>>> @@ -29,6 +29,10 @@
>>>>> * handle to the allocated dma-buf
>>>>> * @fd_flags: file descriptor flags used when allocating
>>>>> * @heap_flags: flags passed to heap
>>>>> + * @charge_pid_fd: optional pidfd of the process whose cgroup should be
>>>>> + * charged for this allocation; 0 means charge the calling
>>>>> + * process's cgroup
>>>>> + * @__padding: reserved, must be zero
>>>>> *
>>>>> * Provided by userspace as an argument to the ioctl
>>>>> */
>>>>> @@ -37,6 +41,8 @@ struct dma_heap_allocation_data {
>>>>> __u32 fd;
>>>>> __u32 fd_flags;
>>>>> __u64 heap_flags;
>>>>> + __u32 charge_pid_fd;
>>>>> + __u32 __padding;
>>>>> };
>>>>>
>>>>> #define DMA_HEAP_IOC_MAGIC 'H'
>>>>>
>>>>> --
>>>>> 2.53.0
>>>>>
>>
>
^ permalink raw reply
* Re: [PATCH v3 0/4] KVM: arm64: vgic: Fix IGROUPR writability and IIDR revision control
From: David Woodhouse @ 2026-05-18 13:56 UTC (permalink / raw)
To: Paolo Bonzini
Cc: Jonathan Corbet, Shuah Khan, Marc Zyngier, Oliver Upton,
Joey Gouly, Suzuki K Poulose, Zenghui Yu, Catalin Marinas,
Will Deacon, Jonathan Cameron, Sascha Bischoff, Eric Auger,
Raghavendra Rao Ananta, Maxim Levitsky, Kees Cook, Timothy Hayes,
Arnd Bergmann, kvm, linux-doc, linux-kernel, linux-arm-kernel,
kvmarm, linux-kselftest, Peter Maydell, qemu-arm, qemu-devel
In-Reply-To: <20260511113558.3325004-1-dwmw2@infradead.org>
[-- Attachment #1: Type: text/plain, Size: 876 bytes --]
On Mon, 2026-05-11 at 12:30 +0100, David Woodhouse wrote:
> Maintaining precise guest compatibility across host kernel upgrades —
> and even downgrades, since rollback is sometimes necessary — is not
> optional. That *shouldn't* need saying, but maybe it does:
> https://lore.kernel.org/all/6856b269d2af706eae397e0cf9c1231f89d9a932.camel@infradead.org/
>
> This series fixes the GICv2/v3 IGROUPR writability model to be
> consistently controlled by the GICD_IIDR implementation revision,
> replacing the ad-hoc v2_groups_user_writable flag.
The more generic question of whether KVM should actually maintain guest
compatibility from one version of the host kernel to another is being
discussed in that other thread.
Setting that aside, is there any *technical* feedback on this series? I
believe I've dealt with all of Marc's feedback on v1 (thanks).
[-- Attachment #2: smime.p7s --]
[-- Type: application/pkcs7-signature, Size: 5069 bytes --]
^ permalink raw reply
* Re: [PATCH v5 02/13] iio: core: support 64-bit register through debugfs
From: Rodrigo Alencar @ 2026-05-18 13:56 UTC (permalink / raw)
To: rodrigo.alencar, linux-iio, devicetree, linux-kernel, linux-doc,
linux-hardening
Cc: Lars-Peter Clausen, Michael Hennerich, Jonathan Cameron,
David Lechner, Andy Shevchenko, Rob Herring, Krzysztof Kozlowski,
Conor Dooley, Philipp Zabel, Jonathan Corbet, Shuah Khan,
Kees Cook, Gustavo A. R. Silva
In-Reply-To: <20260517-ad9910-iio-driver-v5-2-31599c88314a@analog.com>
On 26/05/17 06:56PM, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 1 potential issue(s) to consider:
> - [Medium] Silent truncation of 64-bit values when written to a 32-bit register access function.
> --
>
> commit 64785a0c1416e2cf718b558aa7a2cc9ac39b2d81
> Author: Rodrigo Alencar <rodrigo.alencar@analog.com>
>
> iio: core: support 64-bit register through debugfs
>
> This commit introduces a debugfs_reg64_access function pointer to iio_info
> to allow 64-bit register reads and writes through debugfs. The file operation
> callbacks are modified to prefer the 64-bit variant when available.
>
> > diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c
> > index e587aca79b8e7..5c8404efd0a56 100644
> > --- a/drivers/iio/industrialio-core.c
> > +++ b/drivers/iio/industrialio-core.c
>
> [ ... ]
>
> > @@ -429,7 +439,7 @@ static ssize_t iio_debugfs_write_reg(struct file *file,
> >
> > buf[ret] = '\0';
> >
> > - ret = sscanf(buf, "%i %i", ®, &val);
> > + ret = sscanf(buf, "%i %lli", ®, &val64);
> >
> > switch (ret) {
> > case 1:
> > @@ -437,8 +447,12 @@ static ssize_t iio_debugfs_write_reg(struct file *file,
> > break;
> > case 2:
> > iio_dev_opaque->cached_reg_addr = reg;
> > - ret = indio_dev->info->debugfs_reg_access(indio_dev, reg,
> > - val, NULL);
> > + if (indio_dev->info->debugfs_reg64_access)
> > + ret = indio_dev->info->debugfs_reg64_access(indio_dev, reg,
> > + val64, NULL);
> > + else
> > + ret = indio_dev->info->debugfs_reg_access(indio_dev, reg,
> > + val64, NULL);
>
> Could this introduce a regression where values are silently truncated if the
> provided input exceeds 32 bits?
>
> Since the input is now unconditionally parsed as a 64-bit integer, falling
> back to debugfs_reg_access (which takes an unsigned int for writeval) will
> implicitly cast val64. If a user provides a value larger than UINT_MAX to a
> driver that only provides the 32-bit callback, it appears the value will be
> truncated rather than rejected with an error like -EINVAL.
Maybe -EINVAL can be returned for that case indeed. For a debug interface I
did not see that as a problem.
> --
> Sashiko AI review · https://sashiko.dev/#/patchset/20260517-ad9910-iio-driver-v5-0-31599c88314a@analog.com?part=2
--
Kind regards,
Rodrigo Alencar
^ permalink raw reply
* Re: [PATCH v3 2/2] cpufreq: CPPC: add autonomous mode boot parameter support
From: Mario Limonciello @ 2026-05-18 13:50 UTC (permalink / raw)
To: Sumit Gupta, rafael, viresh.kumar, pierre.gondois,
ionela.voinescu, zhenglifeng1, zhanjie9, corbet, skhan, rdunlap,
linux-pm, linux-doc, linux-kernel
Cc: linux-tegra, treding, jonathanh, vsethi, ksitaraman, sanjayc,
mochs, bbasu
In-Reply-To: <139d2f0e-72d9-4721-9d5a-d1d4a2a95fa1@nvidia.com>
On 5/18/26 08:44, Sumit Gupta wrote:
> Hi Mario,
>
>
> On 16/05/26 02:43, Mario Limonciello wrote:
>> External email: Use caution opening links or attachments
>>
>>
>> On 5/15/26 07:26, Sumit Gupta wrote:
>>> Add a kernel boot parameter 'cppc_cpufreq.auto_sel_mode' to enable
>>> CPPC autonomous performance selection on all CPUs at system startup.
>>> When autonomous mode is enabled, the hardware automatically adjusts
>>> CPU performance based on workload demands using Energy Performance
>>> Preference (EPP) hints.
>>>
>>> When the parameter is set:
>>> - Configure all CPUs for autonomous operation on first init
>>> - Use HW min/max_perf when available; otherwise initialize from caps
>>> - Initialize desired_perf to max_perf as a starting hint
>>> - Hardware controls frequency instead of the OS governor
>>> - EPP behavior depends on parameter value:
>>> - performance (or 1): override EPP to performance preference (0x0)
>>> - default_epp (or 2): preserve EPP value programmed by BIOS/firmware
>>>
>>> The boot parameter is applied only during first policy initialization.
>>> Skip applying it on CPU hotplug to preserve runtime sysfs configuration.
>>>
>>> This patch depends on patch series [1] ("cpufreq: Set policy->min and
>>> max as real QoS constraints") so that the policy->min/max set in
>>> cppc_cpufreq_cpu_init() are not overridden by cpufreq_set_policy()
>>> during init.
>>>
>>> Signed-off-by: Sumit Gupta <sumitg@nvidia.com>
>>> ---
>>> [1] https://lore.kernel.org/lkml/20260511135538.522653-1-
>>> pierre.gondois@arm.com/
>>> ---
>>> .../admin-guide/kernel-parameters.txt | 16 +++
>>> drivers/cpufreq/cppc_cpufreq.c | 122 +++++++++++++++++-
>>> 2 files changed, 133 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/
>>> Documentation/admin-guide/kernel-parameters.txt
>>> index 0eb64aab3685..7e4b3a8fd76f 100644
>>> --- a/Documentation/admin-guide/kernel-parameters.txt
>>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>>> @@ -1048,6 +1048,22 @@ Kernel parameters
>>> policy to use. This governor must be registered
>>> in the
>>> kernel before the cpufreq driver probes.
>>>
>>> + cppc_cpufreq.auto_sel_mode=
>>> + [CPU_FREQ] Enable ACPI CPPC autonomous performance
>>> + selection. When enabled, hardware automatically
>>> adjusts
>>> + CPU frequency on all CPUs based on workload
>>> demands.
>>> + In Autonomous mode, Energy Performance
>>> Preference (EPP)
>>> + hints guide hardware toward performance (0x0)
>>> or energy
>>> + efficiency (0xff).
>>> + Requires ACPI CPPC autonomous selection register
>>> + support.
>>> + Accepts:
>>> + performance, 1: enable auto_sel + set EPP to
>>> + performance (0x0)
>>> + default_epp, 2: enable auto_sel, preserve EPP
>>> value
>>> + programmed by BIOS/firmware
>>> + Unset: cpufreq governors are used (auto_sel
>>> disabled).
>>
>> Rather than unset doing nothing, have you considered having it take a
>> midpoint like 128? That's what we do in amd-pstate (default to
>> balance_performance). I think it turns into a reasonable balance.
>
> Thanks for the suggestion.
> I can add balance_performance that enables auto_sel with EPP=128 in v4.
>
> On changing the driver default (no param behavior) to auto enable
> balance_performance, it would be good to keep the current behavior for
> now since cppc_cpufreq is generic across ARM64/RISC-V platforms where
> EPP and Autonomous Selection registers are optional.
> A default change would affect existing users relying on governors.
>
> Thank you,
> Sumit Gupta
But couldn't you make the "no module parameter set" follow the behavior
to only set the registers if they're available?
So the systems that support it start using it, the ones that don't it's
a NOP.
>
>
>>
>>> +
>>> cpu_init_udelay=N
>>> [X86,EARLY] Delay for N microsec between assert
>>> and de-assert
>>> of APIC INIT to start processors. This delay
>>> occurs
>>> diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/
>>> cppc_cpufreq.c
>>> index 6b54427b52e1..5f4d735e7c7d 100644
>>> --- a/drivers/cpufreq/cppc_cpufreq.c
>>> +++ b/drivers/cpufreq/cppc_cpufreq.c
>>> @@ -28,6 +28,43 @@
>>>
>>> static struct cpufreq_driver cppc_cpufreq_driver;
>>>
>>> +/* Autonomous Selection boot parameter modes */
>>> +enum {
>>> + AUTO_SEL_PERFORMANCE = 1,
>>> + AUTO_SEL_DEFAULT_EPP = 2,
>>> +};
>>> +
>>> +static int auto_sel_mode;
>>> +
>>> +static int auto_sel_mode_set(const char *val, const struct
>>> kernel_param *kp)
>>> +{
>>> + if (sysfs_streq(val, "performance") || sysfs_streq(val, "1"))
>>> + *(int *)kp->arg = AUTO_SEL_PERFORMANCE;
>>> + else if (sysfs_streq(val, "default_epp") || sysfs_streq(val, "2"))
>>> + *(int *)kp->arg = AUTO_SEL_DEFAULT_EPP;
>>> + else
>>> + return -EINVAL;
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static int auto_sel_mode_get(char *buffer, const struct kernel_param
>>> *kp)
>>> +{
>>> + switch (*(int *)kp->arg) {
>>> + case AUTO_SEL_PERFORMANCE:
>>> + return sysfs_emit(buffer, "performance\n");
>>> + case AUTO_SEL_DEFAULT_EPP:
>>> + return sysfs_emit(buffer, "default_epp\n");
>>> + default:
>>> + return sysfs_emit(buffer, "disabled\n");
>>> + }
>>> +}
>>> +
>>> +static const struct kernel_param_ops auto_sel_mode_ops = {
>>> + .set = auto_sel_mode_set,
>>> + .get = auto_sel_mode_get,
>>> +};
>>> +
>>> #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
>>> static enum {
>>> FIE_UNSET = -1,
>>> @@ -715,11 +752,75 @@ static int cppc_cpufreq_cpu_init(struct
>>> cpufreq_policy *policy)
>>> policy->cur = cppc_perf_to_khz(caps, caps->highest_perf);
>>> cpu_data->perf_ctrls.desired_perf = caps->highest_perf;
>>>
>>> - ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>>> - if (ret) {
>>> - pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n",
>>> - caps->highest_perf, cpu, ret);
>>> - goto out;
>>> + /*
>>> + * Enable autonomous mode on first init if boot param is set.
>>> + * Check last_governor to detect first init and skip if auto_sel
>>> + * is already enabled.
>>> + */
>>> + if (auto_sel_mode && policy->last_governor[0] == '\0' &&
>>> + !cpu_data->perf_ctrls.auto_sel) {
>>> + /* Init min/max_perf from caps if not already set by
>>> HW. */
>>> + if (!cpu_data->perf_ctrls.min_perf)
>>> + cpu_data->perf_ctrls.min_perf = caps-
>>> >lowest_nonlinear_perf;
>>> + if (!cpu_data->perf_ctrls.max_perf)
>>> + cpu_data->perf_ctrls.max_perf = policy-
>>> >boost_enabled ?
>>> + caps->highest_perf : caps->nominal_perf;
>>> +
>>> + /*
>>> + * In autonomous mode desired_perf is only a hint; EPP and
>>> + * the platform drive actual selection within [min, max].
>>> + * Initialize it to max_perf so HW starts at the upper
>>> bound.
>>> + */
>>> + cpu_data->perf_ctrls.desired_perf = cpu_data-
>>> >perf_ctrls.max_perf;
>>> +
>>> + policy->cur = cppc_perf_to_khz(caps,
>>> + cpu_data->perf_ctrls.desired_perf);
>>> +
>>> + /*
>>> + * Override EPP only in 'performance' mode;
>>> 'default_epp' mode
>>> + * preserves the BIOS/firmware programmed EPP value.
>>> + * EPP is optional - some platforms may not support it.
>>> + */
>>> + if (auto_sel_mode == AUTO_SEL_PERFORMANCE) {
>>> + ret = cppc_set_epp(cpu,
>>> CPPC_EPP_PERFORMANCE_PREF);
>>> + if (ret && ret != -EOPNOTSUPP)
>>> + pr_warn("Failed to set EPP for CPU%d
>>> (%d)\n", cpu, ret);
>>> + else if (!ret)
>>> + cpu_data->perf_ctrls.energy_perf = CPPC_EPP_PERFORMANCE_PREF;
>>> + }
>>> +
>>> + /* Program min/max/desired into CPPC regs (non-fatal on
>>> failure). */
>>> + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>>> + if (ret)
>>> + pr_warn("set_perf failed CPU%d (%d); using HW
>>> values\n",
>>> + cpu, ret);
>>> +
>>> + ret = cppc_set_auto_sel(cpu, true);
>>> + if (ret && ret != -EOPNOTSUPP)
>>> + pr_warn("auto_sel CPU%d failed (%d); using OS
>>> mode\n",
>>> + cpu, ret);
>>> + else if (!ret)
>>> + cpu_data->perf_ctrls.auto_sel = true;
>>> + }
>>> +
>>> + if (cpu_data->perf_ctrls.auto_sel) {
>>> + /* Sync policy limits from HW when autonomous mode is
>>> active */
>>> + policy->min = cppc_perf_to_khz(caps,
>>> + cpu_data->perf_ctrls.min_perf ?:
>>> + caps->lowest_nonlinear_perf);
>>> + policy->max = cppc_perf_to_khz(caps,
>>> + cpu_data->perf_ctrls.max_perf ?:
>>> + (policy->boost_enabled ?
>>> + caps->highest_perf :
>>> + caps->nominal_perf));
>>> + } else {
>>> + /* Normal mode: governors control frequency */
>>> + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>>> + if (ret) {
>>> + pr_debug("Err setting perf value:%d on CPU:%d.
>>> ret:%d\n",
>>> + caps->highest_perf, cpu, ret);
>>> + goto out;
>>> + }
>>> }
>>>
>>> cppc_cpufreq_cpu_fie_init(policy);
>>> @@ -1079,10 +1180,21 @@ static int __init cppc_cpufreq_init(void)
>>>
>>> static void __exit cppc_cpufreq_exit(void)
>>> {
>>> + unsigned int cpu;
>>> +
>>> + for_each_present_cpu(cpu)
>>> + cppc_set_auto_sel(cpu, false);
>>> +
>>> cpufreq_unregister_driver(&cppc_cpufreq_driver);
>>> cppc_freq_invariance_exit();
>>> }
>>>
>>> +module_param_cb(auto_sel_mode, &auto_sel_mode_ops, &auto_sel_mode,
>>> 0444);
>>> +MODULE_PARM_DESC(auto_sel_mode,
>>> + "Enable CPPC autonomous performance selection at boot: "
>>> + "performance or 1 (EPP=performance), "
>>> + "default_epp or 2 (preserve BIOS/firmware EPP)");
>>> +
>>> module_exit(cppc_cpufreq_exit);
>>> MODULE_AUTHOR("Ashwin Chaugule");
>>> MODULE_DESCRIPTION("CPUFreq driver based on the ACPI CPPC v5.0+
>>> spec");
>>
^ permalink raw reply
* Re: [PATCH v5 09/13] iio: frequency: ad9910: add RAM mode support
From: Rodrigo Alencar @ 2026-05-18 13:50 UTC (permalink / raw)
To: rodrigo.alencar, linux-iio, devicetree, linux-kernel, linux-doc,
linux-hardening
Cc: Lars-Peter Clausen, Michael Hennerich, Jonathan Cameron,
David Lechner, Andy Shevchenko, Rob Herring, Krzysztof Kozlowski,
Conor Dooley, Philipp Zabel, Jonathan Corbet, Shuah Khan,
Kees Cook, Gustavo A. R. Silva
In-Reply-To: <20260517-ad9910-iio-driver-v5-9-31599c88314a@analog.com>
On 26/05/17 07:19PM, sashiko-bot@kernel.org wrote:
> Thank you for your contribution! Sashiko AI review found 5 potential issue(s) to consider:
> - [High] Incomplete hardware rollback during a profile swap failure leaves AD9910 shadow registers corrupted.
> - [High] Missing software cache and hardware rollback when CFR1 register update fails.
> - [High] Unprotected software profile cache corruption on firmware upload failure.
> - [High] Firmware upload `.write` callback fails on chunked payloads and performs out-of-bounds memory reads.
> - [Medium] The RAM firmware upload sysfs node and debugfs symlinks are registered with a `(null)` name.
> --
>
> commit 3876f09fc86743085e5099fea5762c34dc150ecf
> Author: Rodrigo Alencar <rodrigo.alencar@analog.com>
>
> iio: frequency: ad9910: add RAM mode support
>
> Add RAM control channel, which includes RAM data loading via firmware
> upload interface, per-profile configuration, profile switching,
> sampling frequency control, and ram-enable-aware read/write paths.
>
> > diff --git a/drivers/iio/frequency/ad9910.c b/drivers/iio/frequency/ad9910.c
> > index 4ad80475139d7..86ed350011cff 100644
> > --- a/drivers/iio/frequency/ad9910.c
> > +++ b/drivers/iio/frequency/ad9910.c
>
> [ ... ]
>
> > @@ -1202,6 +1306,41 @@ static int ad9910_write_raw(struct iio_dev *indio_dev,
> > return ad9910_reg32_update(st, AD9910_REG_CFR2,
> > AD9910_CFR2_DRG_ENABLE_MSK,
> > tmp32, true);
> > + case AD9910_CHANNEL_RAM:
> > + if (AD9910_RAM_ENABLED(st) == !!val)
> > + return 0;
> > +
> > + /* swap profile configs */
> > + for (i = 0; i < AD9910_NUM_PROFILES; i++) {
> > + tmp64 = st->reg[AD9910_REG_PROFILE(i)].val64;
> > + ret = ad9910_reg64_write(st,
> > + AD9910_REG_PROFILE(i),
> > + st->reg_profile[i],
> > + false);
> > + if (ret)
> > + break;
> > + st->reg_profile[i] = tmp64;
> > + }
> > +
> > + if (ret) {
> > + /*
> > + * After the write failure, profiles 0..i-1 were
> > + * already swapped in SW, but Hw registers are
> > + * still pending an IO update, so swap them back
> > + * in SW to keep the state consistent.
> > + */
> > + while (i--) {
> > + tmp64 = st->reg[AD9910_REG_PROFILE(i)].val64;
> > + st->reg[AD9910_REG_PROFILE(i)].val64 = st->reg_profile[i];
> > + st->reg_profile[i] = tmp64;
> > + }
> > + return ret;
>
> If a write fails mid-loop, this rollback block restores the software cache
> but doesn't issue SPI writes to revert the hardware shadow registers.
> Could the next successful SPI write with an IO_UPDATE pulse inadvertently
> latch these corrupted, partially-swapped profiles into the active hardware
> state?
The approach reverts the changes on cached values, so we do not have RAM and single
tone profile values mixed in software. The hardware might be in an invalid state,
but an error is reported to the user.
> > + }
> > +
> > + tmp32 = FIELD_PREP(AD9910_CFR1_RAM_ENABLE_MSK, !!val);
> > + return ad9910_reg32_update(st, AD9910_REG_CFR1,
> > + AD9910_CFR1_RAM_ENABLE_MSK,
> > + tmp32, true);
>
> If ad9910_reg32_update() fails here, the function returns immediately without
> reverting the swapped profiles. Should there be a rollback of the swapped
> profiles in this error path to prevent the device from operating with corrupted
> synthesizer parameters?
Yes, I'd this part can be improved. Writing to CFR1 can be moved up (before the cache
restore).
>
> > default:
> > return -EINVAL;
> > }
>
> [ ... ]
>
> > @@ -1475,6 +1657,126 @@ static int ad9910_read_label(struct iio_dev *indio_dev,
> > +static enum fw_upload_err ad9910_ram_fwu_write(struct fw_upload *fw_upload,
> > + const u8 *data, u32 offset,
> > + u32 size, u32 *written)
> > +{
> > + const struct ad9910_ram_fw *fw_data = (const struct ad9910_ram_fw *)data;
> > + struct ad9910_state *st = fw_upload->dd_handle;
> > + int ret, ret2, idx, wcount;
> > + u64 tmp64, backup;
> > +
> > + if (offset != 0)
> > + return FW_UPLOAD_ERR_INVALID_SIZE;
>
> The sysfs firmware upload framework chunks data into blocks bounded by
> PAGE_SIZE (typically 4096 bytes). Since the maximum AD9910 firmware size
> can be 4176 bytes, this driver will reject any chunks after the first.
> Can this be adjusted to support firmware files larger than PAGE_SIZE?
That does not seem to be the case:
https://elixir.bootlin.com/linux/v7.1-rc3/source/drivers/base/firmware_loader/sysfs_upload.c#L184
Sysfs binary attribute writes from userspace are limited to PAGE_SIZE per
write() syscall (enforced in fs/sysfs/file.c). But the firmware loader
framework accumulates these into an internal buffer (fw->data).
The driver's write callback is called from fw_upload_main() in sysfs_upload.c.
So, the key question is whether that function passes fw->size - offset
(the full remaining size) or min(fw->size - offset, PAGE_SIZE) (PAGE_SIZE-bounded chunks)
to the driver callback.
The referenced code shows that the write callback is not called every
PAGE_SIZE chunks, but the callback implementation that defines the how
much it can handle at each call.
Also, I have already tested this code with a FW file bigger than 4k.
> > +
> > + guard(mutex)(&st->lock);
> > +
> > + if (st->ram_fwu_cancel)
> > + return FW_UPLOAD_ERR_CANCELED;
> > +
> > + if (AD9910_RAM_ENABLED(st))
> > + return FW_UPLOAD_ERR_HW_ERROR;
> > +
> > + for (idx = 0; idx < AD9910_NUM_PROFILES; idx++)
> > + st->reg_profile[idx] = get_unaligned_be64(&fw_data->profiles[idx]) |
> > + AD9910_PROFILE_RAM_OPEN_MSK;
>
> Here the software cache st->reg_profile[] is overwritten before the SPI
> hardware operations. If any subsequent hardware writes fail, the function
> aborts and the cache is left with partially uploaded profiles.
> Could the original cache be backed up and restored on failure?
I'd say that is not an issue. When RAM is disabled (requirement to enter this function),
this cache is exclusive to be managed by the RAM mode. This is meant to be
written here.
>
> > +
> > + ret = ad9910_reg32_update(st, AD9910_REG_CFR1,
> > + AD9910_CFR1_RAM_PLAYBACK_DEST_MSK |
> > + AD9910_CFR1_INT_PROFILE_CTL_MSK,
> > + get_unaligned_be32(&fw_data->cfr1), true);
> > + if (ret)
> > + return FW_UPLOAD_ERR_RW_ERROR;
> > +
> > + wcount = get_unaligned_be16(&fw_data->wcount);
> > + if (!wcount) {
> > + *written = size;
> > + return FW_UPLOAD_ERR_NONE; /* nothing else to write */
> > + }
> > +
> > + ret = ad9910_profile_set(st, st->profile);
> > + if (ret)
> > + return FW_UPLOAD_ERR_HW_ERROR;
> > +
> > + /* backup profile register and update it with required address range */
> > + backup = st->reg[AD9910_REG_PROFILE(st->profile)].val64;
> > + tmp64 = AD9910_PROFILE_RAM_STEP_RATE_MSK |
> > + FIELD_PREP(AD9910_PROFILE_RAM_START_ADDR_MSK, 0) |
> > + FIELD_PREP(AD9910_PROFILE_RAM_END_ADDR_MSK, wcount - 1);
> > + ret = ad9910_reg64_write(st, AD9910_REG_PROFILE(st->profile), tmp64, true);
> > + if (ret)
> > + return FW_UPLOAD_ERR_RW_ERROR;
> > +
> > + memcpy(&st->tx_buf[1], fw_data->words, wcount * AD9910_RAM_WORD_SIZE);
>
> If the firmware is larger than PAGE_SIZE, size will be 4096 on the first
> call, but wcount could request more bytes than are present in the chunk.
> Does this memcpy() risk reading out-of-bounds memory from the data buffer?
this is dicussed above, from drivers/base/firmware_loader/sysfs_upload.c it
does not seem to be true that the write happens in chunks of PAGE_SIZE.
> [ ... ]
>
> > @@ -1688,7 +2014,25 @@ static int ad9910_probe(struct spi_device *spi)
> > if (ret)
> > return dev_err_probe(dev, ret, "device setup failed\n");
> >
> > - return devm_iio_device_register(dev, indio_dev);
> > + snprintf(st->ram_fwu_name, sizeof(st->ram_fwu_name), "%s:ram",
> > + dev_name(&indio_dev->dev));
>
> The IIO core does not assign the device name to indio_dev->dev until
> devm_iio_device_register() is called. Will dev_name() return "(null)" here,
> resulting in sysfs nodes named "/sys/class/firmware/(null):ram"?
> This might also cause probe failures due to sysfs name collisions if multiple
> AD9910 devices are present on the system.
I suppose that is incorrect.
dev_set_name(&indio_dev->dev, "iio:device%d", iio_dev_opaque->id)
happens during iio_device_alloc().
>
> > + st->ram_fwu = firmware_upload_register(THIS_MODULE, dev, st->ram_fwu_name,
> > + &ad9910_ram_fwu_ops, st);
> > + if (IS_ERR(st->ram_fwu))
> > + return dev_err_probe(dev, PTR_ERR(st->ram_fwu),
> > + "failed to register ram upload ops\n");
> > +
> > + ret = devm_add_action_or_reset(dev, ad9910_ram_fwu_unregister, st->ram_fwu);
> > + if (ret)
> > + return dev_err_probe(dev, ret,
> > + "failed to add ram upload unregister action\n");
> > +
> > + ret = devm_iio_device_register(dev, indio_dev);
> > + if (ret)
> > + return ret;
> > +
> > + ad9910_debugfs_init(st, indio_dev);
> > + return 0;
> > }
>
> --
> Sashiko AI review · https://sashiko.dev/#/patchset/20260517-ad9910-iio-driver-v5-0-31599c88314a@analog.com?part=9
--
Kind regards,
Rodrigo Alencar
^ permalink raw reply
* Re: [PATCH mm-unstable v17 04/14] mm/khugepaged: generalize __collapse_huge_page_* for mTHP support
From: Usama Arif @ 2026-05-18 13:49 UTC (permalink / raw)
To: Nico Pache
Cc: Usama Arif, linux-doc, linux-kernel, linux-mm, linux-trace-kernel,
akpm, anshuman.khandual, apopple, baohua, baolin.wang, byungchul,
catalin.marinas, cl, corbet, dave.hansen, david, dev.jain, gourry,
hannes, hughd, jack, jackmanb, jannh, jglisse, joshua.hahnjy, kas,
lance.yang, liam, ljs, mathieu.desnoyers, matthew.brost, mhiramat,
mhocko, peterx, pfalcato, rakie.kim, raquini, rdunlap,
richard.weiyang, rientjes, rostedt, rppt, ryan.roberts, shivankg,
sunnanyong, surenb, thomas.hellstrom, tiwai, usamaarif642, vbabka,
vishal.moola, wangkefeng.wang, will, willy, yang, ying.huang, ziy,
zokeefe
In-Reply-To: <20260511185817.686831-5-npache@redhat.com>
On Mon, 11 May 2026 12:58:04 -0600 Nico Pache <npache@redhat.com> wrote:
> generalize the order of the __collapse_huge_page_* and collapse_max_*
> functions to support future mTHP collapse.
>
> The current mechanism for determining collapse with the
> khugepaged_max_ptes_none value is not designed with mTHP in mind. This
> raises a key design issue: if we support user defined max_pte_none values
> (even those scaled by order), a collapse of a lower order can introduces
> an feedback loop, or "creep", when max_ptes_none is set to a value greater
> than HPAGE_PMD_NR / 2. [1]
>
> With this configuration, a successful collapse to order N will populate
> enough pages to satisfy the collapse condition on order N+1 on the next
> scan. This leads to unnecessary work and memory churn.
>
> To fix this issue introduce a helper function that will limit mTHP
> collapse support to two max_ptes_none values, 0 and HPAGE_PMD_NR - 1.
> This effectively supports two modes: [2]
>
> - max_ptes_none=0: never collapses if it encounters an empty PTE or a PTE
> that maps the shared zeropage. Consequently, no memory bloat.
> - max_ptes_none=511 (on 4k pagesz): Always collapse to the highest
> available mTHP order.
>
> This removes the possiblilty of "creep", while not modifying any uAPI
> expectations. A warning will be emitted if any non-supported
> max_ptes_none value is configured with mTHP enabled.
>
> mTHP collapse will not honor the khugepaged_max_ptes_shared or
> khugepaged_max_ptes_swap parameters, and will fail if it encounters a
> shared or swapped entry.
>
> No functional changes in this patch; however it defines future behavior
> for mTHP collapse.
>
> [1] - https://lore.kernel.org/all/e46ab3ab-a3d7-4fb7-9970-d0704bd5d05a@arm.com
> [2] - https://lore.kernel.org/all/37375ace-5601-4d6c-9dac-d1c8268698e9@redhat.com
>
> Co-developed-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Dev Jain <dev.jain@arm.com>
> Signed-off-by: Nico Pache <npache@redhat.com>
> ---
> include/trace/events/huge_memory.h | 3 +-
> mm/khugepaged.c | 117 ++++++++++++++++++++---------
> 2 files changed, 85 insertions(+), 35 deletions(-)
>
> diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h
> index bcdc57eea270..443e0bd13fdb 100644
> --- a/include/trace/events/huge_memory.h
> +++ b/include/trace/events/huge_memory.h
> @@ -39,7 +39,8 @@
> EM( SCAN_STORE_FAILED, "store_failed") \
> EM( SCAN_COPY_MC, "copy_poisoned_page") \
> EM( SCAN_PAGE_FILLED, "page_filled") \
> - EMe(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback")
> + EM(SCAN_PAGE_DIRTY_OR_WRITEBACK, "page_dirty_or_writeback") \
> + EMe(SCAN_INVALID_PTES_NONE, "invalid_ptes_none")
>
> #undef EM
> #undef EMe
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index f68853b3caa7..27465161fa6d 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -61,6 +61,7 @@ enum scan_result {
> SCAN_COPY_MC,
> SCAN_PAGE_FILLED,
> SCAN_PAGE_DIRTY_OR_WRITEBACK,
> + SCAN_INVALID_PTES_NONE,
> };
>
> #define CREATE_TRACE_POINTS
> @@ -353,37 +354,60 @@ static bool pte_none_or_zero(pte_t pte)
> * PTEs for the given collapse operation.
> * @cc: The collapse control struct
> * @vma: The vma to check for userfaultfd
> + * @order: The folio order being collapsed to
> *
> * Return: Maximum number of none-page or zero-page PTEs allowed for the
> * collapse operation.
> */
> -static unsigned int collapse_max_ptes_none(struct collapse_control *cc,
> - struct vm_area_struct *vma)
> +static int collapse_max_ptes_none(struct collapse_control *cc,
> + struct vm_area_struct *vma, unsigned int order)
> {
> + unsigned int max_ptes_none = khugepaged_max_ptes_none;
> // If the vma is userfaultfd-armed, allow no none-page or zero-page PTEs.
> if (vma && userfaultfd_armed(vma))
> return 0;
> // for MADV_COLLAPSE, allow any none-page or zero-page PTEs.
> if (!cc->is_khugepaged)
> return HPAGE_PMD_NR;
> - // For all other cases repect the user defined maximum.
> - return khugepaged_max_ptes_none;
> + // for PMD collapse, respect the user defined maximum.
> + if (is_pmd_order(order))
> + return max_ptes_none;
> + /* Zero/non-present collapse disabled. */
> + if (!max_ptes_none)
> + return 0;
> + // for mTHP collapse with the sysctl value set to KHUGEPAGED_MAX_PTES_LIMIT,
> + // scale the maximum number of PTEs to the order of the collapse.
> + if (max_ptes_none == KHUGEPAGED_MAX_PTES_LIMIT)
> + return (1 << order) - 1;
> +
> + // We currently only support max_ptes_none values of 0 or KHUGEPAGED_MAX_PTES_LIMIT.
> + // Emit a warning and return -EINVAL.
> + pr_warn_once("mTHP collapse only supports max_ptes_none values of 0 or %u\n",
> + KHUGEPAGED_MAX_PTES_LIMIT);
> + return -EINVAL;
> }
>
> /**
> * collapse_max_ptes_shared - Calculate maximum allowed PTEs that map shared
> * anonymous pages for the given collapse operation.
> * @cc: The collapse control struct
> + * @order: The folio order being collapsed to
> *
> * Return: Maximum number of PTEs that map shared anonymous pages for the
> * collapse operation
> */
> -static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
> +static unsigned int collapse_max_ptes_shared(struct collapse_control *cc,
> + unsigned int order)
> {
> // for MADV_COLLAPSE, do not restrict the number of PTEs that map shared
> // anonymous pages.
> if (!cc->is_khugepaged)
> return HPAGE_PMD_NR;
> + // for mTHP collapse do not allow collapsing anonymous memory pages that
> + // are shared between processes.
> + if (!is_pmd_order(order))
> + return 0;
> + // for PMD collapse, respect the user defined maximum.
> return khugepaged_max_ptes_shared;
> }
>
> @@ -391,16 +415,22 @@ static unsigned int collapse_max_ptes_shared(struct collapse_control *cc)
> * collapse_max_ptes_swap - Calculate the maximum allowed non-present PTEs or the
> * maximum allowed non-present pagecache entries for the given collapse operation.
> * @cc: The collapse control struct
> + * @order: The folio order being collapsed to
> *
> * Return: Maximum number of non-present PTEs or the maximum allowed non-present
> * pagecache entries for the collapse operation.
> */
> -static unsigned int collapse_max_ptes_swap(struct collapse_control *cc)
> +static unsigned int collapse_max_ptes_swap(struct collapse_control *cc,
> + unsigned int order)
> {
> // for MADV_COLLAPSE, do not restrict the number PTEs entries or
> // pagecache entries that are non-present.
> if (!cc->is_khugepaged)
> return HPAGE_PMD_NR;
> + // for mTHP collapse do not allow any non-present PTEs or pagecache entries.
> + if (!is_pmd_order(order))
> + return 0;
> + // for PMD collapse, respect the user defined maximum.
> return khugepaged_max_ptes_swap;
> }
>
> @@ -594,18 +624,22 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte,
>
> static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> unsigned long start_addr, pte_t *pte, struct collapse_control *cc,
> - struct list_head *compound_pagelist)
> + unsigned int order, struct list_head *compound_pagelist)
> {
> + const unsigned long nr_pages = 1UL << order;
> struct page *page = NULL;
> struct folio *folio = NULL;
> unsigned long addr = start_addr;
> pte_t *_pte;
> int none_or_zero = 0, shared = 0, referenced = 0;
> enum scan_result result = SCAN_FAIL;
> - unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
> - unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
> + int max_ptes_none = collapse_max_ptes_none(cc, vma, order);
> + unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, order);
> +
> + if (max_ptes_none < 0)
> + return SCAN_INVALID_PTES_NONE;
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
> + for (_pte = pte; _pte < pte + nr_pages;
> _pte++, addr += PAGE_SIZE) {
> pte_t pteval = ptep_get(_pte);
> if (pte_none_or_zero(pteval)) {
> @@ -738,18 +772,18 @@ static enum scan_result __collapse_huge_page_isolate(struct vm_area_struct *vma,
> }
>
> static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> - struct vm_area_struct *vma,
> - unsigned long address,
> - spinlock_t *ptl,
> - struct list_head *compound_pagelist)
> + struct vm_area_struct *vma, unsigned long address,
> + spinlock_t *ptl, unsigned int order,
> + struct list_head *compound_pagelist)
> {
> - unsigned long end = address + HPAGE_PMD_SIZE;
> + const unsigned long nr_pages = 1UL << order;
> + unsigned long end = address + (PAGE_SIZE << order);
> struct folio *src, *tmp;
> pte_t pteval;
> pte_t *_pte;
> unsigned int nr_ptes;
>
> - for (_pte = pte; _pte < pte + HPAGE_PMD_NR; _pte += nr_ptes,
> + for (_pte = pte; _pte < pte + nr_pages; _pte += nr_ptes,
> address += nr_ptes * PAGE_SIZE) {
> nr_ptes = 1;
> pteval = ptep_get(_pte);
> @@ -802,11 +836,10 @@ static void __collapse_huge_page_copy_succeeded(pte_t *pte,
> }
>
> static void __collapse_huge_page_copy_failed(pte_t *pte,
> - pmd_t *pmd,
> - pmd_t orig_pmd,
> - struct vm_area_struct *vma,
> - struct list_head *compound_pagelist)
> + pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> + unsigned int order, struct list_head *compound_pagelist)
> {
> + const unsigned long nr_pages = 1UL << order;
> spinlock_t *pmd_ptl;
>
> /*
> @@ -822,7 +855,7 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> * Release both raw and compound pages isolated
> * in __collapse_huge_page_isolate.
> */
> - release_pte_pages(pte, pte + HPAGE_PMD_NR, compound_pagelist);
> + release_pte_pages(pte, pte + nr_pages, compound_pagelist);
> }
>
> /*
> @@ -842,16 +875,17 @@ static void __collapse_huge_page_copy_failed(pte_t *pte,
> */
> static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *folio,
> pmd_t *pmd, pmd_t orig_pmd, struct vm_area_struct *vma,
> - unsigned long address, spinlock_t *ptl,
> + unsigned long address, spinlock_t *ptl, unsigned int order,
> struct list_head *compound_pagelist)
> {
> + const unsigned long nr_pages = 1UL << order;
> unsigned int i;
> enum scan_result result = SCAN_SUCCEED;
>
> /*
> * Copying pages' contents is subject to memory poison at any iteration.
> */
> - for (i = 0; i < HPAGE_PMD_NR; i++) {
> + for (i = 0; i < nr_pages; i++) {
> pte_t pteval = ptep_get(pte + i);
> struct page *page = folio_page(folio, i);
> unsigned long src_addr = address + i * PAGE_SIZE;
> @@ -870,10 +904,10 @@ static enum scan_result __collapse_huge_page_copy(pte_t *pte, struct folio *foli
>
> if (likely(result == SCAN_SUCCEED))
> __collapse_huge_page_copy_succeeded(pte, vma, address, ptl,
> - compound_pagelist);
> + order, compound_pagelist);
> else
> __collapse_huge_page_copy_failed(pte, pmd, orig_pmd, vma,
> - compound_pagelist);
> + order, compound_pagelist);
>
> return result;
> }
> @@ -1044,12 +1078,12 @@ static enum scan_result check_pmd_still_valid(struct mm_struct *mm,
> * Returns result: if not SCAN_SUCCEED, mmap_lock has been released.
> */
Can you add a comment above __collapse_huge_page_swapin function that says its only
done for PMD size only? Something like:
For PMD-order collapse this faults in any swap entries it finds. For mTHP
orders the function bails on the first swap entry with SCAN_EXCEED_SWAP_PTE,
because faulting pages back in during a lower-order collapse could re-populate
PTEs that push a later scan over the threshold for a higher-order collapse.
> static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
> - struct vm_area_struct *vma, unsigned long start_addr, pmd_t *pmd,
> - int referenced)
> + struct vm_area_struct *vma, unsigned long start_addr,
> + pmd_t *pmd, int referenced, unsigned int order)
> {
> int swapped_in = 0;
> vm_fault_t ret = 0;
> - unsigned long addr, end = start_addr + (HPAGE_PMD_NR * PAGE_SIZE);
> + unsigned long addr, end = start_addr + (PAGE_SIZE << order);
> enum scan_result result;
> pte_t *pte = NULL;
> spinlock_t *ptl;
> @@ -1081,6 +1115,19 @@ static enum scan_result __collapse_huge_page_swapin(struct mm_struct *mm,
> pte_present(vmf.orig_pte))
> continue;
>
> + /*
> + * TODO: Support swapin without leading to further mTHP
> + * collapses. Currently bringing in new pages via swapin may
> + * cause a future higher order collapse on a rescan of the same
> + * range.
> + */
> + if (!is_pmd_order(order)) {
> + pte_unmap(pte);
> + mmap_read_unlock(mm);
> + result = SCAN_EXCEED_SWAP_PTE;
> + goto out;
> + }
> +
> vmf.pte = pte;
> vmf.ptl = ptl;
> ret = do_swap_page(&vmf);
> @@ -1200,7 +1247,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> * that case. Continuing to collapse causes inconsistency.
> */
> result = __collapse_huge_page_swapin(mm, vma, address, pmd,
> - referenced);
> + referenced, HPAGE_PMD_ORDER);
> if (result != SCAN_SUCCEED)
> goto out_nolock;
> }
> @@ -1248,6 +1295,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
> pte = pte_offset_map_lock(mm, &_pmd, address, &pte_ptl);
> if (pte) {
> result = __collapse_huge_page_isolate(vma, address, pte, cc,
> + HPAGE_PMD_ORDER,
> &compound_pagelist);
> spin_unlock(pte_ptl);
> } else {
> @@ -1278,6 +1326,7 @@ static enum scan_result collapse_huge_page(struct mm_struct *mm, unsigned long a
>
> result = __collapse_huge_page_copy(pte, folio, pmd, _pmd,
> vma, address, pte_ptl,
> + HPAGE_PMD_ORDER,
> &compound_pagelist);
> pte_unmap(pte);
> if (unlikely(result != SCAN_SUCCEED))
> @@ -1313,9 +1362,9 @@ static enum scan_result collapse_scan_pmd(struct mm_struct *mm,
> struct vm_area_struct *vma, unsigned long start_addr,
> bool *lock_dropped, struct collapse_control *cc)
> {
> - const unsigned int max_ptes_none = collapse_max_ptes_none(cc, vma);
> - const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc);
> - const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
> + const int max_ptes_none = collapse_max_ptes_none(cc, vma, HPAGE_PMD_ORDER);
> + const unsigned int max_ptes_shared = collapse_max_ptes_shared(cc, HPAGE_PMD_ORDER);
> + const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
> pmd_t *pmd;
> pte_t *pte, *_pte;
> int none_or_zero = 0, shared = 0, referenced = 0;
> @@ -2369,8 +2418,8 @@ static enum scan_result collapse_scan_file(struct mm_struct *mm,
> unsigned long addr, struct file *file, pgoff_t start,
> struct collapse_control *cc)
> {
> - const unsigned int max_ptes_none = collapse_max_ptes_none(cc, NULL);
> - const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc);
> + const int max_ptes_none = collapse_max_ptes_none(cc, NULL, HPAGE_PMD_ORDER);
> + const unsigned int max_ptes_swap = collapse_max_ptes_swap(cc, HPAGE_PMD_ORDER);
> struct folio *folio = NULL;
> struct address_space *mapping = file->f_mapping;
> XA_STATE(xas, &mapping->i_pages, start);
> --
> 2.54.0
>
>
^ permalink raw reply
* Re: [PATCH v3 2/2] cpufreq: CPPC: add autonomous mode boot parameter support
From: Sumit Gupta @ 2026-05-18 13:49 UTC (permalink / raw)
To: Randy Dunlap, rafael, viresh.kumar, pierre.gondois,
ionela.voinescu, zhenglifeng1, zhanjie9, corbet, skhan,
mario.limonciello, linux-pm, linux-doc, linux-kernel
Cc: linux-tegra, treding, jonathanh, vsethi, ksitaraman, sanjayc,
mochs, bbasu
In-Reply-To: <b4516579-c4bf-4ddd-843a-30d4a4992519@infradead.org>
Hi Randy,
On 16/05/26 03:44, Randy Dunlap wrote:
>
> On 5/15/26 5:26 AM, Sumit Gupta wrote:
>> diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
>> index 0eb64aab3685..7e4b3a8fd76f 100644
>> --- a/Documentation/admin-guide/kernel-parameters.txt
>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>> @@ -1048,6 +1048,22 @@ Kernel parameters
>> policy to use. This governor must be registered in the
>> kernel before the cpufreq driver probes.
>>
>> + cppc_cpufreq.auto_sel_mode=
>> + [CPU_FREQ] Enable ACPI CPPC autonomous performance
> I just noticed that we should have both CPU_FREQ and CPU_IDLE added to the
> legend (meanings) section at the very beginning of this file, but that
> doesn't have to be part of this patch.
Thanks.
Will send a separate patch adding CPU_FREQ and CPU_IDLE to the legend.
Thank you,
Sumit Gupta
>
>> + selection. When enabled, hardware automatically adjusts
>> + CPU frequency on all CPUs based on workload demands.
>> + In Autonomous mode, Energy Performance Preference (EPP)
>> + hints guide hardware toward performance (0x0) or energy
>> + efficiency (0xff).
>> + Requires ACPI CPPC autonomous selection register
>> + support.
>> + Accepts:
>> + performance, 1: enable auto_sel + set EPP to
>> + performance (0x0)
>> + default_epp, 2: enable auto_sel, preserve EPP value
>> + programmed by BIOS/firmware
>> + Unset: cpufreq governors are used (auto_sel disabled).
^ permalink raw reply
* Re: [PATCH RFC v4 09/10] Documentation: ABI: testing: add docs for ad9910 sysfs entries
From: Jonathan Cameron @ 2026-05-18 13:45 UTC (permalink / raw)
To: Rodrigo Alencar
Cc: Rodrigo Alencar via B4 Relay, rodrigo.alencar, linux-iio,
devicetree, linux-kernel, linux-doc, linux-hardening,
Lars-Peter Clausen, Michael Hennerich, David Lechner,
Andy Shevchenko, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Philipp Zabel, Jonathan Corbet, Shuah Khan, Kees Cook,
Gustavo A. R. Silva
In-Reply-To: <yrabhhhdkzmiuxlqzrrj6a47ftlzwvva7r2korzeszdy4yqrin@xl6obhhnnas4>
On Sun, 17 May 2026 18:30:27 +0100
Rodrigo Alencar <455.rodrigo.alencar@gmail.com> wrote:
> On 26/05/17 03:58PM, Jonathan Cameron wrote:
> > On Fri, 08 May 2026 18:00:25 +0100
> > Rodrigo Alencar via B4 Relay <devnull+rodrigo.alencar.analog.com@kernel.org> wrote:
> >
> > > From: Rodrigo Alencar <rodrigo.alencar@analog.com>
> > >
> > > Add custom ABI documentation file for the DDS AD9910 with sysfs entries to
> > > control Parallel Port, Digital Ramp Generator and OSK parameters.
> > >
> > > Signed-off-by: Rodrigo Alencar <rodrigo.alencar@analog.com>
> > I'm fine with phase and frequency as defined, but for the scaling it made me wonder.
> > For outvoltage0 channels the assumption the value is the peak voltage so if
> > we know what input to be modulated by the ramp generator can we express them
> > in volts (well milivolts) rather than as a scaling multiplier?
>
> The DAC output is current-based and differential. Voltage conversion would happen
> outside the device...
Why aren't we representing this as out_altcurrentX-Y_xxxx?
> using a resistor load or an op-amp transimpedance stage,
> and I am no expert on that, but that often requires impedance matching so voltage
> levels may depend on the frequency. Then, I suppose that voltage is not the right
> unit to use.
Understood that it can get complex!
>
> The scale here controls the amplitude of the varying signal. Assuming the peak voltage
> (amplitude) is constant means we have a constant envelope, but that should not mean
> we can't control it or it should not mean that the hardware can have other ways to
> control it. That said, scale behaves as a "gain multiplier".
Understood. Given it's the envelope then if scale happened to be 1 always it would
be presented as _processed. So this is consistent with other channel types.
>
> >
> > That seems to me like it fits better with the overall ABI.
> >
> > > +What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_scale_offset
> > > +KernelVersion:
> > > +Contact: linux-iio@vger.kernel.org
> > > +Description:
> > > + For a channel that allows amplitude control through buffers, this
> > > + represents the value for a base amplitude scale. The actual output
> > > + amplitude scale is a result with the sum of this value.
> > > +
> >
> > > +
> > > +What: /sys/bus/iio/devices/iio:deviceX/out_altvoltageY_scale_roc
> >
> > Silly question perhaps but can work out how this related to millivolts/sec
> > That might make a more intuitive interface than scaling multiplier per sec
> > Perhaps the combination with offset makes this impossible though maybe that
> > could be a expressed as a voltage offset? Afterall if the amplitude being
> > scaled is 5V then 5 * (offset + scale) = 5 * offset + 5 * scale
> >
> > > +KernelVersion:
> > > +Contact: linux-iio@vger.kernel.org
> > > +Description:
> > > + Amplitude scale rate of change in 1/s for channels that ramp
> > > + amplitude. This value may be influenced by the channel's
> > > + sampling_frequency setting.
> >
> >
>
^ permalink raw reply
* Re: [PATCH v3 2/2] cpufreq: CPPC: add autonomous mode boot parameter support
From: Sumit Gupta @ 2026-05-18 13:44 UTC (permalink / raw)
To: Mario Limonciello, rafael, viresh.kumar, pierre.gondois,
ionela.voinescu, zhenglifeng1, zhanjie9, corbet, skhan, rdunlap,
linux-pm, linux-doc, linux-kernel
Cc: linux-tegra, treding, jonathanh, vsethi, ksitaraman, sanjayc,
mochs, bbasu
In-Reply-To: <bf521e4e-1aa5-49ce-bec5-52845f02214e@amd.com>
Hi Mario,
On 16/05/26 02:43, Mario Limonciello wrote:
> External email: Use caution opening links or attachments
>
>
> On 5/15/26 07:26, Sumit Gupta wrote:
>> Add a kernel boot parameter 'cppc_cpufreq.auto_sel_mode' to enable
>> CPPC autonomous performance selection on all CPUs at system startup.
>> When autonomous mode is enabled, the hardware automatically adjusts
>> CPU performance based on workload demands using Energy Performance
>> Preference (EPP) hints.
>>
>> When the parameter is set:
>> - Configure all CPUs for autonomous operation on first init
>> - Use HW min/max_perf when available; otherwise initialize from caps
>> - Initialize desired_perf to max_perf as a starting hint
>> - Hardware controls frequency instead of the OS governor
>> - EPP behavior depends on parameter value:
>> - performance (or 1): override EPP to performance preference (0x0)
>> - default_epp (or 2): preserve EPP value programmed by BIOS/firmware
>>
>> The boot parameter is applied only during first policy initialization.
>> Skip applying it on CPU hotplug to preserve runtime sysfs configuration.
>>
>> This patch depends on patch series [1] ("cpufreq: Set policy->min and
>> max as real QoS constraints") so that the policy->min/max set in
>> cppc_cpufreq_cpu_init() are not overridden by cpufreq_set_policy()
>> during init.
>>
>> Signed-off-by: Sumit Gupta <sumitg@nvidia.com>
>> ---
>> [1]
>> https://lore.kernel.org/lkml/20260511135538.522653-1-pierre.gondois@arm.com/
>> ---
>> .../admin-guide/kernel-parameters.txt | 16 +++
>> drivers/cpufreq/cppc_cpufreq.c | 122 +++++++++++++++++-
>> 2 files changed, 133 insertions(+), 5 deletions(-)
>>
>> diff --git a/Documentation/admin-guide/kernel-parameters.txt
>> b/Documentation/admin-guide/kernel-parameters.txt
>> index 0eb64aab3685..7e4b3a8fd76f 100644
>> --- a/Documentation/admin-guide/kernel-parameters.txt
>> +++ b/Documentation/admin-guide/kernel-parameters.txt
>> @@ -1048,6 +1048,22 @@ Kernel parameters
>> policy to use. This governor must be registered
>> in the
>> kernel before the cpufreq driver probes.
>>
>> + cppc_cpufreq.auto_sel_mode=
>> + [CPU_FREQ] Enable ACPI CPPC autonomous performance
>> + selection. When enabled, hardware automatically
>> adjusts
>> + CPU frequency on all CPUs based on workload
>> demands.
>> + In Autonomous mode, Energy Performance
>> Preference (EPP)
>> + hints guide hardware toward performance (0x0)
>> or energy
>> + efficiency (0xff).
>> + Requires ACPI CPPC autonomous selection register
>> + support.
>> + Accepts:
>> + performance, 1: enable auto_sel + set EPP to
>> + performance (0x0)
>> + default_epp, 2: enable auto_sel, preserve EPP
>> value
>> + programmed by BIOS/firmware
>> + Unset: cpufreq governors are used (auto_sel
>> disabled).
>
> Rather than unset doing nothing, have you considered having it take a
> midpoint like 128? That's what we do in amd-pstate (default to
> balance_performance). I think it turns into a reasonable balance.
Thanks for the suggestion.
I can add balance_performance that enables auto_sel with EPP=128 in v4.
On changing the driver default (no param behavior) to auto enable
balance_performance, it would be good to keep the current behavior for
now since cppc_cpufreq is generic across ARM64/RISC-V platforms where
EPP and Autonomous Selection registers are optional.
A default change would affect existing users relying on governors.
Thank you,
Sumit Gupta
>
>> +
>> cpu_init_udelay=N
>> [X86,EARLY] Delay for N microsec between assert
>> and de-assert
>> of APIC INIT to start processors. This delay
>> occurs
>> diff --git a/drivers/cpufreq/cppc_cpufreq.c
>> b/drivers/cpufreq/cppc_cpufreq.c
>> index 6b54427b52e1..5f4d735e7c7d 100644
>> --- a/drivers/cpufreq/cppc_cpufreq.c
>> +++ b/drivers/cpufreq/cppc_cpufreq.c
>> @@ -28,6 +28,43 @@
>>
>> static struct cpufreq_driver cppc_cpufreq_driver;
>>
>> +/* Autonomous Selection boot parameter modes */
>> +enum {
>> + AUTO_SEL_PERFORMANCE = 1,
>> + AUTO_SEL_DEFAULT_EPP = 2,
>> +};
>> +
>> +static int auto_sel_mode;
>> +
>> +static int auto_sel_mode_set(const char *val, const struct
>> kernel_param *kp)
>> +{
>> + if (sysfs_streq(val, "performance") || sysfs_streq(val, "1"))
>> + *(int *)kp->arg = AUTO_SEL_PERFORMANCE;
>> + else if (sysfs_streq(val, "default_epp") || sysfs_streq(val, "2"))
>> + *(int *)kp->arg = AUTO_SEL_DEFAULT_EPP;
>> + else
>> + return -EINVAL;
>> +
>> + return 0;
>> +}
>> +
>> +static int auto_sel_mode_get(char *buffer, const struct kernel_param
>> *kp)
>> +{
>> + switch (*(int *)kp->arg) {
>> + case AUTO_SEL_PERFORMANCE:
>> + return sysfs_emit(buffer, "performance\n");
>> + case AUTO_SEL_DEFAULT_EPP:
>> + return sysfs_emit(buffer, "default_epp\n");
>> + default:
>> + return sysfs_emit(buffer, "disabled\n");
>> + }
>> +}
>> +
>> +static const struct kernel_param_ops auto_sel_mode_ops = {
>> + .set = auto_sel_mode_set,
>> + .get = auto_sel_mode_get,
>> +};
>> +
>> #ifdef CONFIG_ACPI_CPPC_CPUFREQ_FIE
>> static enum {
>> FIE_UNSET = -1,
>> @@ -715,11 +752,75 @@ static int cppc_cpufreq_cpu_init(struct
>> cpufreq_policy *policy)
>> policy->cur = cppc_perf_to_khz(caps, caps->highest_perf);
>> cpu_data->perf_ctrls.desired_perf = caps->highest_perf;
>>
>> - ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>> - if (ret) {
>> - pr_debug("Err setting perf value:%d on CPU:%d. ret:%d\n",
>> - caps->highest_perf, cpu, ret);
>> - goto out;
>> + /*
>> + * Enable autonomous mode on first init if boot param is set.
>> + * Check last_governor to detect first init and skip if auto_sel
>> + * is already enabled.
>> + */
>> + if (auto_sel_mode && policy->last_governor[0] == '\0' &&
>> + !cpu_data->perf_ctrls.auto_sel) {
>> + /* Init min/max_perf from caps if not already set by
>> HW. */
>> + if (!cpu_data->perf_ctrls.min_perf)
>> + cpu_data->perf_ctrls.min_perf =
>> caps->lowest_nonlinear_perf;
>> + if (!cpu_data->perf_ctrls.max_perf)
>> + cpu_data->perf_ctrls.max_perf =
>> policy->boost_enabled ?
>> + caps->highest_perf : caps->nominal_perf;
>> +
>> + /*
>> + * In autonomous mode desired_perf is only a hint; EPP and
>> + * the platform drive actual selection within [min, max].
>> + * Initialize it to max_perf so HW starts at the upper
>> bound.
>> + */
>> + cpu_data->perf_ctrls.desired_perf =
>> cpu_data->perf_ctrls.max_perf;
>> +
>> + policy->cur = cppc_perf_to_khz(caps,
>> + cpu_data->perf_ctrls.desired_perf);
>> +
>> + /*
>> + * Override EPP only in 'performance' mode;
>> 'default_epp' mode
>> + * preserves the BIOS/firmware programmed EPP value.
>> + * EPP is optional - some platforms may not support it.
>> + */
>> + if (auto_sel_mode == AUTO_SEL_PERFORMANCE) {
>> + ret = cppc_set_epp(cpu,
>> CPPC_EPP_PERFORMANCE_PREF);
>> + if (ret && ret != -EOPNOTSUPP)
>> + pr_warn("Failed to set EPP for CPU%d
>> (%d)\n", cpu, ret);
>> + else if (!ret)
>> + cpu_data->perf_ctrls.energy_perf = CPPC_EPP_PERFORMANCE_PREF;
>> + }
>> +
>> + /* Program min/max/desired into CPPC regs (non-fatal on
>> failure). */
>> + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>> + if (ret)
>> + pr_warn("set_perf failed CPU%d (%d); using HW
>> values\n",
>> + cpu, ret);
>> +
>> + ret = cppc_set_auto_sel(cpu, true);
>> + if (ret && ret != -EOPNOTSUPP)
>> + pr_warn("auto_sel CPU%d failed (%d); using OS
>> mode\n",
>> + cpu, ret);
>> + else if (!ret)
>> + cpu_data->perf_ctrls.auto_sel = true;
>> + }
>> +
>> + if (cpu_data->perf_ctrls.auto_sel) {
>> + /* Sync policy limits from HW when autonomous mode is
>> active */
>> + policy->min = cppc_perf_to_khz(caps,
>> + cpu_data->perf_ctrls.min_perf ?:
>> + caps->lowest_nonlinear_perf);
>> + policy->max = cppc_perf_to_khz(caps,
>> + cpu_data->perf_ctrls.max_perf ?:
>> + (policy->boost_enabled ?
>> + caps->highest_perf :
>> + caps->nominal_perf));
>> + } else {
>> + /* Normal mode: governors control frequency */
>> + ret = cppc_set_perf(cpu, &cpu_data->perf_ctrls);
>> + if (ret) {
>> + pr_debug("Err setting perf value:%d on CPU:%d.
>> ret:%d\n",
>> + caps->highest_perf, cpu, ret);
>> + goto out;
>> + }
>> }
>>
>> cppc_cpufreq_cpu_fie_init(policy);
>> @@ -1079,10 +1180,21 @@ static int __init cppc_cpufreq_init(void)
>>
>> static void __exit cppc_cpufreq_exit(void)
>> {
>> + unsigned int cpu;
>> +
>> + for_each_present_cpu(cpu)
>> + cppc_set_auto_sel(cpu, false);
>> +
>> cpufreq_unregister_driver(&cppc_cpufreq_driver);
>> cppc_freq_invariance_exit();
>> }
>>
>> +module_param_cb(auto_sel_mode, &auto_sel_mode_ops, &auto_sel_mode,
>> 0444);
>> +MODULE_PARM_DESC(auto_sel_mode,
>> + "Enable CPPC autonomous performance selection at boot: "
>> + "performance or 1 (EPP=performance), "
>> + "default_epp or 2 (preserve BIOS/firmware EPP)");
>> +
>> module_exit(cppc_cpufreq_exit);
>> MODULE_AUTHOR("Ashwin Chaugule");
>> MODULE_DESCRIPTION("CPUFreq driver based on the ACPI CPPC v5.0+
>> spec");
>
^ permalink raw reply
* Re: [PATCH RFC v4 03/10] iio: frequency: ad9910: initial driver implementation
From: Jonathan Cameron @ 2026-05-18 13:42 UTC (permalink / raw)
To: Rodrigo Alencar
Cc: Rodrigo Alencar via B4 Relay, rodrigo.alencar, linux-iio,
devicetree, linux-kernel, linux-doc, linux-hardening,
Lars-Peter Clausen, Michael Hennerich, David Lechner,
Andy Shevchenko, Rob Herring, Krzysztof Kozlowski, Conor Dooley,
Philipp Zabel, Jonathan Corbet, Shuah Khan, Kees Cook,
Gustavo A. R. Silva
In-Reply-To: <is4rbxohz5icbaslatmjmzhb5oztnh6ytmntgkn3rssjijppb3@mur2heagprdi>
> > > + case IIO_CHAN_INFO_FREQUENCY:
> > > + switch (chan->channel) {
> > > + case AD9910_CHANNEL_PROFILE_0 ... AD9910_CHANNEL_PROFILE_7:
> > > + tmp32 = chan->channel - AD9910_CHANNEL_PROFILE_0;
> > > + tmp64 = FIELD_GET(AD9910_PROFILE_ST_FTW_MSK,
> > > + st->reg[AD9910_REG_PROFILE(tmp32)].val64);
> > > + break;
> > > + default:
> > > + return -EINVAL;
> > > + }
> > > + tmp64 *= st->data.sysclk_freq_hz;
> > > + *val = tmp64 >> 32;
> > > + *val2 = ((tmp64 & GENMASK_ULL(31, 0)) * MICRO) >> 32;
> >
> > Why in this particular case have this outside the switch / case whereas in others
> > you do the full maths and set inside? I'd put it inside and not worry about slightly
> > long lines.
>
> for frequency, those calculations are going to be common for the other channels that are
> going to be populated by other patches...
>
> DRG up/down and RAM will have tmp64 populated with a FTW value.
Makes sense. Thanks,
^ permalink raw reply
* Re: [PATCH] dcache: add fs.dentry-limit sysctl with negative-first reaper
From: Ian Kent @ 2026-05-18 13:39 UTC (permalink / raw)
To: Jan Kara
Cc: NeilBrown, Horst Birthelmer, Amir Goldstein, Miklos Szeredi,
Jonathan Corbet, Shuah Khan, Alexander Viro, Christian Brauner,
linux-doc, linux-kernel, linux-fsdevel, Horst Birthelmer
In-Reply-To: <yk2hem4zwinm4glenpc74to7sm5kyriksgwn6mxh7t4saotiba@7zik7jcnbs5m>
On 18/5/26 16:19, Jan Kara wrote:
> Hi Ian,
>
> On Mon 18-05-26 10:55:43, Ian Kent wrote:
>> On 18/5/26 07:55, NeilBrown wrote:
>>> On Fri, 15 May 2026, Horst Birthelmer wrote:
>>> According to the email you linked, a problem arises when a directory has
>>> a great many negative children. Code which walks the list of children
>>> (such as fsnotify) while holding a lock can suffer unpredictable delays
>>> and result in long lock-hold times. So maybe a limit on negative
>>> dentries for any parent is what we really want. That would be clumsy to
>>> implement I imagine.
>> But the notion of dropping the dentry in ->d_delete() on last dput() is
>> simple enough but did see regressions (the only other place in the VFS
>> besides dentry_kill() that the inode is unlinked from the dentry on
>> dput()). I wonder if the regression was related to the test itself
>> deliberately recreating deleted files and if that really is normal
>> behaviour. By itself that should prevent almost all negative dentries
>> being retained. Although file systems could do this as well (think XFS
>> inode recycling) it should be reasonable to require it be left to the
>> VFS.
>>
>> But even that's not enough given that, in my case, there would still be
>> around 4 million dentries in the LRU cache and in fsnotify there are
>> directory child traversals holding the parent i_lock "spinlock" that are
>> going to cause problems.
> Do you mean there are very many positive children of a directory?
Didn't quantify that.
The symptom is the "Spinlock held for more than ... seconds" occurring
in the log. So there are certainly a lot of children in the list, but
it's an assumption the ratio of positive to negative entries is roughly
the same as the overall ratio in the dcache.
>
>> That's all that much more puzzling when I see things like commit
>> 172e422ffea2 ("fsnotify: clear PARENT_WATCHED flags lazily") which looks
>> like it implies the child flag depends entirely on the parent state (what
>> am I missing Amir?)
> PARENT_WATCHED dentry flags (as the name suggests) are only caching the
> information whether the parent has notification marks receiving events from
> the child. So yes, the flag fully depends on the parent state.
Ok, this is something I was after, I will keep looking at the fsnotify
code since there is something to find, thanks for that.
>
>> so why is this traversal even retained in fsnotify?
> Not sure which traversal you mean but if you set watch on a parent, you
> have to walk all children to set PARENT_WATCHED flag so that you don't miss
> events on children...
Yes, that traversal is what I'm questioning ... again thanks.
I think the function name is still fsnotify_set_children_dentry_flags() in
recent kernels, the subject of commit 172e422ffea2 I mentioned above.
When you say miss events are you saying that accessing the parent dentry to
work out if the child needs to respond to an event is quite expensive in the
overall event processing context, that might make more sense to me ... or do
I completely not yet understand the reasoning behind the need for the flag?
>
>>> But what if we move dentries to the end of the list when they become
>>> negative, and to the start of the list when they become positive? Then
>>> code which walks the child list could simply abort on the first
>>> negative.
>>>
>>> I doubt that would be quite as easy as it sounds, but it would at least
>>> be more focused on the observed symptom rather than some whole-system
>>> number which only vaguely correlates with the observed symptom.
>>>
>>> Maybe a completely different approach: change children-walking code to
>>> drop and retake the lock (with appropriate validation) periodically.
>>> What too would address the specific symptom.
>> Another good question.
>>
>> I have assumed that dropping and re-taking the lock cannot be done but
>> this is a question I would like answered as well. Dropping and re-taking
>> lock would require, as Miklos pointed out to me off-list, recording the
>> list position with say a cursor, introducing unwanted complexity when it
>> would be better to accept the cost of a single extra access to the parent
>> flags (which I assume is one reason to set the flag in the child).
> The parent access is actually more expensive than you might think. Based on
> experience with past fsnotify related performance regression I expect some
> 20% performance hit for small tmpfs writes if you add unconditional parent
> access to the write path.
That sounds like a lot for what should be a memory access of an already in
memory structure since the parent must be accessed to traverse the list of
child entries. I clearly don't fully understand the implications of what
I'm saying but there has been mention of another context ...
Nevertheless more useful information, ;)
Thanks again,
Ian
^ permalink raw reply
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox