* Re: [PATCH v4 4/9] bus: mhi: Centralize firmware image table selection at probe time
From: Kishore Batta @ 2026-04-14 9:49 UTC (permalink / raw)
To: Manivannan Sadhasivam
Cc: Jonathan Corbet, Shuah Khan, Jeff Hugo, Carl Vanderlip,
Oded Gabbay, andersson, linux-doc, linux-kernel, linux-arm-msm,
dri-devel, mhi
In-Reply-To: <2sykuv6r643v3i6ymdoevzohoxdmgrrodvgpbaystskz7fwgun@fd3p7gcso252>
On 4/13/2026 4:56 PM, Manivannan Sadhasivam wrote:
> On Thu, Mar 19, 2026 at 12:01:44PM +0530, Kishore Batta wrote:
>> The Sahara driver currently selects firmware image tables using
>> scattered, device specific conditionals in the probe path, making the
>> logic harder to follow and extend.
>>
>> Refactor firmware image table selection into a single, explicit probe-time
>> mechanism by introducing a variant table that captures device matching,
>> firmware image tables, firmware folder names, and streaming behavior in
>> one place.
>>
>> This centralizes device specific decisions, simplifies the probe logic,
>> and avoids ad-hoc conditionals while preserving the existing behavior for
>> all supported AIC devices.
>>
>> Signed-off-by: Kishore Batta <kishore.batta@oss.qualcomm.com>
>> ---
>> drivers/bus/mhi/sahara/sahara.c | 66 ++++++++++++++++++++++++++++++++++++-----
>> 1 file changed, 58 insertions(+), 8 deletions(-)
>>
>> diff --git a/drivers/bus/mhi/sahara/sahara.c b/drivers/bus/mhi/sahara/sahara.c
>> index e3499977e7c6b53bc624a8eb00d0636f2ea63307..8f1c0d72066c0cf80c09d78bfc51df2e482133b9 100644
>> --- a/drivers/bus/mhi/sahara/sahara.c
>> +++ b/drivers/bus/mhi/sahara/sahara.c
>> @@ -180,6 +180,16 @@ struct sahara_context {
>> u32 read_data_length;
>> bool is_mem_dump_mode;
>> bool non_streaming;
>> + const char *fw_folder;
>> +};
>> +
>> +struct sahara_variant {
>> + const char *match;
>> + bool match_is_chan;
> This name makes no sense.
>
> - Mani
I will drop this in the next version.
>> + const char * const *image_table;
>> + size_t table_size;
>> + const char *fw_folder;
>> + bool non_streaming;
>> };
>>
>> static const char * const aic100_image_table[] = {
>> @@ -224,11 +234,50 @@ static const char * const aic200_image_table[] = {
>> [78] = "qcom/aic200/pvs.bin",
>> };
>>
>> +static const struct sahara_variant sahara_variants[] = {
>> + {
>> + .match = "AIC100",
>> + .match_is_chan = false,
>> + .image_table = aic100_image_table,
>> + .table_size = ARRAY_SIZE(aic100_image_table),
>> + .fw_folder = "aic100",
>> + .non_streaming = true,
>> + },
>> + {
>> + .match = "AIC200",
>> + .match_is_chan = false,
>> + .image_table = aic200_image_table,
>> + .table_size = ARRAY_SIZE(aic200_image_table),
>> + .fw_folder = "aic200",
>> + .non_streaming = false,
>> + }
>> +};
>> +
>> static bool is_streaming(struct sahara_context *context)
>> {
>> return !context->non_streaming;
>> }
>>
>> +static const struct sahara_variant *sahara_select_variant(struct mhi_device *mhi_dev,
>> + const struct mhi_device_id *id)
>> +{
>> + int i;
>> +
>> + for (i = 0; i < ARRAY_SIZE(sahara_variants); i++) {
>> + const struct sahara_variant *v = &sahara_variants[i];
>> +
>> + if (v->match_is_chan) {
>> + if (id && id->chan && !strcmp(id->chan, v->match))
>> + return v;
>> + } else {
>> + if (mhi_dev->mhi_cntrl && mhi_dev->mhi_cntrl->name &&
>> + !strcmp(mhi_dev->mhi_cntrl->name, v->match))
>> + return v;
>> + }
>> + }
>> + return NULL;
>> +}
>> +
>> static int sahara_find_image(struct sahara_context *context, u32 image_id)
>> {
>> int ret;
>> @@ -797,6 +846,7 @@ static void sahara_read_data_processing(struct work_struct *work)
>>
>> static int sahara_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_id *id)
>> {
>> + const struct sahara_variant *variant;
>> struct sahara_context *context;
>> int ret;
>> int i;
>> @@ -809,14 +859,14 @@ static int sahara_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_
>> if (!context->rx)
>> return -ENOMEM;
>>
>> - if (!strcmp(mhi_dev->mhi_cntrl->name, "AIC200")) {
>> - context->image_table = aic200_image_table;
>> - context->table_size = ARRAY_SIZE(aic200_image_table);
>> - } else {
>> - context->image_table = aic100_image_table;
>> - context->table_size = ARRAY_SIZE(aic100_image_table);
>> - context->non_streaming = true;
>> - }
>> + variant = sahara_select_variant(mhi_dev, id);
>> + if (!variant)
>> + return -ENODEV;
>> +
>> + context->image_table = variant->image_table;
>> + context->table_size = variant->table_size;
>> + context->non_streaming = variant->non_streaming;
>> + context->fw_folder = variant->fw_folder;
>>
>> /*
>> * There are two firmware implementations for READ_DATA handling.
>>
>> --
>> 2.34.1
>>
^ permalink raw reply
* Re: [PATCH v4 5/9] bus: mhi: Add QDU100 variant and image_id firmware fallback
From: Kishore Batta @ 2026-04-14 9:51 UTC (permalink / raw)
To: Manivannan Sadhasivam
Cc: Jonathan Corbet, Shuah Khan, Jeff Hugo, Carl Vanderlip,
Oded Gabbay, andersson, linux-doc, linux-kernel, linux-arm-msm,
dri-devel, mhi
In-Reply-To: <5lfbhyzyyji6cuve3uzd26rfgnqotcupelppgehdj36dq7op6j@hn3jmhtqzntq>
On 4/13/2026 5:04 PM, Manivannan Sadhasivam wrote:
> On Thu, Mar 19, 2026 at 12:01:45PM +0530, Kishore Batta wrote:
>> The Sahara driver currently selects a firmware image table based on the
>> attached device, but it does not recognize QDU100 devices that expose the
>> protocol on the SAHARA MHI channel. As a result, the host cannot associate
>> QDU100 devices with the correct firmware namespace during image transfer.
>>
>> Extend the probe-time variant selection to match the SAHARA MHI channel
>> and associate it with the QDU100 firmware folder. Add an image_id based
>> firmware lookup fallback for cases where an image does not have an explicit
>> table entry. This allows required images to be provisioned by the platform
>> without requiring device specific client drivers or additional registration
>> mechanisms.
>>
>> This change only affects devices matched on the SAHARA channel and does not
>> change behavior for existing AIC100 and AIC200 devices.
>>
>> Signed-off-by: Kishore Batta <kishore.batta@oss.qualcomm.com>
>> ---
>> drivers/bus/mhi/sahara/sahara.c | 77 ++++++++++++++++++++++++++++++++++++++---
>> 1 file changed, 72 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/bus/mhi/sahara/sahara.c b/drivers/bus/mhi/sahara/sahara.c
>> index 8f1c0d72066c0cf80c09d78bfc51df2e482133b9..4ea14c57774f51a778289d7409372a6ab21fea60 100644
>> --- a/drivers/bus/mhi/sahara/sahara.c
>> +++ b/drivers/bus/mhi/sahara/sahara.c
>> @@ -234,6 +234,36 @@ static const char * const aic200_image_table[] = {
>> [78] = "qcom/aic200/pvs.bin",
>> };
>>
>> +static const char * const qdu100_image_table[] = {
>> + [5] = "qcom/qdu100/uefi.elf",
>> + [8] = "qcom/qdu100/qdsp6sw.mbn",
>> + [16] = "qcom/qdu100/efs1.bin",
>> + [17] = "qcom/qdu100/efs2.bin",
>> + [20] = "qcom/qdu100/efs3.bin",
>> + [23] = "qcom/qdu100/aop.mbn",
>> + [25] = "qcom/qdu100/tz.mbn",
>> + [29] = "qcom/qdu100/zeros_1sector.bin",
>> + [33] = "qcom/qdu100/hypvm.mbn",
>> + [34] = "qcom/qdu100/mdmddr.mbn",
>> + [36] = "qcom/qdu100/multi_image_qti.mbn",
>> + [37] = "qcom/qdu100/multi_image.mbn",
>> + [38] = "qcom/qdu100/xbl_config.elf",
>> + [39] = "qcom/qdu100/abl_userdebug.elf",
>> + [40] = "qcom/qdu100/zeros_1sector.bin",
>> + [41] = "qcom/qdu100/devcfg.mbn",
>> + [42] = "qcom/qdu100/zeros_1sector.bin",
>> + [45] = "qcom/qdu100/tools_l.elf",
>> + [46] = "qcom/qdu100/Quantum.elf",
>> + [47] = "qcom/qdu100/quest.elf",
>> + [48] = "qcom/qdu100/xbl_ramdump.elf",
>> + [49] = "qcom/qdu100/shrm.elf",
>> + [50] = "qcom/qdu100/cpucp.elf",
>> + [51] = "qcom/qdu100/aop_devcfg.mbn",
>> + [52] = "qcom/qdu100/fw_csm_gsi_3.0.elf",
>> + [53] = "qcom/qdu100/qdsp6sw_dtbs.elf",
>> + [54] = "qcom/qdu100/qupv3fw.elf",
>> +};
> Why does the Sahara driver hardcode these firmware names in the first place? Sahara
> is just a protocol to transfer these images to the device, so this driver
> shouldn't have any device specific info hardcoded. IMO, this should just act as
> a pure library. These firmware names should come from MHI controller drivers
> instead.
>
> - Mani
ACK. I will move these image tables to respective MHI controller drivers
by implementing a registration mechanism.
>
^ permalink raw reply
* Re: [PATCH 4/6] hugetlb: drop vma_hugecache_offset() in favor of linear_page_index()
From: Oscar Salvador @ 2026-04-14 9:53 UTC (permalink / raw)
To: Jane Chu
Cc: akpm, david, muchun.song, lorenzo.stoakes, Liam.Howlett, vbabka,
rppt, surenb, mhocko, corbet, skhan, hughd, baolin.wang, peterx,
linux-mm, linux-doc, linux-kernel
In-Reply-To: <20260409234158.837786-5-jane.chu@oracle.com>
On Thu, Apr 09, 2026 at 05:41:55PM -0600, Jane Chu wrote:
> vma_hugecache_offset() converts a hugetlb VMA address into a mapping
> offset in hugepage units. While the helper is small, its name is not very
> clear, and the resulting code is harder to follow than using the common MM
> helper directly.
>
> Use linear_page_index() instead, with an explicit conversion from
> PAGE_SIZE units to hugepage units at each call site, and remove
> vma_hugecache_offset().
>
> This makes the code a bit more direct and avoids a hugetlb-specific helper
> whose behavior is already expressible with existing MM primitives.
>
> Signed-off-by: Jane Chu <jane.chu@oracle.com>
Looks good to me, the only thing is the conversion to hugepage units
which may not be very clear to the casual reader, but you already
mentioned that you will add a helper, so all good.
--
Oscar Salvador
SUSE Labs
^ permalink raw reply
* Re: [PATCH v4 8/9] bus: mhi: Expose DDR training data via controller sysfs
From: Kishore Batta @ 2026-04-14 9:56 UTC (permalink / raw)
To: Manivannan Sadhasivam
Cc: Jonathan Corbet, Shuah Khan, Jeff Hugo, Carl Vanderlip,
Oded Gabbay, andersson, linux-doc, linux-kernel, linux-arm-msm,
dri-devel, mhi
In-Reply-To: <tbwahssgudfeacfj3wcg32yw5fkqorswees4gv4geypjmmdcyu@tv6qkuhyw23l>
On 4/13/2026 5:28 PM, Manivannan Sadhasivam wrote:
> On Thu, Mar 19, 2026 at 12:01:48PM +0530, Kishore Batta wrote:
>> DDR training data captured during Sahara command mode needs to be
>> accessible to userspace so it can be persisted and reused on subsequent
>> boots. Currently, the training data is stored internally in the driver
>> but has no external visibility once the sahara channel is torn down.
>>
> Maybe share some steps on how the userspace is expected to use this calibration
> data.
Sure. I will update the commit message with the required details in the
next version.
>> Expose the captured DDR training data via a read-only binary sysfs
>> attribute on the MHI controller device. The sysfs file is created under
>> the controller node, allowing userspace to read the training data even
>> after the sahara channel device has been removed.
>>
> So once the calibration data is read, how it can be used further?
Userspace will store the calibration data in the
"mdmddr_0x<serial_no>.mbn" format. On the next boot, the Sahara driver loads
the real DDR calibration data and the training data will be restored. No
repeated DDR training is performed at the target end.
>
>> The sysfs attribute reads directly from controller-scoped storage and
>> relies on device managed resources for cleanup when the controller
>> device is destroyed. No explicit sysfs removal is required, avoiding
>> lifetime dependencies on the Sahara channel device.
>>
> Missing ABI documentation.
>
> - Mani
Currently I have added it in a separate patch (9/9). I will squash it with
this patch in the next version.
>> Signed-off-by: Kishore Batta <kishore.batta@oss.qualcomm.com>
>> ---
>> drivers/bus/mhi/sahara/sahara.c | 69 +++++++++++++++++++++++++++++++++++++++++
>> 1 file changed, 69 insertions(+)
>>
>> diff --git a/drivers/bus/mhi/sahara/sahara.c b/drivers/bus/mhi/sahara/sahara.c
>> index c88f1220199ac4373d3552167870c19a0d5f23b9..b7208738df10fc3c3895acd46873412818dc1730 100644
>> --- a/drivers/bus/mhi/sahara/sahara.c
>> +++ b/drivers/bus/mhi/sahara/sahara.c
>> @@ -415,6 +415,73 @@ static struct sahara_ctrl_trng_data *sahara_ctrl_trng_get(struct device *dev)
>> return ct;
>> }
>>
>> +static ssize_t ddr_training_data_read(struct file *filp, struct kobject *kobj,
>> + const struct bin_attribute *attr, char *buf,
>> + loff_t offset, size_t count)
>> +{
>> + struct device *dev = kobj_to_dev(kobj);
>> + struct sahara_ctrl_trng_data *ct;
>> + size_t available;
>> +
>> + ct = sahara_ctrl_trng_get(dev);
>> + if (!ct)
>> + return -ENODEV;
>> +
>> + mutex_lock(&ct->lock);
>> +
>> + /* No data yet or offset past end */
>> + if (!ct->data || offset >= ct->size) {
>> + mutex_unlock(&ct->lock);
>> + return 0;
>> + }
>> +
>> + available = ct->size - offset;
>> + count = min(count, available);
>> + memcpy(buf, (u8 *)ct->data + offset, count);
>> +
>> + mutex_unlock(&ct->lock);
>> +
>> + return count;
>> +}
>> +
>> +static const struct bin_attribute ddr_training_data_attr = {
>> + .attr = {
>> + .name = "ddr_training_data",
>> + .mode = 0444,
>> + },
>> + .read = ddr_training_data_read,
>> +};
>> +
>> +static void sahara_sysfs_devres_release(struct device *dev, void *res)
>> +{
>> + device_remove_bin_file(dev, &ddr_training_data_attr);
>> +}
>> +
>> +static void sahara_sysfs_create(struct mhi_device *mhi_dev)
>> +{
>> + struct device *dev = &mhi_dev->mhi_cntrl->mhi_dev->dev;
>> + void *cookie;
>> + int ret;
>> +
>> + if (devres_find(dev, sahara_sysfs_devres_release, NULL, NULL))
>> + return;
>> +
>> + ret = device_create_bin_file(dev, &ddr_training_data_attr);
>> + if (ret) {
>> + dev_warn(&mhi_dev->dev,
>> + "Failed to create DDR training sysfs node (%d)\n", ret);
>> + return;
>> + }
>> +
>> + cookie = devres_alloc(sahara_sysfs_devres_release, 1, GFP_KERNEL);
>> + if (!cookie) {
>> + device_remove_bin_file(dev, &ddr_training_data_attr);
>> + return;
>> + }
>> +
>> + devres_add(dev, cookie);
>> +}
>> +
>> static int sahara_find_image(struct sahara_context *context, u32 image_id)
>> {
>> char *fw_path;
>> @@ -1272,6 +1339,8 @@ static int sahara_mhi_probe(struct mhi_device *mhi_dev, const struct mhi_device_
>> return ret;
>> }
>>
>> + sahara_sysfs_create(mhi_dev);
>> +
>> return 0;
>> }
>>
>>
>> --
>> 2.34.1
>>
^ permalink raw reply
* Re: [PATCH v4 9/9] Documentation: ABI: Add sysfs ABI documentation for DDR training data
From: Kishore Batta @ 2026-04-14 9:57 UTC (permalink / raw)
To: Manivannan Sadhasivam
Cc: Jonathan Corbet, Shuah Khan, Jeff Hugo, Carl Vanderlip,
Oded Gabbay, andersson, linux-doc, linux-kernel, linux-arm-msm,
dri-devel, mhi
In-Reply-To: <yttrssaw4k2vx7r6l4vsb535qcrr4phsgj6qlnu2r764inai7o@d4qgr7uu5t2s>
On 4/13/2026 5:29 PM, Manivannan Sadhasivam wrote:
> On Thu, Mar 19, 2026 at 12:01:49PM +0530, Kishore Batta wrote:
>> Add ABI documentation for the DDR training data sysfs attribute exposed by
>> the sahara MHI driver.
>>
>> The documented sysfs node provides read-only access to the DDR training
>> data captured during sahara command mode and exposed via the MHI
>> controller device. This allows userspace to read the training data and
>> manage it as needed outside the kernel.
>>
>> Signed-off-by: Kishore Batta <kishore.batta@oss.qualcomm.com>
> Ah, this should be squashed with previous patch.
>
> - Mani
Sure. I will do it.
>> ---
>> .../ABI/testing/sysfs-bus-mhi-ddr_training_data | 19 +++++++++++++++++++
>> 1 file changed, 19 insertions(+)
>>
>> diff --git a/Documentation/ABI/testing/sysfs-bus-mhi-ddr_training_data b/Documentation/ABI/testing/sysfs-bus-mhi-ddr_training_data
>> new file mode 100644
>> index 0000000000000000000000000000000000000000..810b487b5a5fdba133d81255f9879844e3938a10
>> --- /dev/null
>> +++ b/Documentation/ABI/testing/sysfs-bus-mhi-ddr_training_data
>> @@ -0,0 +1,19 @@
>> +What: /sys/bus/mhi/devices/<mhi-cntrl>/ddr_training_data
>> +
>> +Date: March 2026
>> +
>> +Contact: Kishore Batta <kishore.batta@oss.qualcomm.com>
>> +
>> +Description: Contains the DDR training data for the Qualcomm device
>> + connected. MHI driver populates different controller
>> + nodes for each device. The DDR training data is exposed
>> + to userspace to read and save the training data file to
>> + the filesystem. In the subsequent boot up of the device,
>> + the training data is restored from host to device
>> + optimizing the boot up time of the device.
>> +
>> +Usage: Example for reading DDR training data:
>> + cat /sys/bus/mhi/devices/mhi0/ddr_training_data
>> +
>> +Permissions: The file permissions are set to 0444 allowing read
>> + access.
>>
>> --
>> 2.34.1
>>
^ permalink raw reply
* Re: [PATCH 5/6] hugetlb: make hugetlb_add_to_page_cache() use PAGE_SIZE-based index
From: Oscar Salvador @ 2026-04-14 10:23 UTC (permalink / raw)
To: Jane Chu
Cc: akpm, david, muchun.song, lorenzo.stoakes, Liam.Howlett, vbabka,
rppt, surenb, mhocko, corbet, skhan, hughd, baolin.wang, peterx,
linux-mm, linux-doc, linux-kernel
In-Reply-To: <20260409234158.837786-6-jane.chu@oracle.com>
On Thu, Apr 09, 2026 at 05:41:56PM -0600, Jane Chu wrote:
> hugetlb_add_to_page_cache() currently takes a parameter named 'idx',
> but internally converts it from hugetlb page units into PAGE_SIZE-based
> page-cache index units before calling __filemap_add_folio().
>
> Make hugetlb_add_to_page_cache() take a PAGE_SIZE-based index directly
> and update its callers accordingly. This removes the internal shift,
> keeps the index units consistent with filemap_lock_folio() and
> __filemap_add_folio(), and simplifies the surrounding code.
>
> Signed-off-by: Jane Chu <jane.chu@oracle.com>
Acked-by: Oscar Salvador <osalvador@suse.de>
--
Oscar Salvador
SUSE Labs
^ permalink raw reply
* RE: [PATCH v7 4/6] iio: adc: ad4691: add SPI offload support
From: Sabau, Radu bogdan @ 2026-04-14 10:28 UTC (permalink / raw)
To: David Lechner, Lars-Peter Clausen, Hennerich, Michael,
Jonathan Cameron, Sa, Nuno, Andy Shevchenko, Rob Herring,
Krzysztof Kozlowski, Conor Dooley, Uwe Kleine-König,
Liam Girdwood, Mark Brown, Linus Walleij, Bartosz Golaszewski,
Philipp Zabel, Jonathan Corbet, Shuah Khan
Cc: linux-iio@vger.kernel.org, devicetree@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-pwm@vger.kernel.org,
linux-gpio@vger.kernel.org, linux-doc@vger.kernel.org
In-Reply-To: <1170956f-da05-4280-990f-64306ca905c2@baylibre.com>
> -----Original Message-----
> From: David Lechner <dlechner@baylibre.com>
> Sent: Saturday, April 11, 2026 12:01 AM
...
> >
> > static const struct ad4691_chip_info ad4694_chip_info = {
> > .name = "ad4694",
> > .max_rate = 1 * HZ_PER_MHZ,
> > .sw_info = &ad4693_sw_info,
> > + .offload_info = &ad4693_offload_info,
> > +};
> > +
> > +struct ad4691_offload_state {
> > + struct spi_offload *spi;
>
> I would call this "offload" or "instance". "spi" is usually the SPI
> device handle.
I thought about this too, will implement it as offload then.
>
> > + struct spi_offload_trigger *trigger;
> > + u64 trigger_hz;
> > + u8 tx_cmd[17][2];
> > + u8 tx_reset[4];
> > };
> >
>
> ...
>
> > +
> > +static int ad4691_cnv_burst_offload_buffer_predisable(struct iio_dev
> *indio_dev)
> > +{
> > + struct ad4691_state *st = iio_priv(indio_dev);
> > + struct ad4691_offload_state *offload = st->offload;
> > + int ret;
> > +
> > + spi_offload_trigger_disable(offload->spi, offload->trigger);
> > +
> > + ret = ad4691_sampling_enable(st, false);
> > + if (ret)
> > + return ret;
> > +
> > + ret = regmap_write(st->regmap, AD4691_STD_SEQ_CONFIG,
> > + AD4691_SEQ_ALL_CHANNELS_OFF);
>
> Why this extra step? We don't have it when unwinding in the
> error path of the postenable function.
This is a mistake on my end. Perhaps this could be removed since
the sequencer is overwritten upon new buffers/raw readings anyway.
>
> > + if (ret)
> > + return ret;
> > +
> > + spi_unoptimize_message(&st->scan_msg);
> > +
> > + return ad4691_exit_conversion_mode(st);
> > +}
> > +
> > +static const struct iio_buffer_setup_ops
> ad4691_cnv_burst_offload_buffer_setup_ops = {
> > + .postenable = &ad4691_cnv_burst_offload_buffer_postenable,
> > + .predisable = &ad4691_cnv_burst_offload_buffer_predisable,
> > +};
> > +
> > static ssize_t sampling_frequency_show(struct device *dev,
> > struct device_attribute *attr,
> > char *buf)
^ permalink raw reply
* Re: [PATCH v3 2/3] mm/memory-failure: add CONFIG_BOOTPARAM_MEMORY_FAILURE_PANIC option
From: Breno Leitao @ 2026-04-14 10:29 UTC (permalink / raw)
To: Miaohe Lin, Naoya Horiguchi, Andrew Morton, Jonathan Corbet,
Shuah Khan, David Hildenbrand, Lorenzo Stoakes, Liam R. Howlett,
Vlastimil Babka, Mike Rapoport, Suren Baghdasaryan, Michal Hocko
Cc: linux-mm, linux-kernel, linux-doc, kernel-team, gustavold
In-Reply-To: <20260413-ecc_panic-v3-2-1dcbb2f12bc4@debian.org>
On Mon, Apr 13, 2026 at 06:26:34AM -0700, Breno Leitao wrote:
> +config BOOTPARAM_MEMORY_FAILURE_PANIC
> + bool "Panic on unrecoverable memory failure"
> + depends on MEMORY_FAILURE
> + help
> + Say Y here to panic when an unrecoverable memory failure is
> + detected. This covers kernel pages, high-order kernel pages,
> + and unknown page types that cannot be recovered. Can be disabled
> + at runtime via the panic_on_unrecoverable_memory_failure sysctl.
After considering Linus's recent feedback on kernel configuration
complexity, I'm reconsidering this approach. He recently emphasized:
"The kernel config phase is probably one of the biggest pain points for
random new people trying to build their own kernels, and we DO NOT ASK
PEOPLE STUIPID THINGS." --Linus
https://lore.kernel.org/all/CAHk-=whigg3hvOy7c1j1MXFy6o6CHp0g4Tc3Y-MAk+XDssHU0A@mail.gmail.com/
I will respin a new version, dropping this patch from the series to keep Linus’
blood pressure in check.
--breno
^ permalink raw reply
* RE: [PATCH v7 5/6] iio: adc: ad4691: add oversampling support
From: Sabau, Radu bogdan @ 2026-04-14 10:32 UTC (permalink / raw)
To: David Lechner, Lars-Peter Clausen, Hennerich, Michael,
Jonathan Cameron, Sa, Nuno, Andy Shevchenko, Rob Herring,
Krzysztof Kozlowski, Conor Dooley, Uwe Kleine-König,
Liam Girdwood, Mark Brown, Linus Walleij, Bartosz Golaszewski,
Philipp Zabel, Jonathan Corbet, Shuah Khan
Cc: linux-iio@vger.kernel.org, devicetree@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-pwm@vger.kernel.org,
linux-gpio@vger.kernel.org, linux-doc@vger.kernel.org
In-Reply-To: <742b1821-9103-414e-a860-c2e8d5406e35@baylibre.com>
> -----Original Message-----
> From: David Lechner <dlechner@baylibre.com>
> Sent: Saturday, April 11, 2026 12:15 AM
...
> >
> > osc_idx = FIELD_GET(AD4691_OSC_FREQ_MASK, reg_val);
> > - /* Wait 2 oscillator periods for the conversion to complete. */
> > - period_us = DIV_ROUND_UP(2UL * USEC_PER_SEC,
> ad4691_osc_freqs_Hz[osc_idx]);
> > + /* Wait osr oscillator periods for all accumulator samples to complete.
> */
>
> Why did we need to wait 2 before and only 1 now when OSR == 1?
>
You are right, that extra period should exist when reading raw, independent
of the OSR. If OSR = 4, then we should wait 5 periods just to make sure we are
reading a correct result, since single_shot_read doesn’t use any interrupts as
the buffers do.
^ permalink raw reply
* [PATCH bpf] bpf,tcp: avoid infinite recursion in BPF_SOCK_OPS_HDR_OPT_LEN_CB
From: Jiayuan Chen @ 2026-04-14 10:57 UTC (permalink / raw)
To: bpf
Cc: Jiayuan Chen, Quan Sun, Yinhao Hu, Kaiyan Mei, Dongliang Mu,
Eric Dumazet, Neal Cardwell, Kuniyuki Iwashima, David S. Miller,
Jakub Kicinski, Paolo Abeni, Simon Horman, Jonathan Corbet,
Shuah Khan, Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko,
Martin KaFai Lau, Eduard Zingerman, Song Liu, Yonghong Song,
John Fastabend, KP Singh, Stanislav Fomichev, Hao Luo, Jiri Olsa,
David Ahern, netdev, linux-doc, linux-kernel
A BPF_PROG_TYPE_SOCK_OPS program can set BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG
to inject custom TCP header options. When the kernel builds a TCP packet,
it calls tcp_established_options() to calculate the header size, which
invokes bpf_skops_hdr_opt_len() to trigger the BPF_SOCK_OPS_HDR_OPT_LEN_CB
callback.
If the BPF program calls bpf_setsockopt(TCP_NODELAY) inside this callback,
__tcp_sock_set_nodelay() will call tcp_push_pending_frames(), which calls
tcp_current_mss(), which calls tcp_established_options() again,
re-triggering the same BPF callback. This creates an infinite recursion
that exhausts the kernel stack and causes a panic.
BPF_SOCK_OPS_HDR_OPT_LEN_CB
-> bpf_setsockopt(TCP_NODELAY)
-> tcp_push_pending_frames()
-> tcp_current_mss()
-> tcp_established_options()
-> bpf_skops_hdr_opt_len()
/* infinite recursion */
-> BPF_SOCK_OPS_HDR_OPT_LEN_CB
A similar reentrancy issue exists for TCP congestion control, which is
guarded by tp->bpf_chg_cc_inprogress. Adopt the same approach: introduce
tp->bpf_hdr_opt_len_cb_inprogress, set it before invoking the callback in
bpf_skops_hdr_opt_len(), and check it in sol_tcp_sockopt() to reject
bpf_setsockopt(TCP_NODELAY) calls that would trigger
tcp_push_pending_frames() and cause the recursion.
Reported-by: Quan Sun <2022090917019@std.uestc.edu.cn>
Reported-by: Yinhao Hu <dddddd@hust.edu.cn>
Reported-by: Kaiyan Mei <M202472210@hust.edu.cn>
Reported-by: Dongliang Mu <dzm91@hust.edu.cn>
Closes: https://lore.kernel.org/bpf/d1d523c9-6901-4454-a183-94462b8f3e4e@std.uestc.edu.cn/
Fixes: 0813a841566f ("bpf: tcp: Allow bpf prog to write and parse TCP header option")
Signed-off-by: Jiayuan Chen <jiayuan.chen@linux.dev>
---
Documentation/networking/net_cachelines/tcp_sock.rst | 1 +
include/linux/tcp.h | 11 ++++++++++-
net/core/filter.c | 4 ++++
net/ipv4/tcp_minisocks.c | 1 +
net/ipv4/tcp_output.c | 3 +++
5 files changed, 19 insertions(+), 1 deletion(-)
diff --git a/Documentation/networking/net_cachelines/tcp_sock.rst b/Documentation/networking/net_cachelines/tcp_sock.rst
index 563daea10d6c..07d3226d90cc 100644
--- a/Documentation/networking/net_cachelines/tcp_sock.rst
+++ b/Documentation/networking/net_cachelines/tcp_sock.rst
@@ -152,6 +152,7 @@ unsigned_int keepalive_intvl
int linger2
u8 bpf_sock_ops_cb_flags
u8:1 bpf_chg_cc_inprogress
+u8:1 bpf_hdr_opt_len_cb_inprogress
u16 timeout_rehash
u32 rcv_ooopack
u32 rcv_rtt_last_tsecr
diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index f72eef31fa23..2bfb73cf922e 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -475,12 +475,21 @@ struct tcp_sock {
u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs
* values defined in uapi/linux/tcp.h
*/
- u8 bpf_chg_cc_inprogress:1; /* In the middle of
+ u8 bpf_chg_cc_inprogress:1, /* In the middle of
* bpf_setsockopt(TCP_CONGESTION),
* it is to avoid the bpf_tcp_cc->init()
* to recur itself by calling
* bpf_setsockopt(TCP_CONGESTION, "itself").
*/
+ bpf_hdr_opt_len_cb_inprogress:1; /* It is set before invoking the
+ * callback so that a nested
+ * bpf_setsockopt(TCP_NODELAY) or
+ * bpf_setsockopt(TCP_CORK) cannot
+ * trigger tcp_push_pending_frames(),
+ * which would call tcp_current_mss()
+ * -> bpf_skops_hdr_opt_len(), causing
+ * infinite recursion.
+ */
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG)
#else
#define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0
diff --git a/net/core/filter.c b/net/core/filter.c
index 78b548158fb0..518699429a7a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5483,6 +5483,10 @@ static int sol_tcp_sockopt(struct sock *sk, int optname,
if (sk->sk_protocol != IPPROTO_TCP)
return -EINVAL;
+ if ((optname == TCP_NODELAY || optname == TCP_CORK) &&
+ tcp_sk(sk)->bpf_hdr_opt_len_cb_inprogress)
+ return -EBUSY;
+
switch (optname) {
case TCP_NODELAY:
case TCP_MAXSEG:
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index dafb63b923d0..fb06c464ac16 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -663,6 +663,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
RCU_INIT_POINTER(newtp->fastopen_rsk, NULL);
newtp->bpf_chg_cc_inprogress = 0;
+ newtp->bpf_hdr_opt_len_cb_inprogress = 0;
tcp_bpf_clone(sk, newsk);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS);
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 326b58ff1118..c9654e690e1a 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -475,6 +475,7 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
unsigned int *remaining)
{
struct bpf_sock_ops_kern sock_ops;
+ struct tcp_sock *tp = tcp_sk(sk);
int err;
if (likely(!BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk),
@@ -519,7 +520,9 @@ static void bpf_skops_hdr_opt_len(struct sock *sk, struct sk_buff *skb,
if (skb)
bpf_skops_init_skb(&sock_ops, skb, 0);
+ tp->bpf_hdr_opt_len_cb_inprogress = 1;
err = BPF_CGROUP_RUN_PROG_SOCK_OPS_SK(&sock_ops, sk);
+ tp->bpf_hdr_opt_len_cb_inprogress = 0;
if (err || sock_ops.remaining_opt_len == *remaining)
return;
--
2.43.0
^ permalink raw reply related
* Re: maintainer profiles
From: Krzysztof Kozlowski @ 2026-04-14 11:18 UTC (permalink / raw)
To: Randy Dunlap, Linux Documentation, Linux Kernel Mailing List
Cc: Jonathan Corbet, Linux Kernel Workflows
In-Reply-To: <b7775383-da94-4098-8af9-2f672c4f1a71@infradead.org>
On 10/04/2026 02:18, Randy Dunlap wrote:
> Hi,
>
> Is there supposed to be a difference (or distinction) in the contents of
>
> Documentation/process/maintainer-handbooks.rst
> and
> Documentation/maintainer/maintainer-entry-profile.rst
> ?
>
> Can they be combined into one location?
Yes, please! That should also include the location of the actual profiles.
I mostly look at them directly in the sources, not in the web docs, so it is
confusing and annoying to find them scattered around.
Best regards,
Krzysztof
^ permalink raw reply
* Re: maintainer profiles
From: Mauro Carvalho Chehab @ 2026-04-14 12:37 UTC (permalink / raw)
To: Dan Williams
Cc: Jonathan Corbet, Randy Dunlap, Linux Documentation,
Linux Kernel Mailing List, Linux Kernel Workflows
In-Reply-To: <69dd6299440be_147c801005b@djbw-dev.notmuch>
On Mon, 13 Apr 2026 14:39:37 -0700
Dan Williams <djbw@kernel.org> wrote:
> Jonathan Corbet wrote:
> > Randy Dunlap <rdunlap@infradead.org> writes:
> >
> > > Hi,
> > >
> > > Is there supposed to be a difference (or distinction) in the contents of
> > >
> > > Documentation/process/maintainer-handbooks.rst
> > > and
> > > Documentation/maintainer/maintainer-entry-profile.rst
> > > ?
> > >
> > > Can they be combined into one location?
> >
> > Late to the party, sorry ... the original idea, I believe, was that
> > maintainer-handbooks.rst would be for developers looking for a guidebook
> > for a specific subsystem, while maintainer-entry-profile.rst was about
> > how maintainers themselves should write their subsystem guide.
> > Doubtless things have drifted since then... But the intended audiences
> > were different, so it might be good to think about bringing them back
> > into focus.
>
> Right, I think something (roughly / hand-wavy) like the below is the
> intent. However, as I write that I notice that the combined list is a
> bit of a mess. I also notice that there are more "P:" entries in
> MAINTAINERS than there are entries in this maintainer-handbooks.rst
> list.
>
> So this probably wants to be a script that can build Documentation links
> from MAINTAINERS, or otherwise provide a script for developers to query
> a kernel tree for additional submission guides. It is probably not as
> important for the built docs to link all guides as it is for developers
> (or their agents) to live query a tree they are developing against.
There is already a Python script which parses MAINTAINERS file
(Documentation/sphinx/maintainers_include.py).
Currently, it expects a Sphinx meta-tag inside
Documentation/process/maintainers.rst:
.. maintainers-include::
I guess it shouldn't be hard to add support there for a
.. maintainers-profile::
Making it create a set of cross-references is probably easy. Not
sure how easy/hard it would be to create a TOC tree, though.
> Note the problem goes both ways, there are P: entries not in the
> combined handbook list, like the Security subsystem, and there are
> handbook entries without a P:, like the Tip tree.
Assuming we add such an extension, we'll need to sync the P: entries.
I'll take a look at extending the Sphinx maintainers
extension.
>
> diff --git a/Documentation/maintainer/maintainer-entry-profile.rst b/Documentation/maintainer/maintainer-entry-profile.rst
> index 6020d188e13d..58e2af333692 100644
> --- a/Documentation/maintainer/maintainer-entry-profile.rst
> +++ b/Documentation/maintainer/maintainer-entry-profile.rst
> @@ -92,24 +92,8 @@ full series, or privately send a reminder email. This section might also
> list how review works for this code area and methods to get feedback
> that are not directly from the maintainer.
>
> -Existing profiles
> ------------------
> -
> -For now, existing maintainer profiles are listed here; we will likely want
> -to do something different in the near future.
> -
> -.. toctree::
> - :maxdepth: 1
> -
> - ../doc-guide/maintainer-profile
> - ../nvdimm/maintainer-entry-profile
> - ../arch/riscv/patch-acceptance
> - ../process/maintainer-soc
> - ../process/maintainer-soc-clean-dts
> - ../driver-api/media/maintainer-entry-profile
> - ../process/maintainer-netdev
> - ../driver-api/vfio-pci-device-specific-driver-acceptance
> - ../nvme/feature-and-quirk-policy
> - ../filesystems/nfs/nfsd-maintainer-entry-profile
> - ../filesystems/xfs/xfs-maintainer-entry-profile
> - ../mm/damon/maintainer-profile
> +Maintainer Handbooks
> +--------------------
> +
> +For examples of other subsystem handbooks see
> +Documentation/process/maintainer-handbooks.rst.
> diff --git a/Documentation/process/maintainer-handbooks.rst b/Documentation/process/maintainer-handbooks.rst
> index 976391cec528..bc9299a04b1f 100644
> --- a/Documentation/process/maintainer-handbooks.rst
> +++ b/Documentation/process/maintainer-handbooks.rst
> @@ -9,14 +9,33 @@ The purpose of this document is to provide subsystem specific information
> which is supplementary to the general development process handbook
> :ref:`Documentation/process <development_process_main>`.
>
> +For developers, see below for all the known subsystem specific guides.
> +If the subsystem you are contributing to does not have a guide listed
> +here, it is fair to seek clarification of questions raised in
> +Documentation/maintainer/maintainer-entry-profile.rst.
> +
> +For maintainers, consider documenting additional requirements and
> +expectations if submissions routinely overlook specific submission
> +criteria. See Documentation/maintainer/maintainer-entry-profile.rst.
> +
> Contents:
>
> .. toctree::
> :numbered:
> :maxdepth: 2
>
> + maintainer-kvm-x86
> maintainer-netdev
> maintainer-soc
> maintainer-soc-clean-dts
> + maintainer-soc-clean-dts
> maintainer-tip
> - maintainer-kvm-x86
> + ../arch/riscv/patch-acceptance
> + ../doc-guide/maintainer-profile
> + ../driver-api/media/maintainer-entry-profile
> + ../driver-api/vfio-pci-device-specific-driver-acceptance
> + ../filesystems/nfs/nfsd-maintainer-entry-profile
> + ../filesystems/xfs/xfs-maintainer-entry-profile
> + ../mm/damon/maintainer-profile
> + ../nvdimm/maintainer-entry-profile
> + ../nvme/feature-and-quirk-policy
Looks good to my eyes.
Reviewed-by: Mauro Carvalho Chehab <mchehab+huawei@kernel.org>
--
Thanks,
Mauro
^ permalink raw reply
* Re: [PATCH] docs: fix typos in kernel documentation
From: Jonathan Corbet @ 2026-04-14 12:54 UTC (permalink / raw)
To: fru1tworld; +Cc: skhan, linux-doc, fru1tworld
In-Reply-To: <20260414084553.22762-1-fruitworld.planet@gmail.com>
Thank you for working to improve our documentation.
fru1tworld <fruitworld.planet@gmail.com> writes:
> reinitalizes => reinitializes
> unpriviledged => unprivileged
> the the => the (duplicated word)
> sub-struture => sub-structure
These changes generally look OK, but...
> Signed-off-by: fru1tworld <fruitworld.planet@gmail.com>
We need a proper signoff with your real name, please.
> ---
> Documentation/block/data-integrity.rst | 2 +-
> Documentation/core-api/list.rst | 2 +-
> Documentation/core-api/real-time/differences.rst | 2 +-
This one has already been fixed; it's always best to prepare your
patches against docs-next or linux-next.
> Documentation/gpu/drm-uapi.rst | 2 +-
> 4 files changed, 4 insertions(+), 4 deletions(-)
Thanks,
jon
^ permalink raw reply
* RE: [PATCH v7 6/6] docs: iio: adc: ad4691: add driver documentation
From: Sabau, Radu bogdan @ 2026-04-14 12:54 UTC (permalink / raw)
To: David Lechner, Lars-Peter Clausen, Hennerich, Michael,
Jonathan Cameron, Sa, Nuno, Andy Shevchenko, Rob Herring,
Krzysztof Kozlowski, Conor Dooley, Uwe Kleine-König,
Liam Girdwood, Mark Brown, Linus Walleij, Bartosz Golaszewski,
Philipp Zabel, Jonathan Corbet, Shuah Khan
Cc: linux-iio@vger.kernel.org, devicetree@vger.kernel.org,
linux-kernel@vger.kernel.org, linux-pwm@vger.kernel.org,
linux-gpio@vger.kernel.org, linux-doc@vger.kernel.org
In-Reply-To: <9c36ee85-12da-41e8-b9ab-e32b7ec29e75@baylibre.com>
> -----Original Message-----
> From: David Lechner <dlechner@baylibre.com>
> Sent: Saturday, April 11, 2026 12:39 AM
...
> > +Buffer data format
> > +==================
> > +
> > +The IIO buffer data format (``in_voltageN_type``) is the same across all
> > +paths: 16-bit unsigned big-endian samples with no shift.
> > +
> > ++-------------------------+-------------+----------+-------+
> > +| Path | storagebits | realbits | shift |
> > ++=========================+=============+==========+=======+
> > +| Triggered buffer | 16 | 16 | 0 |
> > ++-------------------------+-------------+----------+-------+
> > +| CNV Burst offload (DMA) | 16 | 16 | 0 |
> > ++-------------------------+-------------+----------+-------+
> > +| Manual offload (DMA) | 16 | 16 | 0 |
> > ++-------------------------+-------------+----------+-------+
>
> Not sure this table is helpful since all values are the same everywhere.
>
> Also, doesn't SPI offload have storagebits == 32?
I tried using 16 storage bits for offload too, and so used the same channels
macro. For Manual, it's received in the next transfer, and for CNV only the
receive transfers are RX streamed, so 16 storage bits suffice for both.
^ permalink raw reply
* Re: [PATCH V10 00/10] famfs: port into fuse
From: Miklos Szeredi @ 2026-04-14 13:19 UTC (permalink / raw)
To: Joanne Koong
Cc: John Groves, Bernd Schubert, John Groves, Dan Williams,
Bernd Schubert, Alison Schofield, John Groves, Jonathan Corbet,
Shuah Khan, Vishal Verma, Dave Jiang, Matthew Wilcox, Jan Kara,
Alexander Viro, David Hildenbrand, Christian Brauner,
Darrick J . Wong, Randy Dunlap, Jeff Layton, Amir Goldstein,
Jonathan Cameron, Stefan Hajnoczi, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org, djbw
In-Reply-To: <CAJnrk1a06zkUmXW5EFiUmgAoFauwtzsYvnotaPH0ifVtyh7iDQ@mail.gmail.com>
On Fri, 10 Apr 2026 at 21:44, Joanne Koong <joannelkoong@gmail.com> wrote:
> Overall, my intention with bringing this up is just to make sure we're
> at least aware of this alternative before anything is merged and
> permanent. If Miklos and you think we should land this series, then
> I'm on board with that.
TBH, I'd prefer not to add the famfs specific mapping interface if not
absolutely necessary. This was the main sticking point originally,
but there seemed to be no better alternative.
However with the bpf approach this would be gone, which is great.
So let us please at least have a try at this. I'm not into bpf yet,
but willing to learn.
Thanks,
Miklos
^ permalink raw reply
* Re: [PATCH V10 00/10] famfs: port into fuse
From: John Groves @ 2026-04-14 13:41 UTC (permalink / raw)
To: Miklos Szeredi
Cc: Joanne Koong, Bernd Schubert, John Groves, Dan Williams,
Bernd Schubert, Alison Schofield, John Groves, Jonathan Corbet,
Shuah Khan, Vishal Verma, Dave Jiang, Matthew Wilcox, Jan Kara,
Alexander Viro, David Hildenbrand, Christian Brauner,
Darrick J . Wong, Randy Dunlap, Jeff Layton, Amir Goldstein,
Jonathan Cameron, Stefan Hajnoczi, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org, djbw
In-Reply-To: <CAJfpegvVTcV89=q3L326aGQjhduBcv7PVg5QKftGLjNZmCLmaw@mail.gmail.com>
On 26/04/14 03:19PM, Miklos Szeredi wrote:
> On Fri, 10 Apr 2026 at 21:44, Joanne Koong <joannelkoong@gmail.com> wrote:
>
> > Overall, my intention with bringing this up is just to make sure we're
> > at least aware of this alternative before anything is merged and
> > permanent. If Miklos and you think we should land this series, then
> > I'm on board with that.
>
> TBH, I'd prefer not to add the famfs specific mapping interface if not
> absolutely necessary. This was the main sticking point originally,
> but there seemed to be no better alternative.
>
> However with the bpf approach this would be gone, which is great.
>
> So let us please at least have a try at this. I'm not into bpf yet,
> but willing to learn.
>
> Thanks,
> Miklos
Thanks for responding...
My short response: Noooooooooo!!!!!!
I very strongly object to making this a prerequisite to merging. This
is an untested idea that will certainly delay us by at least a couple
of merge windows when products are shipping now, and the existing approach
has been in circulation for a long time. It is TOO LATE!!!!!!
Famfs is not a science project, it's enablement for actual products and
early versions are available now!!!
That doesn't mean we couldn't convert later IF THERE ARE NO HIDDEN PROBLEMS.
What are the risks of converting to BPF?
- I don't know how to do it - so it'll be slow (kinda like my fuse learning
curve, which cost about a year because this is not that similar to anything
else that was already in fuse).
- Those of us who are involved don't fully understand either the security
or performance implications of this.
- Famfs is enabling access to memory and mapping fault handling must be
at "memory speed". We know that BPF walks some data structures when a
program executes. That exposes us to additional serialized L3 cache
misses each time we service a mapping fault (any TLB & page table miss).
This should be studied side-by-side with the existing approach under
multiple loads before being adopted for production.
- This has never been done in production, and we're throwing it in the way
of a project that has been soaking for years and needs to support early
shipments of products.
If this is the only path, I'd like to revive famfs as a standalone file
system. I'm still maintaining that and it's still in use.
Please reconsider, Miklos. To use an American football metaphor, this moves
the goal posts by a mile, and that's not reasonable!!!
Thanks,
John
^ permalink raw reply
* Re: [PATCH V10 00/10] famfs: port into fuse
From: Miklos Szeredi @ 2026-04-14 14:18 UTC (permalink / raw)
To: John Groves
Cc: Joanne Koong, Bernd Schubert, John Groves, Dan Williams,
Bernd Schubert, Alison Schofield, John Groves, Jonathan Corbet,
Shuah Khan, Vishal Verma, Dave Jiang, Matthew Wilcox, Jan Kara,
Alexander Viro, David Hildenbrand, Christian Brauner,
Darrick J . Wong, Randy Dunlap, Jeff Layton, Amir Goldstein,
Jonathan Cameron, Stefan Hajnoczi, Josef Bacik, Bagas Sanjaya,
Chen Linxuan, James Morse, Fuad Tabba, Sean Christopherson,
Shivank Garg, Ackerley Tng, Gregory Price, Aravind Ramesh,
Ajay Joshi, venkataravis@micron.com, linux-doc@vger.kernel.org,
linux-kernel@vger.kernel.org, nvdimm@lists.linux.dev,
linux-cxl@vger.kernel.org, linux-fsdevel@vger.kernel.org, djbw
In-Reply-To: <ad4_jFsR951c2Mtn@groves.net>
On Tue, 14 Apr 2026 at 15:41, John Groves <John@groves.net> wrote:
> My short response: Noooooooooo!!!!!!
:) Seems like this is a highly emotional topic... I suggest that we
go ahead with bpf experiments, then discuss results and path forward
at LSM.
Thanks,
Miklos
^ permalink raw reply
* [RFC, PATCH 00/12] userfaultfd: working set tracking for VM guest memory
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
To: Andrew Morton
Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
Kiryl Shutsemau (Meta)
This series adds userfaultfd support for tracking the working set of
VM guest memory, enabling VMMs to identify cold pages and evict them
to tiered or remote storage.
== Problem ==
VMMs managing guest memory need to:
1. Track which pages are actively used (working set detection)
2. Safely evict cold pages to slower storage
3. Fetch pages back on demand when accessed again
For shmem-backed guest memory, working set tracking partially works
today: MADV_DONTNEED zaps PTEs while pages stay in page cache, and
re-access auto-resolves from cache. But safe eviction still requires
synchronous fault interception to prevent data loss races.
For anonymous guest memory (needed for KSM cross-VM deduplication),
there is no mechanism at all — clearing a PTE loses the page.
== Solution ==
The series introduces a unified userfaultfd interface that works
across both anonymous and shmem-backed memory:
UFFD_FEATURE_MINOR_ANON: extends MODE_MINOR registration to anonymous
private memory. Uses the PROT_NONE hinting mechanism (same as NUMA
balancing) to make pages inaccessible without freeing them.
UFFD_FEATURE_MINOR_ASYNC: auto-resolves minor faults without handler
involvement. The kernel restores PTE permissions immediately and the
faulting thread continues. Works for anonymous, shmem, and hugetlbfs.
UFFDIO_DEACTIVATE: marks pages as deactivated. For anonymous memory,
sets PROT_NONE on PTEs (pages stay resident). For shmem/hugetlbfs,
zaps PTEs (pages stay in page cache).
UFFDIO_SET_MODE: toggles MINOR_ASYNC at runtime, synchronized via
mmap_write_lock. Enables the VMM workflow: async mode for lightweight
detection, sync mode for race-free eviction.
PAGE_IS_UFFD_DEACTIVATED: PAGEMAP_SCAN category flag for efficient
batch detection of cold (still-deactivated) anonymous pages.
== VMM Workflow ==
UFFDIO_DEACTIVATE(all) -- async, no vCPU stalls
sleep(interval)
PAGEMAP_SCAN -- find cold pages
UFFDIO_SET_MODE(sync) -- block faults for eviction
pwrite + MADV_DONTNEED cold pages -- safe, faults block
UFFDIO_SET_MODE(async) -- resume tracking
The same workflow applies to shmem, with a different PAGEMAP_SCAN mask
(!PAGE_IS_PRESENT instead of PAGE_IS_UFFD_DEACTIVATED).
== NUMA Balancing ==
NUMA balancing scanning is skipped on anonymous VM_UFFD_MINOR VMAs to
avoid protnone conflicts. NUMA locality stats are fed from the uffd
fault path via task_numa_fault() so the scheduler retains placement
data. Shmem VMAs are unaffected (UFFDIO_DEACTIVATE zaps PTEs there,
no protnone involved).
== Testing ==
The series includes 6 new selftests covering async/sync modes,
PAGEMAP_SCAN cold detection, GUP through protnone, UFFDIO_SET_MODE
toggling, and cleanup on close. All 73 uffd unit tests pass
(including hugetlb) across defconfig, allnoconfig, allmodconfig,
and randomized configs.
Kiryl Shutsemau (Meta) (12):
userfaultfd: define UAPI constants for anonymous minor faults
userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support
userfaultfd: implement UFFDIO_DEACTIVATE ioctl
userfaultfd: UFFDIO_CONTINUE for anonymous memory
mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs
userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async
mode
sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs
userfaultfd: enable UFFD_FEATURE_MINOR_ANON
mm/pagemap: add PAGE_IS_UFFD_DEACTIVATED to PAGEMAP_SCAN
userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle
selftests/mm: add userfaultfd anonymous minor fault tests
Documentation/userfaultfd: document working set tracking
Documentation/admin-guide/mm/userfaultfd.rst | 141 ++++-
fs/proc/task_mmu.c | 11 +-
fs/userfaultfd.c | 184 +++++-
include/linux/huge_mm.h | 6 +
include/linux/mm.h | 2 +
include/linux/sched/numa_balancing.h | 1 +
include/linux/userfaultfd_k.h | 21 +-
include/trace/events/sched.h | 3 +-
include/uapi/linux/fs.h | 1 +
include/uapi/linux/userfaultfd.h | 40 +-
kernel/sched/fair.c | 13 +
mm/huge_memory.c | 33 +-
mm/hugetlb.c | 3 +-
mm/memory.c | 51 +-
mm/mprotect.c | 9 +-
mm/shmem.c | 3 +-
mm/userfaultfd.c | 164 +++++-
tools/testing/selftests/mm/uffd-unit-tests.c | 458 +++++++++++++++
18 files changed, 1096 insertions(+), 48 deletions(-)
Kiryl Shutsemau (Meta) (12):
userfaultfd: define UAPI constants for anonymous minor faults
userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support
userfaultfd: implement UFFDIO_DEACTIVATE ioctl
userfaultfd: UFFDIO_CONTINUE for anonymous memory
mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs
userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async
mode
sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs
userfaultfd: enable UFFD_FEATURE_MINOR_ANON
mm/pagemap: add PAGE_IS_UFFD_DEACTIVATED to PAGEMAP_SCAN
userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle
selftests/mm: add userfaultfd anonymous minor fault tests
Documentation/userfaultfd: document working set tracking
Documentation/admin-guide/mm/userfaultfd.rst | 141 +++++-
fs/proc/task_mmu.c | 11 +-
fs/userfaultfd.c | 184 +++++++-
include/linux/huge_mm.h | 6 +
include/linux/mm.h | 2 +
include/linux/sched/numa_balancing.h | 1 +
include/linux/userfaultfd_k.h | 21 +-
include/trace/events/sched.h | 3 +-
include/uapi/linux/fs.h | 1 +
include/uapi/linux/userfaultfd.h | 40 +-
kernel/sched/fair.c | 13 +
mm/huge_memory.c | 33 +-
mm/hugetlb.c | 3 +-
mm/memory.c | 51 ++-
mm/mprotect.c | 9 +-
mm/shmem.c | 3 +-
mm/userfaultfd.c | 164 ++++++-
tools/testing/selftests/mm/uffd-unit-tests.c | 458 +++++++++++++++++++
18 files changed, 1096 insertions(+), 48 deletions(-)
--
2.51.2
^ permalink raw reply
* [RFC, PATCH 01/12] userfaultfd: define UAPI constants for anonymous minor faults
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
To: Andrew Morton
Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>
Add UAPI definitions for userfaultfd working set tracking on anonymous
memory:
- UFFD_FEATURE_MINOR_ANON: minor fault support for anonymous memory
- UFFD_FEATURE_MINOR_ASYNC: auto-resolve minor faults without handler
- UFFDIO_DEACTIVATE: mark pages as deactivated (protnone or PTE zap)
Not yet added to UFFD_API_FEATURES or UFFD_API_RANGE_IOCTLS.
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
include/uapi/linux/userfaultfd.h | 17 +++++++++++++++++
1 file changed, 17 insertions(+)
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 2841e4ea8f2c..336d07e1b6de 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -79,6 +79,7 @@
#define _UFFDIO_WRITEPROTECT (0x06)
#define _UFFDIO_CONTINUE (0x07)
#define _UFFDIO_POISON (0x08)
+#define _UFFDIO_DEACTIVATE (0x09)
#define _UFFDIO_API (0x3F)
/* userfaultfd ioctl ids */
@@ -103,6 +104,8 @@
struct uffdio_continue)
#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \
struct uffdio_poison)
+#define UFFDIO_DEACTIVATE _IOR(UFFDIO, _UFFDIO_DEACTIVATE, \
+ struct uffdio_range)
/* read() structure */
struct uffd_msg {
@@ -230,6 +233,18 @@ struct uffdio_api {
*
* UFFD_FEATURE_MOVE indicates that the kernel supports moving an
* existing page contents from userspace.
+ *
+ * UFFD_FEATURE_MINOR_ANON indicates that minor fault interception
+ * is supported for anonymous private memory. Pages are made
+ * inaccessible via UFFDIO_DEACTIVATE (sets PROT_NONE while
+ * preserving the page) and faults are delivered when the pages
+ * are re-accessed.
+ *
+ * UFFD_FEATURE_MINOR_ASYNC indicates asynchronous minor fault
+ * mode. When set, faults on deactivated pages are auto-resolved
+ * by the kernel (PTE permissions restored immediately) without
+ * delivering a message to the userfaultfd handler. Use
+ * PAGEMAP_SCAN to find pages that were not re-accessed.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -248,6 +263,8 @@ struct uffdio_api {
#define UFFD_FEATURE_POISON (1<<14)
#define UFFD_FEATURE_WP_ASYNC (1<<15)
#define UFFD_FEATURE_MOVE (1<<16)
+#define UFFD_FEATURE_MINOR_ANON (1<<17)
+#define UFFD_FEATURE_MINOR_ASYNC (1<<18)
__u64 features;
__u64 ioctls;
--
2.51.2
^ permalink raw reply related
* [RFC, PATCH 02/12] userfaultfd: add UFFD_FEATURE_MINOR_ANON registration support
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
To: Andrew Morton
Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>
Allow UFFDIO_REGISTER_MODE_MINOR on anonymous VMAs when the
UFFD_FEATURE_MINOR_ANON feature is enabled.
Replace the bool wp_async parameter in vma_can_userfault() and
userfaultfd_register_range() with an extensible ctx_flags bitmap.
Add UFFD_CTX_WP_ASYNC and UFFD_CTX_MINOR_ANON flags, and
userfaultfd_ctx_flags() to build the bitmap from ctx->features.
Add userfaultfd_minor_async() helper for checking async minor mode
from the fault path.
Gate UFFD_FEATURE_MINOR_ANON and UFFD_FEATURE_MINOR_ASYNC on
CONFIG_HAVE_ARCH_USERFAULTFD_MINOR. Validate that MINOR_ASYNC
requires at least one minor feature.
Not yet visible to userspace (not in UFFD_API_FEATURES).
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
fs/userfaultfd.c | 49 ++++++++++++++++++++++++++++++-----
include/linux/userfaultfd_k.h | 19 +++++++++++---
mm/userfaultfd.c | 4 +--
3 files changed, 59 insertions(+), 13 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bdc84e5219cd..8d508ad19e89 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -89,6 +89,27 @@ static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
}
+static bool userfaultfd_minor_anon_ctx(struct userfaultfd_ctx *ctx)
+{
+ return ctx && (ctx->features & UFFD_FEATURE_MINOR_ANON);
+}
+
+static bool userfaultfd_minor_async_ctx(struct userfaultfd_ctx *ctx)
+{
+ return ctx && (ctx->features & UFFD_FEATURE_MINOR_ASYNC);
+}
+
+static unsigned int userfaultfd_ctx_flags(struct userfaultfd_ctx *ctx)
+{
+ unsigned int flags = 0;
+
+ if (userfaultfd_wp_async_ctx(ctx))
+ flags |= UFFD_CTX_WP_ASYNC;
+ if (userfaultfd_minor_anon_ctx(ctx))
+ flags |= UFFD_CTX_MINOR_ANON;
+ return flags;
+}
+
/*
* Whether WP_UNPOPULATED is enabled on the uffd context. It is only
* meaningful when userfaultfd_wp()==true on the vma and when it's
@@ -1271,7 +1292,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
bool basic_ioctls;
unsigned long start, end;
struct vma_iterator vmi;
- bool wp_async = userfaultfd_wp_async_ctx(ctx);
+ unsigned int ctx_flags = userfaultfd_ctx_flags(ctx);
user_uffdio_register = (struct uffdio_register __user *) arg;
@@ -1345,7 +1366,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
/* check not compatible vmas */
ret = -EINVAL;
- if (!vma_can_userfault(cur, vm_flags, wp_async))
+ if (!vma_can_userfault(cur, vm_flags, ctx_flags))
goto out_unlock;
/*
@@ -1398,7 +1419,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
VM_WARN_ON_ONCE(!found);
ret = userfaultfd_register_range(ctx, vma, vm_flags, start, end,
- wp_async);
+ ctx_flags);
out_unlock:
mmap_write_unlock(mm);
@@ -1443,7 +1464,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
unsigned long start, end, vma_end;
const void __user *buf = (void __user *)arg;
struct vma_iterator vmi;
- bool wp_async = userfaultfd_wp_async_ctx(ctx);
+ unsigned int ctx_flags = userfaultfd_ctx_flags(ctx);
ret = -EFAULT;
if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
@@ -1505,7 +1526,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
* provides for more strict behavior to notice
* unregistration errors.
*/
- if (!vma_can_userfault(cur, cur->vm_flags, wp_async))
+ if (!vma_can_userfault(cur, cur->vm_flags, ctx_flags))
goto out_unlock;
found = true;
@@ -1526,7 +1547,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
goto skip;
VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx != ctx);
- VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, wp_async));
+ VM_WARN_ON_ONCE(!vma_can_userfault(vma, vma->vm_flags, ctx_flags));
VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
if (vma->vm_start > start)
@@ -1890,6 +1911,11 @@ bool userfaultfd_wp_async(struct vm_area_struct *vma)
return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
}
+bool userfaultfd_minor_async(struct vm_area_struct *vma)
+{
+ return userfaultfd_minor_async_ctx(vma->vm_userfaultfd_ctx.ctx);
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -1993,11 +2019,20 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
if (features & UFFD_FEATURE_WP_ASYNC)
features |= UFFD_FEATURE_WP_UNPOPULATED;
+ ret = -EINVAL;
+ /* MINOR_ASYNC requires at least one minor feature */
+ if ((features & UFFD_FEATURE_MINOR_ASYNC) &&
+ !(features & (UFFD_FEATURE_MINOR_ANON |
+ UFFD_FEATURE_MINOR_HUGETLBFS |
+ UFFD_FEATURE_MINOR_SHMEM)))
+ goto err_out;
+
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
uffdio_api.features &=
- ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
+ ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM |
+ UFFD_FEATURE_MINOR_ANON | UFFD_FEATURE_MINOR_ASYNC);
#endif
if (!pgtable_supports_uffd_wp())
uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index fd5f42765497..d1d4ed4a08b0 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -208,9 +208,13 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma)
return vma->vm_flags & __VM_UFFD_FLAGS;
}
+/* Flags for vma_can_userfault() describing uffd context capabilities */
+#define UFFD_CTX_WP_ASYNC (1 << 0)
+#define UFFD_CTX_MINOR_ANON (1 << 1)
+
static inline bool vma_can_userfault(struct vm_area_struct *vma,
vm_flags_t vm_flags,
- bool wp_async)
+ unsigned int ctx_flags)
{
vm_flags &= __VM_UFFD_FLAGS;
@@ -218,14 +222,15 @@ static inline bool vma_can_userfault(struct vm_area_struct *vma,
return false;
if ((vm_flags & VM_UFFD_MINOR) &&
- (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma)))
+ !is_vm_hugetlb_page(vma) && !vma_is_shmem(vma) &&
+ !(vma_is_anonymous(vma) && (ctx_flags & UFFD_CTX_MINOR_ANON)))
return false;
/*
* If wp async enabled, and WP is the only mode enabled, allow any
* memory type.
*/
- if (wp_async && (vm_flags == VM_UFFD_WP))
+ if ((ctx_flags & UFFD_CTX_WP_ASYNC) && (vm_flags == VM_UFFD_WP))
return true;
/*
@@ -270,6 +275,7 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm,
struct list_head *uf);
extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
extern bool userfaultfd_wp_async(struct vm_area_struct *vma);
+extern bool userfaultfd_minor_async(struct vm_area_struct *vma);
void userfaultfd_reset_ctx(struct vm_area_struct *vma);
@@ -283,7 +289,7 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
struct vm_area_struct *vma,
vm_flags_t vm_flags,
unsigned long start, unsigned long end,
- bool wp_async);
+ unsigned int ctx_flags);
void userfaultfd_release_new(struct userfaultfd_ctx *ctx);
@@ -446,6 +452,11 @@ static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
return false;
}
+static inline bool userfaultfd_minor_async(struct vm_area_struct *vma)
+{
+ return false;
+}
+
static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
{
return false;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 927086bb4a3c..dba1ea26fdfe 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -2008,7 +2008,7 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
struct vm_area_struct *vma,
vm_flags_t vm_flags,
unsigned long start, unsigned long end,
- bool wp_async)
+ unsigned int ctx_flags)
{
VMA_ITERATOR(vmi, ctx->mm, start);
struct vm_area_struct *prev = vma_prev(&vmi);
@@ -2021,7 +2021,7 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
for_each_vma_range(vmi, vma, end) {
cond_resched();
- VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, wp_async));
+ VM_WARN_ON_ONCE(!vma_can_userfault(vma, vm_flags, ctx_flags));
VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx &&
vma->vm_userfaultfd_ctx.ctx != ctx);
VM_WARN_ON_ONCE(!(vma->vm_flags & VM_MAYWRITE));
--
2.51.2
^ permalink raw reply related
* [RFC, PATCH 03/12] userfaultfd: implement UFFDIO_DEACTIVATE ioctl
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
To: Andrew Morton
Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>
UFFDIO_DEACTIVATE marks pages as deactivated within a VM_UFFD_MINOR
range:
- Anonymous memory: set protnone via change_protection(MM_CP_UFFD_DEACTIVATE).
Pages stay resident with PFNs preserved, only permissions removed.
MM_CP_UFFD_DEACTIVATE is handled independently from MM_CP_PROT_NUMA,
bypassing folio_can_map_prot_numa() and CONFIG_NUMA_BALANCING guards.
- Shared shmem/hugetlbfs: zap PTEs via zap_page_range_single().
Pages stay in page cache.
- Private hugetlb: rejected with -EINVAL (zapping would destroy content).
Cleanup on unregister/close: restore protnone PTEs to normal permissions
in userfaultfd_clear_vma(), preventing permanently inaccessible pages.
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
fs/userfaultfd.c | 35 ++++++++++++++++
include/linux/mm.h | 2 +
include/linux/userfaultfd_k.h | 2 +
mm/huge_memory.c | 9 ++--
mm/mprotect.c | 9 +++-
mm/userfaultfd.c | 78 +++++++++++++++++++++++++++++++++--
6 files changed, 127 insertions(+), 8 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 8d508ad19e89..b317c9854b86 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -1441,6 +1441,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
+ /* DEACTIVATE is only supported for MINOR ranges. */
+ if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
+ ioctls_out &= ~((__u64)1 << _UFFDIO_DEACTIVATE);
+
/*
* Now that we scanned all vmas we can already tell
* userland which ioctls methods are guaranteed to
@@ -1788,6 +1792,34 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return ret;
}
+static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ int ret;
+ struct uffdio_range uffdio_range;
+
+ if (atomic_read(&ctx->mmap_changing))
+ return -EAGAIN;
+
+ if (copy_from_user(&uffdio_range, (void __user *)arg,
+ sizeof(uffdio_range)))
+ return -EFAULT;
+
+ ret = validate_range(ctx->mm, uffdio_range.start, uffdio_range.len);
+ if (ret)
+ return ret;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mdeactivate_range(ctx, uffdio_range.start,
+ uffdio_range.len);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
+ return ret;
+}
+
static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
__s64 ret;
@@ -2108,6 +2140,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_POISON:
ret = userfaultfd_poison(ctx, arg);
break;
+ case UFFDIO_DEACTIVATE:
+ ret = userfaultfd_deactivate(ctx, arg);
+ break;
}
return ret;
}
diff --git a/include/linux/mm.h b/include/linux/mm.h
index abb4963c1f06..fc2841264d56 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -3036,6 +3036,8 @@ int get_cmdline(struct task_struct *task, char *buffer, int buflen);
#define MM_CP_UFFD_WP_RESOLVE (1UL << 3) /* Resolve wp */
#define MM_CP_UFFD_WP_ALL (MM_CP_UFFD_WP | \
MM_CP_UFFD_WP_RESOLVE)
+/* Whether this change is for uffd deactivation */
+#define MM_CP_UFFD_DEACTIVATE (1UL << 4)
bool can_change_pte_writable(struct vm_area_struct *vma, unsigned long addr,
pte_t pte);
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d1d4ed4a08b0..c94b5c5b5f24 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -130,6 +130,8 @@ extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, bool enable_wp);
extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);
+extern int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len);
/* move_pages */
void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index b298cba853ab..2ad736ff007c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2563,6 +2563,7 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
spinlock_t *ptl;
pmd_t oldpmd, entry;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
int ret = 1;
@@ -2582,8 +2583,11 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
goto unlock;
}
- if (prot_numa) {
+ /* Already protnone — nothing to do for either NUMA or uffd */
+ if ((prot_numa || uffd_deactivate) && pmd_protnone(*pmd))
+ goto unlock;
+ if (prot_numa) {
/*
* Avoid trapping faults against the zero page. The read-only
* data is likely to be read-cached on the local CPU and
@@ -2592,9 +2596,6 @@ int change_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
if (is_huge_zero_pmd(*pmd))
goto unlock;
- if (pmd_protnone(*pmd))
- goto unlock;
-
if (!folio_can_map_prot_numa(pmd_folio(*pmd), vma,
vma_is_single_threaded_private(vma)))
goto unlock;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index c0571445bef7..7c612a680014 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -220,6 +220,7 @@ static long change_pte_range(struct mmu_gather *tlb,
long pages = 0;
bool is_private_single_threaded;
bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
+ bool uffd_deactivate = cp_flags & MM_CP_UFFD_DEACTIVATE;
bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
int nr_ptes;
@@ -245,7 +246,8 @@ static long change_pte_range(struct mmu_gather *tlb,
pte_t ptent;
/* Already in the desired state. */
- if (prot_numa && pte_protnone(oldpte))
+ if ((prot_numa || uffd_deactivate) &&
+ pte_protnone(oldpte))
continue;
page = vm_normal_page(vma, addr, oldpte);
@@ -255,6 +257,8 @@ static long change_pte_range(struct mmu_gather *tlb,
/*
* Avoid trapping faults against the zero or KSM
* pages. See similar comment in change_huge_pmd.
+ * Skip this filter for uffd deactivation which
+ * must set protnone regardless of NUMA placement.
*/
if (prot_numa &&
!folio_can_map_prot_numa(folio, vma,
@@ -651,6 +655,9 @@ long change_protection(struct mmu_gather *tlb,
WARN_ON_ONCE(cp_flags & MM_CP_PROT_NUMA);
#endif
+ if (cp_flags & MM_CP_UFFD_DEACTIVATE)
+ newprot = PAGE_NONE;
+
if (is_vm_hugetlb_page(vma))
pages = hugetlb_change_protection(vma, start, end, newprot,
cp_flags);
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index dba1ea26fdfe..3373b11b9d83 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -775,7 +775,7 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
goto out_unlock;
- if (!vma_is_shmem(dst_vma) &&
+ if (!vma_is_shmem(dst_vma) && !vma_is_anonymous(dst_vma) &&
uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE))
goto out_unlock;
@@ -797,13 +797,16 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
break;
}
dst_pmdval = pmdp_get_lockless(dst_pmd);
+ if (unlikely(!pmd_present(dst_pmdval))) {
+ err = -EEXIST;
+ break;
+ }
/*
* If the dst_pmd is THP don't override it and just be strict.
* (This includes the case where the PMD used to be THP and
* changed back to none after __pte_alloc().)
*/
- if (unlikely(!pmd_present(dst_pmdval) ||
- pmd_trans_huge(dst_pmdval))) {
+ if (unlikely(pmd_trans_huge(dst_pmdval))) {
err = -EEXIST;
break;
}
@@ -996,6 +999,65 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
return err;
}
+int mdeactivate_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len)
+{
+ struct mm_struct *dst_mm = ctx->mm;
+ unsigned long end = start + len;
+ struct vm_area_struct *dst_vma;
+ long err;
+ VMA_ITERATOR(vmi, dst_mm, start);
+
+ VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(start + len <= start);
+
+ guard(mmap_read_lock)(dst_mm);
+ guard(rwsem_read)(&ctx->map_changing_lock);
+
+ if (atomic_read(&ctx->mmap_changing))
+ return -EAGAIN;
+
+ err = -ENOENT;
+ for_each_vma_range(vmi, dst_vma, end) {
+ unsigned long vma_start = max(dst_vma->vm_start, start);
+ unsigned long vma_end = min(dst_vma->vm_end, end);
+
+ if (!userfaultfd_minor(dst_vma)) {
+ err = -ENOENT;
+ break;
+ }
+
+ /*
+ * Private hugetlb has no page cache to fall back on —
+ * zapping PTEs would destroy page content.
+ */
+ if (is_vm_hugetlb_page(dst_vma) &&
+ !(dst_vma->vm_flags & VM_SHARED)) {
+ err = -EINVAL;
+ break;
+ }
+
+ if (vma_is_anonymous(dst_vma)) {
+ /* Anonymous: set protnone, pages stay resident */
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, dst_mm);
+ err = change_protection(&tlb, dst_vma, vma_start,
+ vma_end,
+ MM_CP_UFFD_DEACTIVATE);
+ tlb_finish_mmu(&tlb);
+ if (err < 0)
+ break;
+ } else {
+ /* Shared shmem/hugetlb: zap PTEs, pages stay in page cache */
+ zap_page_range_single(dst_vma, vma_start,
+ vma_end - vma_start, NULL);
+ }
+ err = 0;
+ }
+ return err;
+}
void double_pt_lock(spinlock_t *ptl1,
spinlock_t *ptl2)
@@ -1988,6 +2050,16 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
if (userfaultfd_wp(vma))
uffd_wp_range(vma, start, end - start, false);
+ /* Restore protnone PTEs to normal permissions */
+ if (userfaultfd_minor(vma) && vma_is_anonymous(vma)) {
+ struct mmu_gather tlb;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+ change_protection(&tlb, vma, start, end,
+ MM_CP_TRY_CHANGE_WRITABLE);
+ tlb_finish_mmu(&tlb);
+ }
+
ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
vma->vm_flags & ~__VM_UFFD_FLAGS,
NULL_VM_UFFD_CTX, give_up_on_oom);
--
2.51.2
^ permalink raw reply related
* [RFC, PATCH 04/12] userfaultfd: UFFDIO_CONTINUE for anonymous memory
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
To: Andrew Morton
Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>
Allow UFFDIO_CONTINUE on anonymous VMAs with VM_UFFD_MINOR. For shmem,
CONTINUE installs a PTE from page cache. For anonymous memory, the
page is already mapped via a protnone PTE — CONTINUE restores the
original VMA permissions.
PTE level: mfill_atomic_pte_continue_anon() walks to the PTE, verifies
protnone, restores permissions. Rename the shmem path to
mfill_atomic_pte_continue_shmem() for clarity.
PMD/THP level: mfill_atomic_pmd_continue_anon() restores protnone PMD
permissions in place without splitting. Handles PMD races with EAGAIN
retry in the mfill_atomic loop.
Add protnone PTE/PMD checks in userfaultfd_must_wait() so sync minor
faults properly block until resolved.
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
fs/userfaultfd.c | 9 +++++-
mm/userfaultfd.c | 82 ++++++++++++++++++++++++++++++++++++++++++++----
2 files changed, 84 insertions(+), 7 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index b317c9854b86..43064238fd8d 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -340,8 +340,11 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
if (!pmd_present(_pmd))
return false;
- if (pmd_trans_huge(_pmd))
+ if (pmd_trans_huge(_pmd)) {
+ if (pmd_protnone(_pmd) && (reason & VM_UFFD_MINOR))
+ return true;
return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
+ }
pte = pte_offset_map(pmd, address);
if (!pte)
@@ -366,6 +369,9 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
*/
if (!pte_write(ptent) && (reason & VM_UFFD_WP))
goto out;
+ /* PTE is still protnone (deactivated), wait for userspace to resolve. */
+ if (pte_protnone(ptent) && (reason & VM_UFFD_MINOR))
+ goto out;
ret = false;
out:
@@ -1820,6 +1826,7 @@ static int userfaultfd_deactivate(struct userfaultfd_ctx *ctx,
return ret;
}
+
static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
__s64 ret;
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 3373b11b9d83..4c52fa5d1608 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -380,8 +380,61 @@ static int mfill_atomic_pte_zeropage(pmd_t *dst_pmd,
return ret;
}
-/* Handles UFFDIO_CONTINUE for all shmem VMAs (shared or private). */
-static int mfill_atomic_pte_continue(pmd_t *dst_pmd,
+static int mfill_atomic_pte_continue_anon(pmd_t *dst_pmd,
+ struct vm_area_struct *dst_vma,
+ unsigned long dst_addr,
+ uffd_flags_t flags)
+{
+ pte_t *ptep, pte;
+ spinlock_t *ptl;
+ int ret = -EFAULT;
+
+ ptep = pte_offset_map_lock(dst_vma->vm_mm, dst_pmd, dst_addr, &ptl);
+ if (!ptep)
+ return ret;
+
+ pte = ptep_get(ptep);
+ if (!pte_protnone(pte))
+ goto out_unlock;
+
+ pte = pte_modify(pte, dst_vma->vm_page_prot);
+ pte = pte_mkyoung(pte);
+ if (flags & MFILL_ATOMIC_WP)
+ pte = pte_wrprotect(pte);
+ set_pte_at(dst_vma->vm_mm, dst_addr, ptep, pte);
+ update_mmu_cache(dst_vma, dst_addr, ptep);
+ ret = 0;
+out_unlock:
+ pte_unmap_unlock(ptep, ptl);
+ return ret;
+}
+
+static int mfill_atomic_pmd_continue_anon(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ unsigned long addr,
+ pmd_t *pmd, pmd_t orig_pmd,
+ uffd_flags_t flags)
+{
+ spinlock_t *ptl;
+ pmd_t entry;
+
+ ptl = pmd_lock(mm, pmd);
+ if (unlikely(!pmd_same(pmdp_get(pmd), orig_pmd))) {
+ spin_unlock(ptl);
+ return -EAGAIN;
+ }
+
+ entry = pmd_modify(orig_pmd, vma->vm_page_prot);
+ entry = pmd_mkyoung(entry);
+ if (flags & MFILL_ATOMIC_WP)
+ entry = pmd_wrprotect(entry);
+ set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, entry);
+ update_mmu_cache_pmd(vma, addr, pmd);
+ spin_unlock(ptl);
+ return 0;
+}
+
+static int mfill_atomic_pte_continue_shmem(pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
unsigned long dst_addr,
uffd_flags_t flags)
@@ -667,7 +720,10 @@ static __always_inline ssize_t mfill_atomic_pte(pmd_t *dst_pmd,
ssize_t err;
if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE)) {
- return mfill_atomic_pte_continue(dst_pmd, dst_vma,
+ if (vma_is_anonymous(dst_vma))
+ return mfill_atomic_pte_continue_anon(dst_pmd, dst_vma,
+ dst_addr, flags);
+ return mfill_atomic_pte_continue_shmem(dst_pmd, dst_vma,
dst_addr, flags);
} else if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
return mfill_atomic_pte_poison(dst_pmd, dst_vma,
@@ -802,11 +858,25 @@ static __always_inline ssize_t mfill_atomic(struct userfaultfd_ctx *ctx,
break;
}
/*
- * If the dst_pmd is THP don't override it and just be strict.
- * (This includes the case where the PMD used to be THP and
- * changed back to none after __pte_alloc().)
+ * THP PMD: for anon CONTINUE, restore protnone PMD
+ * permissions in place. For other operations, reject.
*/
if (unlikely(pmd_trans_huge(dst_pmdval))) {
+ if (uffd_flags_mode_is(flags, MFILL_ATOMIC_CONTINUE) &&
+ vma_is_anonymous(dst_vma) &&
+ pmd_protnone(dst_pmdval)) {
+ err = mfill_atomic_pmd_continue_anon(
+ dst_mm, dst_vma, dst_addr,
+ dst_pmd, dst_pmdval, flags);
+ if (err == -EAGAIN)
+ continue; /* PMD changed, re-read it */
+ if (err)
+ break;
+ dst_addr += HPAGE_PMD_SIZE;
+ src_addr += HPAGE_PMD_SIZE;
+ copied += HPAGE_PMD_SIZE;
+ continue;
+ }
err = -EEXIST;
break;
}
--
2.51.2
^ permalink raw reply related
* [RFC, PATCH 05/12] mm: intercept protnone faults on VM_UFFD_MINOR anonymous VMAs
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
To: Andrew Morton
Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>
When a protnone PTE/PMD fault occurs on a VMA with VM_UFFD_MINOR,
dispatch to the userfaultfd minor fault path instead of NUMA balancing.
In async mode, restore the original VMA permissions inline in the fault
handler. In sync mode, deliver the fault to userspace via
handle_userfault().
Feed NUMA locality stats from the fault path via task_numa_fault()
so the scheduler retains placement data even though NUMA scanning
is skipped on these VMAs.
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
include/linux/huge_mm.h | 6 +++++
mm/huge_memory.c | 24 +++++++++++++++++++
mm/memory.c | 51 +++++++++++++++++++++++++++++++++++++++--
3 files changed, 79 insertions(+), 2 deletions(-)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index a4d9f964dfde..a900bb530998 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -519,6 +519,7 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
}
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
+vm_fault_t do_huge_pmd_uffd_minor(struct vm_fault *vmf);
vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf);
@@ -707,6 +708,11 @@ static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
return 0;
}
+static inline vm_fault_t do_huge_pmd_uffd_minor(struct vm_fault *vmf)
+{
+ return 0;
+}
+
static inline vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf)
{
return 0;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 2ad736ff007c..264c646a8573 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2181,6 +2181,30 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
return pmd_dirty(pmd);
}
+vm_fault_t do_huge_pmd_uffd_minor(struct vm_fault *vmf)
+{
+ struct vm_area_struct *vma = vmf->vma;
+
+ if (userfaultfd_minor_async(vma)) {
+ pmd_t pmd;
+
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
+ pmd = pmd_modify(vmf->orig_pmd, vma->vm_page_prot);
+ pmd = pmd_mkyoung(pmd);
+ set_pmd_at(vma->vm_mm, vmf->address & HPAGE_PMD_MASK,
+ vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
+
+ return handle_userfault(vmf, VM_UFFD_MINOR);
+}
+
/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
diff --git a/mm/memory.c b/mm/memory.c
index c65e82c86fed..f068ff4027e8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6045,6 +6045,47 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru
}
}
+static void uffd_minor_feed_numa_fault(struct vm_fault *vmf)
+{
+ struct folio *folio;
+
+ folio = vm_normal_folio(vmf->vma, vmf->address, vmf->orig_pte);
+ if (folio) {
+ int nid = folio_nid(folio);
+ int flags = 0;
+
+ if (nid == numa_node_id())
+ flags |= TNF_FAULT_LOCAL;
+ task_numa_fault(folio_last_cpupid(folio), nid, 1, flags);
+ }
+}
+
+static vm_fault_t do_uffd_minor_anon(struct vm_fault *vmf)
+{
+ /* Feed NUMA stats even though we skip NUMA scanning on this VMA */
+ uffd_minor_feed_numa_fault(vmf);
+
+ if (userfaultfd_minor_async(vmf->vma)) {
+ pte_t pte;
+
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
+ pte = pte_modify(vmf->orig_pte, vmf->vma->vm_page_prot);
+ pte = pte_mkyoung(pte);
+ set_pte_at(vmf->vma->vm_mm, vmf->address, vmf->pte, pte);
+ update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
+
+ /* Sync mode: unmap PTE and deliver to userfaultfd handler */
+ pte_unmap(vmf->pte);
+ return handle_userfault(vmf, VM_UFFD_MINOR);
+}
+
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -6319,8 +6360,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
- if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+ if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) {
+ if (userfaultfd_minor(vmf->vma))
+ return do_uffd_minor_anon(vmf);
return do_numa_page(vmf);
+ }
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
@@ -6434,8 +6478,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return 0;
}
if (pmd_trans_huge(vmf.orig_pmd)) {
- if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+ if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) {
+ if (userfaultfd_minor(vma))
+ return do_huge_pmd_uffd_minor(&vmf);
return do_huge_pmd_numa_page(&vmf);
+ }
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
!pmd_write(vmf.orig_pmd)) {
--
2.51.2
^ permalink raw reply related
* [RFC, PATCH 06/12] userfaultfd: auto-resolve shmem and hugetlbfs minor faults in async mode
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
To: Andrew Morton
Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>
When UFFD_FEATURE_MINOR_ASYNC is enabled, skip handle_userfault() in
the shmem and hugetlbfs minor fault paths. The normal fault path
installs the PTE from page cache directly.
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
mm/hugetlb.c | 3 ++-
mm/shmem.c | 3 ++-
2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 327eaa4074d3..c10d2432768c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -5847,7 +5847,8 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
}
/* Check for page in userfault range. */
- if (userfaultfd_minor(vma)) {
+ if (userfaultfd_minor(vma) &&
+ !userfaultfd_minor_async(vma)) {
folio_unlock(folio);
folio_put(folio);
/* See comment in userfaultfd_missing() block above */
diff --git a/mm/shmem.c b/mm/shmem.c
index b40f3cd48961..ce47e77fc090 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2489,7 +2489,8 @@ static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index,
fault_mm = vma ? vma->vm_mm : NULL;
folio = filemap_get_entry(inode->i_mapping, index);
- if (folio && vma && userfaultfd_minor(vma)) {
+ if (folio && vma && userfaultfd_minor(vma) &&
+ !userfaultfd_minor_async(vma)) {
if (!xa_is_value(folio))
folio_put(folio);
*fault_type = handle_userfault(vmf, VM_UFFD_MINOR);
--
2.51.2
^ permalink raw reply related
* [RFC, PATCH 07/12] sched/numa: skip scanning anonymous VM_UFFD_MINOR VMAs
From: Kiryl Shutsemau (Meta) @ 2026-04-14 14:23 UTC (permalink / raw)
To: Andrew Morton
Cc: Peter Xu, David Hildenbrand, Lorenzo Stoakes, Mike Rapoport,
Suren Baghdasaryan, Vlastimil Babka, Liam R . Howlett, Zi Yan,
Jonathan Corbet, Shuah Khan, Sean Christopherson, Paolo Bonzini,
linux-mm, linux-kernel, linux-doc, linux-kselftest, kvm,
Kiryl Shutsemau (Meta)
In-Reply-To: <20260414142354.1465950-1-kas@kernel.org>
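Both NUMA balancing and userfaultfd deactivation use protnone PTEs on
anonymous memory. Skip NUMA scanning of anonymous VMAs registered with
VM_UFFD_MINOR so the two mechanisms do not conflict. Shmem is unaffected:
UFFDIO_DEACTIVATE zaps its PTEs instead of making them protnone.
NUMA stats for the skipped VMAs are fed from the uffd fault path instead.
Add the NUMAB_SKIP_UFFD_MINOR trace reason.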
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
include/linux/sched/numa_balancing.h | 1 +
include/trace/events/sched.h | 3 ++-
kernel/sched/fair.c | 13 +++++++++++++
3 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h
index 52b22c5c396d..5668074a4271 100644
--- a/include/linux/sched/numa_balancing.h
+++ b/include/linux/sched/numa_balancing.h
@@ -23,6 +23,7 @@ enum numa_vmaskip_reason {
NUMAB_SKIP_PID_INACTIVE,
NUMAB_SKIP_IGNORE_PID,
NUMAB_SKIP_SEQ_COMPLETED,
+ NUMAB_SKIP_UFFD_MINOR,
};
#ifdef CONFIG_NUMA_BALANCING
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index 7b2645b50e78..02e79b56db28 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -728,7 +728,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa,
EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \
EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \
EM( NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) \
- EMe(NUMAB_SKIP_SEQ_COMPLETED, "seq_completed" )
+ EM( NUMAB_SKIP_SEQ_COMPLETED, "seq_completed" ) \
+ EMe(NUMAB_SKIP_UFFD_MINOR, "uffd_minor" )
/* Redefine for export. */
#undef EM
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab4114712be7..57beb04562cf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -25,6 +25,7 @@
#include <linux/hugetlb_inline.h>
#include <linux/jiffies.h>
#include <linux/mm_api.h>
+#include <linux/userfaultfd_k.h>
#include <linux/highmem.h>
#include <linux/spinlock_api.h>
#include <linux/cpumask_api.h>
@@ -3459,6 +3460,18 @@ static void task_numa_work(struct callback_head *work)
continue;
}
+ /*
+ * Skip anonymous VMAs registered for userfaultfd minor faults.
+ * Both NUMA balancing and uffd use protnone PTEs on anonymous
+ * memory — let uffd own the hinting. For shmem, UFFDIO_DEACTIVATE
+ * zaps PTEs entirely (no protnone conflict), so NUMA scanning
+ * can proceed normally.
+ */
+ if (vma_is_anonymous(vma) && userfaultfd_minor(vma)) {
+ trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UFFD_MINOR);
+ continue;
+ }
+
/*
* Shared library pages mapped by multiple processes are not
* migrated as it is expected they are cache replicated. Avoid
--
2.51.2
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox