* [PATCH v18 14/14] crypto: qce - Communicate the base physical address to the dmaengine
From: Bartosz Golaszewski @ 2026-05-22 13:40 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
In order to communicate to the BAM DMA engine which address should be
used as a scratchpad for dummy writes related to BAM pipe locking,
fill out and attach the provided metadata struct to the descriptor.
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/crypto/qce/dma.c | 9 +++++++++
1 file changed, 9 insertions(+)
diff --git a/drivers/crypto/qce/dma.c b/drivers/crypto/qce/dma.c
index 437314f2aa94feee765f750304a28ed7beca90b0..f7a7b98d843f03b7a2722df0376a7be6b4a09114 100644
--- a/drivers/crypto/qce/dma.c
+++ b/drivers/crypto/qce/dma.c
@@ -11,6 +11,7 @@
#include "core.h"
#include "dma.h"
+#include "regs-v5.h"
#define QCE_IGNORE_BUF_SZ (2 * QCE_BAM_BURST_SIZE)
#define QCE_BAM_CMD_SGL_SIZE 128
@@ -41,6 +42,10 @@ void qce_clear_bam_transaction(struct qce_device *qce)
int qce_submit_cmd_desc(struct qce_device *qce)
{
+ struct bam_desc_metadata meta = {
+ .scratchpad_addr = qce->base_phys + REG_VERSION,
+ .direction = DMA_MEM_TO_DEV,
+ };
struct qce_desc_info *qce_desc = qce->dma.bam_txn->desc;
struct qce_bam_transaction *bam_txn = qce->dma.bam_txn;
struct dma_async_tx_descriptor *dma_desc;
@@ -60,6 +65,10 @@ int qce_submit_cmd_desc(struct qce_device *qce)
goto err_unmap_sg;
}
+ ret = dmaengine_desc_attach_metadata(dma_desc, &meta, sizeof(meta));
+ if (ret)
+ goto err_unmap_sg;
+
qce_desc->dma_desc = dma_desc;
cookie = dmaengine_submit(qce_desc->dma_desc);
--
2.47.3
^ permalink raw reply related
* [PATCH v18 13/14] crypto: qce - Add BAM DMA support for crypto register I/O
From: Bartosz Golaszewski @ 2026-05-22 13:40 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Switch to using BAM DMA for register I/O in addition to passing data. To
that end: provide the necessary infrastructure in the driver, modify the
ordering of operations as required and replace all direct register writes
with wrappers queueing DMA command descriptors.
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/crypto/qce/aead.c | 10 ++--
drivers/crypto/qce/common.c | 20 ++++---
drivers/crypto/qce/dma.c | 120 ++++++++++++++++++++++++++++++++++++++++--
drivers/crypto/qce/dma.h | 5 ++
drivers/crypto/qce/sha.c | 10 ++--
drivers/crypto/qce/skcipher.c | 10 ++--
6 files changed, 144 insertions(+), 31 deletions(-)
diff --git a/drivers/crypto/qce/aead.c b/drivers/crypto/qce/aead.c
index 03b8042da9a1b4aebdc775ad8ab912abc7b2383d..e271ecbcbb4a33c405fbec85c458cf1daa7e2c55 100644
--- a/drivers/crypto/qce/aead.c
+++ b/drivers/crypto/qce/aead.c
@@ -463,17 +463,17 @@ qce_aead_async_req_handle(struct crypto_async_request *async_req)
src_nents = dst_nents - 1;
}
- ret = qce_dma_prep_sgs(&qce->dma, rctx->src_sg, src_nents, rctx->dst_sg, dst_nents,
- qce_aead_done, async_req);
+ ret = qce_start(async_req, tmpl->crypto_alg_type);
if (ret)
goto error_unmap_src;
- qce_dma_issue_pending(&qce->dma);
-
- ret = qce_start(async_req, tmpl->crypto_alg_type);
+ ret = qce_dma_prep_sgs(&qce->dma, rctx->src_sg, src_nents, rctx->dst_sg, dst_nents,
+ qce_aead_done, async_req);
if (ret)
goto error_terminate;
+ qce_dma_issue_pending(&qce->dma);
+
return 0;
error_terminate:
diff --git a/drivers/crypto/qce/common.c b/drivers/crypto/qce/common.c
index 54a78a57f63028f01870a3edeb8e390f523bb190..37bb6f03244d317a887aeb0aa10cefe327b4ce05 100644
--- a/drivers/crypto/qce/common.c
+++ b/drivers/crypto/qce/common.c
@@ -25,7 +25,7 @@ static inline u32 qce_read(struct qce_device *qce, u32 offset)
static inline void qce_write(struct qce_device *qce, u32 offset, u32 val)
{
- writel(val, qce->base + offset);
+ qce_write_dma(qce, offset, val);
}
static inline void qce_write_array(struct qce_device *qce, u32 offset,
@@ -82,6 +82,8 @@ static void qce_setup_config(struct qce_device *qce)
{
u32 config;
+ qce_clear_bam_transaction(qce);
+
/* get big endianness */
config = qce_config_reg(qce, 0);
@@ -90,12 +92,14 @@ static void qce_setup_config(struct qce_device *qce)
qce_write(qce, REG_CONFIG, config);
}
-static inline void qce_crypto_go(struct qce_device *qce, bool result_dump)
+static inline int qce_crypto_go(struct qce_device *qce, bool result_dump)
{
if (result_dump)
qce_write(qce, REG_GOPROC, BIT(GO_SHIFT) | BIT(RESULTS_DUMP_SHIFT));
else
qce_write(qce, REG_GOPROC, BIT(GO_SHIFT));
+
+ return qce_submit_cmd_desc(qce);
}
#if defined(CONFIG_CRYPTO_DEV_QCE_SHA) || defined(CONFIG_CRYPTO_DEV_QCE_AEAD)
@@ -223,9 +227,7 @@ static int qce_setup_regs_ahash(struct crypto_async_request *async_req)
config = qce_config_reg(qce, 1);
qce_write(qce, REG_CONFIG, config);
- qce_crypto_go(qce, true);
-
- return 0;
+ return qce_crypto_go(qce, true);
}
#endif
@@ -386,9 +388,7 @@ static int qce_setup_regs_skcipher(struct crypto_async_request *async_req)
config = qce_config_reg(qce, 1);
qce_write(qce, REG_CONFIG, config);
- qce_crypto_go(qce, true);
-
- return 0;
+ return qce_crypto_go(qce, true);
}
#endif
@@ -535,9 +535,7 @@ static int qce_setup_regs_aead(struct crypto_async_request *async_req)
qce_write(qce, REG_CONFIG, config);
/* Start the process */
- qce_crypto_go(qce, !IS_CCM(flags));
-
- return 0;
+ return qce_crypto_go(qce, !IS_CCM(flags));
}
#endif
diff --git a/drivers/crypto/qce/dma.c b/drivers/crypto/qce/dma.c
index 3db46fc0c419a0a387abce93649084fbf4b1f128..437314f2aa94feee765f750304a28ed7beca90b0 100644
--- a/drivers/crypto/qce/dma.c
+++ b/drivers/crypto/qce/dma.c
@@ -4,6 +4,8 @@
*/
#include <linux/device.h>
+#include <linux/dma/qcom_bam_dma.h>
+#include <linux/dma-mapping.h>
#include <linux/dmaengine.h>
#include <crypto/scatterwalk.h>
@@ -11,6 +13,96 @@
#include "dma.h"
#define QCE_IGNORE_BUF_SZ (2 * QCE_BAM_BURST_SIZE)
+#define QCE_BAM_CMD_SGL_SIZE 128
+#define QCE_BAM_CMD_ELEMENT_SIZE 128
+
+struct qce_desc_info {
+ struct dma_async_tx_descriptor *dma_desc;
+ enum dma_data_direction dir;
+};
+
+struct qce_bam_transaction {
+ struct bam_cmd_element bam_ce[QCE_BAM_CMD_ELEMENT_SIZE];
+ struct scatterlist wr_sgl[QCE_BAM_CMD_SGL_SIZE];
+ struct qce_desc_info *desc;
+ u32 bam_ce_idx;
+ u32 pre_bam_ce_idx;
+ u32 wr_sgl_cnt;
+};
+
+void qce_clear_bam_transaction(struct qce_device *qce)
+{
+ struct qce_bam_transaction *bam_txn = qce->dma.bam_txn;
+
+ bam_txn->bam_ce_idx = 0;
+ bam_txn->wr_sgl_cnt = 0;
+ bam_txn->pre_bam_ce_idx = 0;
+}
+
+int qce_submit_cmd_desc(struct qce_device *qce)
+{
+ struct qce_desc_info *qce_desc = qce->dma.bam_txn->desc;
+ struct qce_bam_transaction *bam_txn = qce->dma.bam_txn;
+ struct dma_async_tx_descriptor *dma_desc;
+ struct dma_chan *chan = qce->dma.rxchan;
+ unsigned long attrs = DMA_PREP_CMD;
+ dma_cookie_t cookie;
+ unsigned int mapped;
+ int ret;
+
+ mapped = dma_map_sg(qce->dev, bam_txn->wr_sgl, bam_txn->wr_sgl_cnt, DMA_TO_DEVICE);
+ if (!mapped)
+ return -ENOMEM;
+
+ dma_desc = dmaengine_prep_slave_sg(chan, bam_txn->wr_sgl, mapped, DMA_MEM_TO_DEV, attrs);
+ if (!dma_desc) {
+ ret = -ENOMEM;
+ goto err_unmap_sg;
+ }
+
+ qce_desc->dma_desc = dma_desc;
+ cookie = dmaengine_submit(qce_desc->dma_desc);
+
+ ret = dma_submit_error(cookie);
+ if (ret)
+ goto err_unmap_sg;
+
+ return 0;
+
+err_unmap_sg:
+ dma_unmap_sg(qce->dev, bam_txn->wr_sgl, bam_txn->wr_sgl_cnt, DMA_TO_DEVICE);
+ return ret;
+}
+
+static void qce_prep_dma_cmd_desc(struct qce_device *qce, struct qce_dma_data *dma,
+ unsigned int addr, void *buf)
+{
+ struct qce_bam_transaction *bam_txn = dma->bam_txn;
+ struct bam_cmd_element *bam_ce_buf;
+ int bam_ce_size, cnt, idx;
+
+ idx = bam_txn->bam_ce_idx;
+ bam_ce_buf = &bam_txn->bam_ce[idx];
+ bam_prep_ce_le32(bam_ce_buf, addr, BAM_WRITE_COMMAND, *((__le32 *)buf));
+
+ bam_ce_buf = &bam_txn->bam_ce[bam_txn->pre_bam_ce_idx];
+ bam_txn->bam_ce_idx++;
+ bam_ce_size = (bam_txn->bam_ce_idx - bam_txn->pre_bam_ce_idx) * sizeof(*bam_ce_buf);
+
+ cnt = bam_txn->wr_sgl_cnt;
+
+ sg_set_buf(&bam_txn->wr_sgl[cnt], bam_ce_buf, bam_ce_size);
+
+ ++bam_txn->wr_sgl_cnt;
+ bam_txn->pre_bam_ce_idx = bam_txn->bam_ce_idx;
+}
+
+void qce_write_dma(struct qce_device *qce, unsigned int offset, u32 val)
+{
+ unsigned int reg_addr = ((unsigned int)(qce->base_phys) + offset);
+
+ qce_prep_dma_cmd_desc(qce, &qce->dma, reg_addr, &val);
+}
int devm_qce_dma_request(struct qce_device *qce)
{
@@ -31,6 +123,16 @@ int devm_qce_dma_request(struct qce_device *qce)
return dev_err_probe(dev, PTR_ERR(dma->rxchan),
"Failed to get RX DMA channel\n");
+ dma->bam_txn = devm_kzalloc(dev, sizeof(*dma->bam_txn), GFP_KERNEL);
+ if (!dma->bam_txn)
+ return -ENOMEM;
+
+ dma->bam_txn->desc = devm_kzalloc(dev, sizeof(*dma->bam_txn->desc), GFP_KERNEL);
+ if (!dma->bam_txn->desc)
+ return -ENOMEM;
+
+ sg_init_table(dma->bam_txn->wr_sgl, QCE_BAM_CMD_SGL_SIZE);
+
return 0;
}
@@ -90,28 +192,36 @@ int qce_dma_prep_sgs(struct qce_dma_data *dma, struct scatterlist *rx_sg,
{
struct dma_chan *rxchan = dma->rxchan;
struct dma_chan *txchan = dma->txchan;
- unsigned long flags = DMA_PREP_INTERRUPT | DMA_CTRL_ACK;
+ unsigned long txflags = DMA_PREP_INTERRUPT | DMA_CTRL_ACK;
+ unsigned long rxflags = txflags | DMA_PREP_FENCE;
int ret;
- ret = qce_dma_prep_sg(rxchan, rx_sg, rx_nents, flags, DMA_MEM_TO_DEV,
+ ret = qce_dma_prep_sg(rxchan, rx_sg, rx_nents, rxflags, DMA_MEM_TO_DEV,
NULL, NULL);
if (ret)
return ret;
- return qce_dma_prep_sg(txchan, tx_sg, tx_nents, flags, DMA_DEV_TO_MEM,
+ return qce_dma_prep_sg(txchan, tx_sg, tx_nents, txflags, DMA_DEV_TO_MEM,
cb, cb_param);
}
void qce_dma_issue_pending(struct qce_dma_data *dma)
{
- dma_async_issue_pending(dma->rxchan);
dma_async_issue_pending(dma->txchan);
+ dma_async_issue_pending(dma->rxchan);
}
int qce_dma_terminate_all(struct qce_dma_data *dma)
{
+ struct qce_device *qce = container_of(dma, struct qce_device, dma);
+ struct qce_bam_transaction *bam_txn = dma->bam_txn;
int ret;
ret = dmaengine_terminate_all(dma->rxchan);
- return ret ?: dmaengine_terminate_all(dma->txchan);
+ if (ret)
+ return ret;
+
+ dma_unmap_sg(qce->dev, bam_txn->wr_sgl, bam_txn->wr_sgl_cnt, DMA_TO_DEVICE);
+
+ return dmaengine_terminate_all(dma->txchan);
}
diff --git a/drivers/crypto/qce/dma.h b/drivers/crypto/qce/dma.h
index 483789d9fa98e79d1283de8297bf2fc2a773f3a7..f05dfa9e6b25bd60e32f45079a8bc7e6a4cf81f9 100644
--- a/drivers/crypto/qce/dma.h
+++ b/drivers/crypto/qce/dma.h
@@ -8,6 +8,7 @@
#include <linux/dmaengine.h>
+struct qce_bam_transaction;
struct qce_device;
/* maximum data transfer block size between BAM and CE */
@@ -32,6 +33,7 @@ struct qce_dma_data {
struct dma_chan *txchan;
struct dma_chan *rxchan;
struct qce_result_dump *result_buf;
+ struct qce_bam_transaction *bam_txn;
};
int devm_qce_dma_request(struct qce_device *qce);
@@ -43,5 +45,8 @@ int qce_dma_terminate_all(struct qce_dma_data *dma);
struct scatterlist *
qce_sgtable_add(struct sg_table *sgt, struct scatterlist *sg_add,
unsigned int max_len);
+void qce_write_dma(struct qce_device *qce, unsigned int offset, u32 val);
+int qce_submit_cmd_desc(struct qce_device *qce);
+void qce_clear_bam_transaction(struct qce_device *qce);
#endif /* _DMA_H_ */
diff --git a/drivers/crypto/qce/sha.c b/drivers/crypto/qce/sha.c
index a3a1a205aaf8559a04809936e2a3b7d564c16c53..5be82b345753f49202797852cec09dbc7f0a1e03 100644
--- a/drivers/crypto/qce/sha.c
+++ b/drivers/crypto/qce/sha.c
@@ -109,17 +109,17 @@ static int qce_ahash_async_req_handle(struct crypto_async_request *async_req)
goto error_unmap_src;
}
- ret = qce_dma_prep_sgs(&qce->dma, req->src, rctx->src_nents,
- &rctx->result_sg, 1, qce_ahash_done, async_req);
+ ret = qce_start(async_req, tmpl->crypto_alg_type);
if (ret)
goto error_unmap_dst;
- qce_dma_issue_pending(&qce->dma);
-
- ret = qce_start(async_req, tmpl->crypto_alg_type);
+ ret = qce_dma_prep_sgs(&qce->dma, req->src, rctx->src_nents,
+ &rctx->result_sg, 1, qce_ahash_done, async_req);
if (ret)
goto error_terminate;
+ qce_dma_issue_pending(&qce->dma);
+
return 0;
error_terminate:
diff --git a/drivers/crypto/qce/skcipher.c b/drivers/crypto/qce/skcipher.c
index 1fef315a7105c869e7fc6a60719087b721e78bb3..6535336a2c57c39db94999011890b8bdad5c58c2 100644
--- a/drivers/crypto/qce/skcipher.c
+++ b/drivers/crypto/qce/skcipher.c
@@ -142,18 +142,18 @@ qce_skcipher_async_req_handle(struct crypto_async_request *async_req)
src_nents = dst_nents - 1;
}
+ ret = qce_start(async_req, tmpl->crypto_alg_type);
+ if (ret)
+ goto error_unmap_src;
+
ret = qce_dma_prep_sgs(&qce->dma, rctx->src_sg, src_nents,
rctx->dst_sg, dst_nents,
qce_skcipher_done, async_req);
if (ret)
- goto error_unmap_src;
+ goto error_terminate;
qce_dma_issue_pending(&qce->dma);
- ret = qce_start(async_req, tmpl->crypto_alg_type);
- if (ret)
- goto error_terminate;
-
return 0;
error_terminate:
--
2.47.3
^ permalink raw reply related
* [PATCH v18 12/14] crypto: qce - Map crypto memory for DMA
From: Bartosz Golaszewski @ 2026-05-22 13:40 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
As the first step in converting the driver to using DMA for register
I/O, let's map the crypto memory range.
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/crypto/qce/core.c | 23 ++++++++++++++++++++++-
drivers/crypto/qce/core.h | 6 ++++++
2 files changed, 28 insertions(+), 1 deletion(-)
diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c
index a0e2eadc3afd5f83e46724c8bc3e3690146b86ba..d7b7a3dda464964afe6a6893bb329d5bd5759dcd 100644
--- a/drivers/crypto/qce/core.c
+++ b/drivers/crypto/qce/core.c
@@ -192,10 +192,19 @@ static void qce_cancel_work(void *data)
cancel_work_sync(work);
}
+static void qce_crypto_unmap_dma(void *data)
+{
+ struct qce_device *qce = data;
+
+ dma_unmap_resource(qce->dev, qce->base_dma, qce->dma_size,
+ DMA_BIDIRECTIONAL, 0);
+}
+
static int qce_crypto_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
struct qce_device *qce;
+ struct resource *res;
int ret;
qce = devm_kzalloc(dev, sizeof(*qce), GFP_KERNEL);
@@ -205,7 +214,7 @@ static int qce_crypto_probe(struct platform_device *pdev)
qce->dev = dev;
platform_set_drvdata(pdev, qce);
- qce->base = devm_platform_ioremap_resource(pdev, 0);
+ qce->base = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
if (IS_ERR(qce->base))
return PTR_ERR(qce->base);
@@ -255,6 +264,18 @@ static int qce_crypto_probe(struct platform_device *pdev)
qce->async_req_enqueue = qce_async_request_enqueue;
qce->async_req_done = qce_async_request_done;
+ qce->dma_size = resource_size(res);
+ qce->base_dma = dma_map_resource(dev, res->start, qce->dma_size,
+ DMA_BIDIRECTIONAL, 0);
+ qce->base_phys = res->start;
+ ret = dma_mapping_error(dev, qce->base_dma);
+ if (ret)
+ return ret;
+
+ ret = devm_add_action_or_reset(qce->dev, qce_crypto_unmap_dma, qce);
+ if (ret)
+ return ret;
+
return devm_qce_register_algs(qce);
}
diff --git a/drivers/crypto/qce/core.h b/drivers/crypto/qce/core.h
index f092ce2d3b04a936a37805c20ac5ba78d8fdd2df..a80e12eac6c87e5321cce16c56a4bf5003474ef0 100644
--- a/drivers/crypto/qce/core.h
+++ b/drivers/crypto/qce/core.h
@@ -27,6 +27,9 @@
* @dma: pointer to dma data
* @burst_size: the crypto burst size
* @pipe_pair_id: which pipe pair id the device using
+ * @base_dma: base DMA address
+ * @base_phys: base physical address
+ * @dma_size: size of memory mapped for DMA
* @async_req_enqueue: invoked by every algorithm to enqueue a request
* @async_req_done: invoked by every algorithm to finish its request
*/
@@ -43,6 +46,9 @@ struct qce_device {
struct qce_dma_data dma;
int burst_size;
unsigned int pipe_pair_id;
+ dma_addr_t base_dma;
+ phys_addr_t base_phys;
+ size_t dma_size;
int (*async_req_enqueue)(struct qce_device *qce,
struct crypto_async_request *req);
void (*async_req_done)(struct qce_device *qce, int ret);
--
2.47.3
^ permalink raw reply related
* [PATCH v18 11/14] crypto: qce - Use existing devres APIs in devm_qce_dma_request()
From: Bartosz Golaszewski @ 2026-05-22 13:40 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski,
Konrad Dybcio
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Switch to devm_kmalloc() and devm_dma_alloc_chan() in
devm_qce_dma_request(). This allows us to drop two labels and shrink the
function.
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/crypto/qce/dma.c | 41 ++++++++++-------------------------------
1 file changed, 10 insertions(+), 31 deletions(-)
diff --git a/drivers/crypto/qce/dma.c b/drivers/crypto/qce/dma.c
index c29b0abe9445381a019e0447d30acfd7319d5c1f..3db46fc0c419a0a387abce93649084fbf4b1f128 100644
--- a/drivers/crypto/qce/dma.c
+++ b/drivers/crypto/qce/dma.c
@@ -12,47 +12,26 @@
#define QCE_IGNORE_BUF_SZ (2 * QCE_BAM_BURST_SIZE)
-static void qce_dma_release(void *data)
-{
- struct qce_dma_data *dma = data;
-
- dma_release_channel(dma->txchan);
- dma_release_channel(dma->rxchan);
- kfree(dma->result_buf);
-}
-
int devm_qce_dma_request(struct qce_device *qce)
{
struct qce_dma_data *dma = &qce->dma;
struct device *dev = qce->dev;
- int ret;
- dma->txchan = dma_request_chan(dev, "tx");
+ dma->result_buf = devm_kmalloc(dev, QCE_RESULT_BUF_SZ + QCE_IGNORE_BUF_SZ, GFP_KERNEL);
+ if (!dma->result_buf)
+ return -ENOMEM;
+
+ dma->txchan = devm_dma_request_chan(dev, "tx");
if (IS_ERR(dma->txchan))
return dev_err_probe(dev, PTR_ERR(dma->txchan),
"Failed to get TX DMA channel\n");
- dma->rxchan = dma_request_chan(dev, "rx");
- if (IS_ERR(dma->rxchan)) {
- ret = dev_err_probe(dev, PTR_ERR(dma->rxchan),
- "Failed to get RX DMA channel\n");
- goto error_rx;
- }
-
- dma->result_buf = kmalloc(QCE_RESULT_BUF_SZ + QCE_IGNORE_BUF_SZ,
- GFP_KERNEL);
- if (!dma->result_buf) {
- ret = -ENOMEM;
- goto error_nomem;
- }
-
- return devm_add_action_or_reset(dev, qce_dma_release, dma);
+ dma->rxchan = devm_dma_request_chan(dev, "rx");
+ if (IS_ERR(dma->rxchan))
+ return dev_err_probe(dev, PTR_ERR(dma->rxchan),
+ "Failed to get RX DMA channel\n");
-error_nomem:
- dma_release_channel(dma->rxchan);
-error_rx:
- dma_release_channel(dma->txchan);
- return ret;
+ return 0;
}
struct scatterlist *
--
2.47.3
^ permalink raw reply related
* [PATCH v18 10/14] crypto: qce - Simplify arguments of devm_qce_dma_request()
From: Bartosz Golaszewski @ 2026-05-22 13:40 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
This function can extract all the information it needs from struct
qce_device alone so simplify its arguments. This is done in preparation
for adding support for register I/O over DMA which will require
accessing even more fields from struct qce_device.
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/crypto/qce/core.c | 2 +-
drivers/crypto/qce/dma.c | 5 ++++-
drivers/crypto/qce/dma.h | 4 +++-
3 files changed, 8 insertions(+), 3 deletions(-)
diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c
index ad37c2b8ae53a373bb248aff06c3b7946e8439a8..a0e2eadc3afd5f83e46724c8bc3e3690146b86ba 100644
--- a/drivers/crypto/qce/core.c
+++ b/drivers/crypto/qce/core.c
@@ -238,7 +238,7 @@ static int qce_crypto_probe(struct platform_device *pdev)
if (ret)
return ret;
- ret = devm_qce_dma_request(qce->dev, &qce->dma);
+ ret = devm_qce_dma_request(qce);
if (ret)
return ret;
diff --git a/drivers/crypto/qce/dma.c b/drivers/crypto/qce/dma.c
index 08bf3e8ec12433c1a8ee17003f3487e41b7329e4..c29b0abe9445381a019e0447d30acfd7319d5c1f 100644
--- a/drivers/crypto/qce/dma.c
+++ b/drivers/crypto/qce/dma.c
@@ -7,6 +7,7 @@
#include <linux/dmaengine.h>
#include <crypto/scatterwalk.h>
+#include "core.h"
#include "dma.h"
#define QCE_IGNORE_BUF_SZ (2 * QCE_BAM_BURST_SIZE)
@@ -20,8 +21,10 @@ static void qce_dma_release(void *data)
kfree(dma->result_buf);
}
-int devm_qce_dma_request(struct device *dev, struct qce_dma_data *dma)
+int devm_qce_dma_request(struct qce_device *qce)
{
+ struct qce_dma_data *dma = &qce->dma;
+ struct device *dev = qce->dev;
int ret;
dma->txchan = dma_request_chan(dev, "tx");
diff --git a/drivers/crypto/qce/dma.h b/drivers/crypto/qce/dma.h
index fc337c435cd14917bdfb99febcf9119275afdeba..483789d9fa98e79d1283de8297bf2fc2a773f3a7 100644
--- a/drivers/crypto/qce/dma.h
+++ b/drivers/crypto/qce/dma.h
@@ -8,6 +8,8 @@
#include <linux/dmaengine.h>
+struct qce_device;
+
/* maximum data transfer block size between BAM and CE */
#define QCE_BAM_BURST_SIZE 64
@@ -32,7 +34,7 @@ struct qce_dma_data {
struct qce_result_dump *result_buf;
};
-int devm_qce_dma_request(struct device *dev, struct qce_dma_data *dma);
+int devm_qce_dma_request(struct qce_device *qce);
int qce_dma_prep_sgs(struct qce_dma_data *dma, struct scatterlist *sg_in,
int in_ents, struct scatterlist *sg_out, int out_ents,
dma_async_tx_callback cb, void *cb_param);
--
2.47.3
^ permalink raw reply related
* [PATCH v18 09/14] crypto: qce - Remove unused ignore_buf
From: Bartosz Golaszewski @ 2026-05-22 13:40 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
It's unclear what the purpose of this field is. It has been here since
the initial commit but without any explanation. The driver works fine
without it. We still keep allocating more space in the result buffer, we
just don't need to store its address. While at it: move the
QCE_IGNORE_BUF_SZ definition into dma.c as it's not used outside of this
compilation unit.
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/crypto/qce/dma.c | 4 ++--
drivers/crypto/qce/dma.h | 2 --
2 files changed, 2 insertions(+), 4 deletions(-)
diff --git a/drivers/crypto/qce/dma.c b/drivers/crypto/qce/dma.c
index 68cafd4741ad3d91906d39e817fc7873b028d498..08bf3e8ec12433c1a8ee17003f3487e41b7329e4 100644
--- a/drivers/crypto/qce/dma.c
+++ b/drivers/crypto/qce/dma.c
@@ -9,6 +9,8 @@
#include "dma.h"
+#define QCE_IGNORE_BUF_SZ (2 * QCE_BAM_BURST_SIZE)
+
static void qce_dma_release(void *data)
{
struct qce_dma_data *dma = data;
@@ -41,8 +43,6 @@ int devm_qce_dma_request(struct device *dev, struct qce_dma_data *dma)
goto error_nomem;
}
- dma->ignore_buf = dma->result_buf + QCE_RESULT_BUF_SZ;
-
return devm_add_action_or_reset(dev, qce_dma_release, dma);
error_nomem:
diff --git a/drivers/crypto/qce/dma.h b/drivers/crypto/qce/dma.h
index 31629185000e12242fa07c2cc08b95fcbd5d4b8c..fc337c435cd14917bdfb99febcf9119275afdeba 100644
--- a/drivers/crypto/qce/dma.h
+++ b/drivers/crypto/qce/dma.h
@@ -23,7 +23,6 @@ struct qce_result_dump {
u32 status2;
};
-#define QCE_IGNORE_BUF_SZ (2 * QCE_BAM_BURST_SIZE)
#define QCE_RESULT_BUF_SZ \
ALIGN(sizeof(struct qce_result_dump), QCE_BAM_BURST_SIZE)
@@ -31,7 +30,6 @@ struct qce_dma_data {
struct dma_chan *txchan;
struct dma_chan *rxchan;
struct qce_result_dump *result_buf;
- void *ignore_buf;
};
int devm_qce_dma_request(struct device *dev, struct qce_dma_data *dma);
--
2.47.3
^ permalink raw reply related
* [PATCH v18 08/14] crypto: qce - Include algapi.h in the core.h header
From: Bartosz Golaszewski @ 2026-05-22 13:40 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
The header defines a struct embedding struct crypto_queue whose size
needs to be known and which is defined in crypto/algapi.h. Move the
inclusion from core.c to core.h.
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/crypto/qce/core.c | 1 -
drivers/crypto/qce/core.h | 1 +
2 files changed, 1 insertion(+), 1 deletion(-)
diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c
index f671946cf7351cd5f0c319909bafd87e3af701c7..ad37c2b8ae53a373bb248aff06c3b7946e8439a8 100644
--- a/drivers/crypto/qce/core.c
+++ b/drivers/crypto/qce/core.c
@@ -13,7 +13,6 @@
#include <linux/mod_devicetable.h>
#include <linux/platform_device.h>
#include <linux/types.h>
-#include <crypto/algapi.h>
#include <crypto/internal/hash.h>
#include "core.h"
diff --git a/drivers/crypto/qce/core.h b/drivers/crypto/qce/core.h
index eb6fa7a8b64a81daf9ad5304a3ae4e5e597a70b8..f092ce2d3b04a936a37805c20ac5ba78d8fdd2df 100644
--- a/drivers/crypto/qce/core.h
+++ b/drivers/crypto/qce/core.h
@@ -8,6 +8,7 @@
#include <linux/mutex.h>
#include <linux/workqueue.h>
+#include <crypto/algapi.h>
#include "dma.h"
--
2.47.3
^ permalink raw reply related
* [PATCH v18 07/14] crypto: qce - Cancel work on device detach
From: Bartosz Golaszewski @ 2026-05-22 13:40 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
The workqueue is setup in probe() but never cancelled on error or in
remove(). Set up a devres action to clean it up. We need to move the
initialization earlier as we don't want to cancel the work before any
outstanding DMA transfer is terminated.
Fixes: eb7986e5e14d ("crypto: qce - convert tasklet to workqueue")
Closes: https://sashiko.dev/#/patchset/20260427-qcom-qce-cmd-descr-v16-0-945fd1cafbbc%40oss.qualcomm.com?part=7
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/crypto/qce/core.c | 13 ++++++++++++-
1 file changed, 12 insertions(+), 1 deletion(-)
diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c
index b966f3365b7de8d2a8f6707397a34aa4facdc4ac..f671946cf7351cd5f0c319909bafd87e3af701c7 100644
--- a/drivers/crypto/qce/core.c
+++ b/drivers/crypto/qce/core.c
@@ -186,6 +186,13 @@ static int qce_check_version(struct qce_device *qce)
return 0;
}
+static void qce_cancel_work(void *data)
+{
+ struct work_struct *work = data;
+
+ cancel_work_sync(work);
+}
+
static int qce_crypto_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
@@ -227,6 +234,11 @@ static int qce_crypto_probe(struct platform_device *pdev)
if (ret)
return ret;
+ INIT_WORK(&qce->done_work, qce_req_done_work);
+ ret = devm_add_action_or_reset(dev, qce_cancel_work, &qce->done_work);
+ if (ret)
+ return ret;
+
ret = devm_qce_dma_request(qce->dev, &qce->dma);
if (ret)
return ret;
@@ -239,7 +251,6 @@ static int qce_crypto_probe(struct platform_device *pdev)
if (ret)
return ret;
- INIT_WORK(&qce->done_work, qce_req_done_work);
crypto_init_queue(&qce->queue, QCE_QUEUE_LENGTH);
qce->async_req_enqueue = qce_async_request_enqueue;
--
2.47.3
^ permalink raw reply related
* [PATCH v18 06/14] dmaengine: qcom: bam_dma: add support for BAM locking
From: Bartosz Golaszewski @ 2026-05-22 13:39 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
Add support for BAM pipe locking. To that end: when starting DMA on an RX
channel - prepend the existing queue of issued descriptors with an
additional "dummy" command descriptor with the LOCK bit set. Once the
transaction is done (no more issued descriptors), issue one more dummy
descriptor with the UNLOCK bit.
We *must* wait until the transaction is signalled as done because we
must not perform any writes into config registers while the engine is
busy.
The dummy writes must be issued into a scratchpad register of the client
so provide a mechanism to communicate the right address via descriptor
metadata.
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/dma/qcom/bam_dma.c | 155 ++++++++++++++++++++++++++++++++++++++-
include/linux/dma/qcom_bam_dma.h | 14 ++++
2 files changed, 165 insertions(+), 4 deletions(-)
diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index 04fe1d546be73f074c66c4a5712ad65717e10929..40b5d5c24067af562e9776416e126128e29a368f 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -28,11 +28,13 @@
#include <linux/clk.h>
#include <linux/device.h>
#include <linux/dma-mapping.h>
+#include <linux/dma/qcom_bam_dma.h>
#include <linux/dmaengine.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
#include <linux/kernel.h>
+#include <linux/lockdep.h>
#include <linux/module.h>
#include <linux/of_address.h>
#include <linux/of_dma.h>
@@ -60,6 +62,8 @@ struct bam_desc_hw {
#define DESC_FLAG_EOB BIT(13)
#define DESC_FLAG_NWD BIT(12)
#define DESC_FLAG_CMD BIT(11)
+#define DESC_FLAG_LOCK BIT(10)
+#define DESC_FLAG_UNLOCK BIT(9)
struct bam_async_desc {
struct virt_dma_desc vd;
@@ -72,6 +76,10 @@ struct bam_async_desc {
struct bam_desc_hw *curr_desc;
+ /* BAM locking infrastructure */
+ struct scatterlist lock_sg;
+ struct bam_cmd_element lock_ce;
+
/* list node for the desc in the bam_chan list of descriptors */
struct list_head desc_node;
enum dma_transfer_direction dir;
@@ -391,6 +399,10 @@ struct bam_chan {
struct list_head desc_list;
struct list_head node;
+
+ /* BAM locking infrastructure */
+ phys_addr_t scratchpad_addr;
+ enum dma_transfer_direction direction;
};
static inline struct bam_chan *to_bam_chan(struct dma_chan *common)
@@ -652,6 +664,35 @@ static int bam_slave_config(struct dma_chan *chan,
return 0;
}
+static int bam_metadata_attach(struct dma_async_tx_descriptor *desc, void *data, size_t len)
+{
+ struct bam_chan *bchan = to_bam_chan(desc->chan);
+ const struct bam_device_data *bdata = bchan->bdev->dev_data;
+ struct bam_desc_metadata *metadata = data;
+
+ if (!data)
+ return -EINVAL;
+
+ if (!bdata->pipe_lock_supported)
+ /*
+ * The client wants to use locking but this BAM version doesn't
+ * support it. Don't return an error here as this will stop the
+ * client from using DMA at all for no reason.
+ */
+ return 0;
+
+ guard(spinlock_irqsave)(&bchan->vc.lock);
+
+ bchan->scratchpad_addr = metadata->scratchpad_addr;
+ bchan->direction = metadata->direction;
+
+ return 0;
+}
+
+static const struct dma_descriptor_metadata_ops bam_metadata_ops = {
+ .attach = bam_metadata_attach,
+};
+
/**
* bam_prep_slave_sg - Prep slave sg transaction
*
@@ -668,6 +709,7 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
void *context)
{
struct bam_chan *bchan = to_bam_chan(chan);
+ struct dma_async_tx_descriptor *tx_desc;
struct bam_device *bdev = bchan->bdev;
struct bam_async_desc *async_desc;
struct scatterlist *sg;
@@ -723,7 +765,12 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
} while (remainder > 0);
}
- return vchan_tx_prep(&bchan->vc, &async_desc->vd, flags);
+ tx_desc = vchan_tx_prep(&bchan->vc, &async_desc->vd, flags);
+ if (!tx_desc)
+ return NULL;
+
+ tx_desc->metadata_ops = &bam_metadata_ops;
+ return tx_desc;
}
/**
@@ -1012,13 +1059,105 @@ static void bam_apply_new_config(struct bam_chan *bchan,
bchan->reconfigure = 0;
}
+static struct bam_async_desc *
+bam_make_lock_desc(struct bam_chan *bchan, unsigned long flag)
+{
+ struct dma_chan *chan = &bchan->vc.chan;
+ struct bam_async_desc *async_desc;
+ struct bam_desc_hw *desc;
+ struct virt_dma_desc *vd;
+ struct virt_dma_chan *vc;
+ unsigned int mapped;
+ dma_cookie_t cookie;
+ int ret;
+
+ async_desc = kzalloc_flex(*async_desc, desc, 1, GFP_NOWAIT);
+ if (!async_desc) {
+ dev_err(bchan->bdev->dev, "failed to allocate the BAM lock descriptor\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ sg_init_table(&async_desc->lock_sg, 1);
+
+ async_desc->num_desc = 1;
+ async_desc->curr_desc = async_desc->desc;
+ async_desc->dir = DMA_MEM_TO_DEV;
+
+ desc = async_desc->desc;
+
+ bam_prep_ce_le32(&async_desc->lock_ce, bchan->scratchpad_addr, BAM_WRITE_COMMAND, 0);
+ sg_set_buf(&async_desc->lock_sg, &async_desc->lock_ce, sizeof(async_desc->lock_ce));
+
+ mapped = dma_map_sg(chan->slave, &async_desc->lock_sg, 1, DMA_TO_DEVICE);
+ if (!mapped) {
+ kfree(async_desc);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ desc->flags |= cpu_to_le16(DESC_FLAG_CMD | flag);
+ desc->addr = sg_dma_address(&async_desc->lock_sg);
+ desc->size = cpu_to_le16(sizeof(struct bam_cmd_element));
+
+ vc = &bchan->vc;
+ vd = &async_desc->vd;
+
+ dma_async_tx_descriptor_init(&vd->tx, &vc->chan);
+ vd->tx.flags = DMA_PREP_CMD;
+ vd->tx.desc_free = vchan_tx_desc_free;
+ vd->tx_result.result = DMA_TRANS_NOERROR;
+ vd->tx_result.residue = 0;
+
+ cookie = dma_cookie_assign(&vd->tx);
+ ret = dma_submit_error(cookie);
+ if (ret) {
+ dma_unmap_sg(chan->slave, &async_desc->lock_sg, 1, DMA_TO_DEVICE);
+ kfree(async_desc);
+ return ERR_PTR(ret);
+ }
+
+ return async_desc;
+}
+
+static int bam_do_setup_pipe_lock(struct bam_chan *bchan, bool lock)
+{
+ struct bam_device *bdev = bchan->bdev;
+ const struct bam_device_data *bdata = bdev->dev_data;
+ struct bam_async_desc *lock_desc;
+ unsigned long flag;
+
+ lockdep_assert_held(&bchan->vc.lock);
+
+ if (!bdata->pipe_lock_supported || !bchan->scratchpad_addr ||
+ bchan->direction != DMA_MEM_TO_DEV)
+ return 0;
+
+ flag = lock ? DESC_FLAG_LOCK : DESC_FLAG_UNLOCK;
+
+ lock_desc = bam_make_lock_desc(bchan, flag);
+ if (IS_ERR(lock_desc))
+ return PTR_ERR(lock_desc);
+
+ if (lock)
+ list_add(&lock_desc->vd.node, &bchan->vc.desc_issued);
+ else
+ list_add_tail(&lock_desc->vd.node, &bchan->vc.desc_issued);
+
+ return 0;
+}
+
+static void bam_setup_pipe_lock(struct bam_chan *bchan)
+{
+ if (bam_do_setup_pipe_lock(bchan, true) || bam_do_setup_pipe_lock(bchan, false))
+ dev_err(bchan->vc.chan.slave, "Failed to setup BAM pipe lock descriptors");
+}
+
/**
* bam_start_dma - start next transaction
* @bchan: bam dma channel
*/
static void bam_start_dma(struct bam_chan *bchan)
{
- struct virt_dma_desc *vd = vchan_next_desc(&bchan->vc);
+ struct virt_dma_desc *vd;
struct bam_device *bdev = bchan->bdev;
struct bam_async_desc *async_desc = NULL;
struct bam_desc_hw *desc;
@@ -1030,6 +1169,9 @@ static void bam_start_dma(struct bam_chan *bchan)
lockdep_assert_held(&bchan->vc.lock);
+ bam_setup_pipe_lock(bchan);
+
+ vd = vchan_next_desc(&bchan->vc);
if (!vd)
return;
@@ -1157,8 +1299,12 @@ static void bam_issue_pending(struct dma_chan *chan)
*/
static void bam_dma_free_desc(struct virt_dma_desc *vd)
{
- struct bam_async_desc *async_desc = container_of(vd,
- struct bam_async_desc, vd);
+ struct bam_async_desc *async_desc = container_of(vd, struct bam_async_desc, vd);
+ struct bam_desc_hw *desc = async_desc->desc;
+ struct dma_chan *chan = vd->tx.chan;
+
+ if (le16_to_cpu(desc->flags) & (DESC_FLAG_LOCK | DESC_FLAG_UNLOCK))
+ dma_unmap_sg(chan->slave, &async_desc->lock_sg, 1, DMA_TO_DEVICE);
kfree(async_desc);
}
@@ -1349,6 +1495,7 @@ static int bam_dma_probe(struct platform_device *pdev)
bdev->common.device_terminate_all = bam_dma_terminate_all;
bdev->common.device_issue_pending = bam_issue_pending;
bdev->common.device_tx_status = bam_tx_status;
+ bdev->common.desc_metadata_modes = DESC_METADATA_CLIENT;
bdev->common.dev = bdev->dev;
ret = dma_async_device_register(&bdev->common);
diff --git a/include/linux/dma/qcom_bam_dma.h b/include/linux/dma/qcom_bam_dma.h
index 68fc0e643b1b97fe4520d5878daa322b81f4f559..a2594264b0f58c4b2b1c85e243cad0d5669c26dc 100644
--- a/include/linux/dma/qcom_bam_dma.h
+++ b/include/linux/dma/qcom_bam_dma.h
@@ -6,6 +6,8 @@
#ifndef _QCOM_BAM_DMA_H
#define _QCOM_BAM_DMA_H
+#include <linux/dmaengine.h>
+
#include <asm/byteorder.h>
/*
@@ -34,6 +36,18 @@ enum bam_command_type {
BAM_READ_COMMAND,
};
+/**
+ * struct bam_desc_metadata - DMA descriptor metadata specific to the BAM driver.
+ *
+ * @scratchpad_addr: Physical address to use for dummy write operations when
+ * queuing command descriptors with LOCK/UNLOCK bits set.
+ * @direction: Transfer direction of this channel.
+ */
+struct bam_desc_metadata {
+ phys_addr_t scratchpad_addr;
+ enum dma_transfer_direction direction;
+};
+
/*
* prep_bam_ce_le32 - Wrapper function to prepare a single BAM command
* element with the data already in le32 format.
--
2.47.3
^ permalink raw reply related
* [PATCH v18 05/14] dmaengine: qcom: bam_dma: Add pipe_lock_supported flag support
From: Bartosz Golaszewski @ 2026-05-22 13:39 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski,
Dmitry Baryshkov
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Extend the device match data with a flag indicating whether the IP
supports the BAM lock/unlock feature. Set it to true on BAM IP versions
1.4.0 and above.
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Acked-by: Manivannan Sadhasivam <mani@kernel.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/dma/qcom/bam_dma.c | 3 +++
1 file changed, 3 insertions(+)
diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index 2129ff5261571581a2c086c13dd657dc63e16f90..04fe1d546be73f074c66c4a5712ad65717e10929 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -115,6 +115,7 @@ struct reg_offset_data {
struct bam_device_data {
const struct reg_offset_data *reg_info;
+ bool pipe_lock_supported;
};
static const struct reg_offset_data bam_v1_3_reg_info[] = {
@@ -181,6 +182,7 @@ static const struct reg_offset_data bam_v1_4_reg_info[] = {
static const struct bam_device_data bam_v1_4_data = {
.reg_info = bam_v1_4_reg_info,
+ .pipe_lock_supported = true,
};
static const struct reg_offset_data bam_v1_7_reg_info[] = {
@@ -214,6 +216,7 @@ static const struct reg_offset_data bam_v1_7_reg_info[] = {
static const struct bam_device_data bam_v1_7_data = {
.reg_info = bam_v1_7_reg_info,
+ .pipe_lock_supported = true,
};
/* BAM CTRL */
--
2.47.3
^ permalink raw reply related
* [PATCH v18 04/14] dmaengine: qcom: bam_dma: Extend the driver's device match data
From: Bartosz Golaszewski @ 2026-05-22 13:39 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
In preparation for supporting the pipe locking feature flag, extend the
amount of information we can carry in device match data: create a
separate structure and make the register information one of its fields.
This way, in subsequent patches, it will be just a matter of adding a
new field to the device data.
Reviewed-by: Dmitry Baryshkov <lumag@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/dma/qcom/bam_dma.c | 28 ++++++++++++++++++++++------
1 file changed, 22 insertions(+), 6 deletions(-)
diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index 1c62f845ac0b956e311857b93f5b504086662f45..2129ff5261571581a2c086c13dd657dc63e16f90 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -113,6 +113,10 @@ struct reg_offset_data {
unsigned int pipe_mult, evnt_mult, ee_mult;
};
+struct bam_device_data {
+ const struct reg_offset_data *reg_info;
+};
+
static const struct reg_offset_data bam_v1_3_reg_info[] = {
[BAM_CTRL] = { 0x0F80, 0x00, 0x00, 0x00 },
[BAM_REVISION] = { 0x0F84, 0x00, 0x00, 0x00 },
@@ -142,6 +146,10 @@ static const struct reg_offset_data bam_v1_3_reg_info[] = {
[BAM_P_FIFO_SIZES] = { 0x1020, 0x00, 0x40, 0x00 },
};
+static const struct bam_device_data bam_v1_3_data = {
+ .reg_info = bam_v1_3_reg_info,
+};
+
static const struct reg_offset_data bam_v1_4_reg_info[] = {
[BAM_CTRL] = { 0x0000, 0x00, 0x00, 0x00 },
[BAM_REVISION] = { 0x0004, 0x00, 0x00, 0x00 },
@@ -171,6 +179,10 @@ static const struct reg_offset_data bam_v1_4_reg_info[] = {
[BAM_P_FIFO_SIZES] = { 0x1820, 0x00, 0x1000, 0x00 },
};
+static const struct bam_device_data bam_v1_4_data = {
+ .reg_info = bam_v1_4_reg_info,
+};
+
static const struct reg_offset_data bam_v1_7_reg_info[] = {
[BAM_CTRL] = { 0x00000, 0x00, 0x00, 0x00 },
[BAM_REVISION] = { 0x01000, 0x00, 0x00, 0x00 },
@@ -200,6 +212,10 @@ static const struct reg_offset_data bam_v1_7_reg_info[] = {
[BAM_P_FIFO_SIZES] = { 0x13820, 0x00, 0x1000, 0x00 },
};
+static const struct bam_device_data bam_v1_7_data = {
+ .reg_info = bam_v1_7_reg_info,
+};
+
/* BAM CTRL */
#define BAM_SW_RST BIT(0)
#define BAM_EN BIT(1)
@@ -393,7 +409,7 @@ struct bam_device {
bool powered_remotely;
u32 active_channels;
- const struct reg_offset_data *layout;
+ const struct bam_device_data *dev_data;
struct clk *bamclk;
int irq;
@@ -411,7 +427,7 @@ struct bam_device {
static inline void __iomem *bam_addr(struct bam_device *bdev, u32 pipe,
enum bam_reg reg)
{
- const struct reg_offset_data r = bdev->layout[reg];
+ const struct reg_offset_data r = bdev->dev_data->reg_info[reg];
return bdev->regs + r.base_offset +
r.pipe_mult * pipe +
@@ -1205,9 +1221,9 @@ static void bam_channel_init(struct bam_device *bdev, struct bam_chan *bchan,
}
static const struct of_device_id bam_of_match[] = {
- { .compatible = "qcom,bam-v1.3.0", .data = &bam_v1_3_reg_info },
- { .compatible = "qcom,bam-v1.4.0", .data = &bam_v1_4_reg_info },
- { .compatible = "qcom,bam-v1.7.0", .data = &bam_v1_7_reg_info },
+ { .compatible = "qcom,bam-v1.3.0", .data = &bam_v1_3_data },
+ { .compatible = "qcom,bam-v1.4.0", .data = &bam_v1_4_data },
+ { .compatible = "qcom,bam-v1.7.0", .data = &bam_v1_7_data },
{}
};
@@ -1231,7 +1247,7 @@ static int bam_dma_probe(struct platform_device *pdev)
return -ENODEV;
}
- bdev->layout = match->data;
+ bdev->dev_data = match->data;
bdev->regs = devm_platform_ioremap_resource(pdev, 0);
if (IS_ERR(bdev->regs))
--
2.47.3
^ permalink raw reply related
* [PATCH v18 03/14] dmaengine: qcom: bam_dma: convert tasklet to a BH workqueue
From: Bartosz Golaszewski @ 2026-05-22 13:39 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski,
Dmitry Baryshkov
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
BH workqueues are a modern mechanism, aiming to replace legacy tasklets.
Let's convert the BAM DMA driver to using the high-priority variant of
the BH workqueue.
[Vinod: suggested using the BG workqueue instead of the regular one
running in process context]
Suggested-by: Vinod Koul <vkoul@kernel.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Reviewed-by: Bjorn Andersson <andersson@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/dma/qcom/bam_dma.c | 32 ++++++++++++++++----------------
1 file changed, 16 insertions(+), 16 deletions(-)
diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index b3d36ea79984385fe0d05ce56042d3e6e3030c5a..1c62f845ac0b956e311857b93f5b504086662f45 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -42,6 +42,7 @@
#include <linux/pm_runtime.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
+#include <linux/workqueue.h>
#include "../dmaengine.h"
#include "../virt-dma.h"
@@ -397,8 +398,8 @@ struct bam_device {
struct clk *bamclk;
int irq;
- /* dma start transaction tasklet */
- struct tasklet_struct task;
+ /* dma start transaction workqueue */
+ struct work_struct work;
};
/**
@@ -863,7 +864,7 @@ static u32 process_channel_irqs(struct bam_device *bdev)
/*
* if complete, process cookie. Otherwise
* push back to front of desc_issued so that
- * it gets restarted by the tasklet
+ * it gets restarted by the work queue.
*/
if (!async_desc->num_desc) {
vchan_cookie_complete(&async_desc->vd);
@@ -893,9 +894,9 @@ static irqreturn_t bam_dma_irq(int irq, void *data)
srcs |= process_channel_irqs(bdev);
- /* kick off tasklet to start next dma transfer */
+ /* kick off the work queue to start next dma transfer */
if (srcs & P_IRQ)
- tasklet_schedule(&bdev->task);
+ queue_work(system_bh_highpri_wq, &bdev->work);
ret = pm_runtime_get_sync(bdev->dev);
if (ret < 0)
@@ -1091,14 +1092,14 @@ static void bam_start_dma(struct bam_chan *bchan)
}
/**
- * dma_tasklet - DMA IRQ tasklet
- * @t: tasklet argument (bam controller structure)
+ * bam_dma_work() - DMA interrupt work queue callback
+ * @work: work queue struct embedded in the BAM controller device struct
*
* Sets up next DMA operation and then processes all completed transactions
*/
-static void dma_tasklet(struct tasklet_struct *t)
+static void bam_dma_work(struct work_struct *work)
{
- struct bam_device *bdev = from_tasklet(bdev, t, task);
+ struct bam_device *bdev = from_work(bdev, work, work);
struct bam_chan *bchan;
unsigned int i;
@@ -1111,14 +1112,13 @@ static void dma_tasklet(struct tasklet_struct *t)
if (!list_empty(&bchan->vc.desc_issued) && !IS_BUSY(bchan))
bam_start_dma(bchan);
}
-
}
/**
* bam_issue_pending - starts pending transactions
* @chan: dma channel
*
- * Calls tasklet directly which in turn starts any pending transactions
+ * Calls work queue directly which in turn starts any pending transactions
*/
static void bam_issue_pending(struct dma_chan *chan)
{
@@ -1286,14 +1286,14 @@ static int bam_dma_probe(struct platform_device *pdev)
if (ret)
goto err_disable_clk;
- tasklet_setup(&bdev->task, dma_tasklet);
+ INIT_WORK(&bdev->work, bam_dma_work);
bdev->channels = devm_kcalloc(bdev->dev, bdev->num_channels,
sizeof(*bdev->channels), GFP_KERNEL);
if (!bdev->channels) {
ret = -ENOMEM;
- goto err_tasklet_kill;
+ goto err_workqueue_cancel;
}
/* allocate and initialize channels */
@@ -1359,8 +1359,8 @@ static int bam_dma_probe(struct platform_device *pdev)
err_bam_channel_exit:
for (i = 0; i < bdev->num_channels; i++)
tasklet_kill(&bdev->channels[i].vc.task);
-err_tasklet_kill:
- tasklet_kill(&bdev->task);
+err_workqueue_cancel:
+ cancel_work_sync(&bdev->work);
err_disable_clk:
clk_disable_unprepare(bdev->bamclk);
@@ -1394,7 +1394,7 @@ static void bam_dma_remove(struct platform_device *pdev)
bdev->channels[i].fifo_phys);
}
- tasklet_kill(&bdev->task);
+ cancel_work_sync(&bdev->work);
clk_disable_unprepare(bdev->bamclk);
}
--
2.47.3
^ permalink raw reply related
* [PATCH v18 02/14] dmaengine: qcom: bam_dma: free interrupt before the clock in error path
From: Bartosz Golaszewski @ 2026-05-22 13:39 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
The BAM interrupt is requested with a devres helper and so on error it's
freed after probe() returns. We disable the clock before freeing or
masking it so it may still fire and we may end up reading BAM registers
with clock disabled.
Stop using devres for interrupts as we free it in remove() manually
anyway. Add an appropriate label and free the interrupt before disabling
the clock in error path and in remove().
Fixes: e7c0fe2a5c84 ("dmaengine: add Qualcomm BAM dma driver")
Closes: https://sashiko.dev/#/patchset/20260427-qcom-qce-cmd-descr-v16-0-945fd1cafbbc%40oss.qualcomm.com?part=2
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/dma/qcom/bam_dma.c | 11 ++++++-----
1 file changed, 6 insertions(+), 5 deletions(-)
diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index 19116295f8325767a0d97a7848077885b118241c..b3d36ea79984385fe0d05ce56042d3e6e3030c5a 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -1302,8 +1302,7 @@ static int bam_dma_probe(struct platform_device *pdev)
for (i = 0; i < bdev->num_channels; i++)
bam_channel_init(bdev, &bdev->channels[i], i);
- ret = devm_request_irq(bdev->dev, bdev->irq, bam_dma_irq,
- IRQF_TRIGGER_HIGH, "bam_dma", bdev);
+ ret = request_irq(bdev->irq, bam_dma_irq, IRQF_TRIGGER_HIGH, "bam_dma", bdev);
if (ret)
goto err_bam_channel_exit;
@@ -1336,7 +1335,7 @@ static int bam_dma_probe(struct platform_device *pdev)
ret = dma_async_device_register(&bdev->common);
if (ret) {
dev_err(bdev->dev, "failed to register dma async device\n");
- goto err_bam_channel_exit;
+ goto err_free_irq;
}
ret = of_dma_controller_register(pdev->dev.of_node, bam_dma_xlate,
@@ -1355,6 +1354,8 @@ static int bam_dma_probe(struct platform_device *pdev)
err_unregister_dma:
dma_async_device_unregister(&bdev->common);
+err_free_irq:
+ free_irq(bdev->irq, bdev);
err_bam_channel_exit:
for (i = 0; i < bdev->num_channels; i++)
tasklet_kill(&bdev->channels[i].vc.task);
@@ -1371,6 +1372,8 @@ static void bam_dma_remove(struct platform_device *pdev)
struct bam_device *bdev = platform_get_drvdata(pdev);
u32 i;
+ free_irq(bdev->irq, bdev);
+
pm_runtime_force_suspend(&pdev->dev);
of_dma_controller_free(pdev->dev.of_node);
@@ -1379,8 +1382,6 @@ static void bam_dma_remove(struct platform_device *pdev)
/* mask all interrupts for this execution environment */
writel_relaxed(0, bam_addr(bdev, 0, BAM_IRQ_SRCS_MSK_EE));
- devm_free_irq(bdev->dev, bdev->irq, bdev);
-
for (i = 0; i < bdev->num_channels; i++) {
bam_dma_terminate_all(&bdev->channels[i].vc.chan);
tasklet_kill(&bdev->channels[i].vc.task);
--
2.47.3
^ permalink raw reply related
* [PATCH v18 01/14] dmaengine: constify struct dma_descriptor_metadata_ops
From: Bartosz Golaszewski @ 2026-05-22 13:39 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260522-qcom-qce-cmd-descr-v18-0-99103926bafc@oss.qualcomm.com>
There's no reason for the instances of this struct to be modifiable.
Constify the pointer in struct dma_async_tx_descriptor and all drivers
currently using it.
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
drivers/dma/ti/k3-udma.c | 2 +-
drivers/dma/xilinx/xilinx_dma.c | 2 +-
include/linux/dmaengine.h | 2 +-
3 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c
index c964ebfcf3b68d86e4bbc9b62bad2212f0ce3ee9..8a2f235b669aaf084a6f7b3e6b23d06b04768608 100644
--- a/drivers/dma/ti/k3-udma.c
+++ b/drivers/dma/ti/k3-udma.c
@@ -3408,7 +3408,7 @@ static int udma_set_metadata_len(struct dma_async_tx_descriptor *desc,
return 0;
}
-static struct dma_descriptor_metadata_ops metadata_ops = {
+static const struct dma_descriptor_metadata_ops metadata_ops = {
.attach = udma_attach_metadata,
.get_ptr = udma_get_metadata_ptr,
.set_len = udma_set_metadata_len,
diff --git a/drivers/dma/xilinx/xilinx_dma.c b/drivers/dma/xilinx/xilinx_dma.c
index 404235c1735384635597e88edc25c67c7d250647..165b11a7c776abc6a8d66d631e19da669644577d 100644
--- a/drivers/dma/xilinx/xilinx_dma.c
+++ b/drivers/dma/xilinx/xilinx_dma.c
@@ -653,7 +653,7 @@ static void *xilinx_dma_get_metadata_ptr(struct dma_async_tx_descriptor *tx,
return seg->hw.app;
}
-static struct dma_descriptor_metadata_ops xilinx_dma_metadata_ops = {
+static const struct dma_descriptor_metadata_ops xilinx_dma_metadata_ops = {
.get_ptr = xilinx_dma_get_metadata_ptr,
};
diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h
index b3d251c9734e95e1b75cf6763d4d2c3a1c6a9910..5244edb90e7e7510bf4460b6a74ee2a7f91c1ccc 100644
--- a/include/linux/dmaengine.h
+++ b/include/linux/dmaengine.h
@@ -623,7 +623,7 @@ struct dma_async_tx_descriptor {
void *callback_param;
struct dmaengine_unmap_data *unmap;
enum dma_desc_metadata_mode desc_metadata_mode;
- struct dma_descriptor_metadata_ops *metadata_ops;
+ const struct dma_descriptor_metadata_ops *metadata_ops;
#ifdef CONFIG_ASYNC_TX_ENABLE_CHANNEL_SWITCH
struct dma_async_tx_descriptor *next;
struct dma_async_tx_descriptor *parent;
--
2.47.3
^ permalink raw reply related
* [PATCH v18 00/14] crypto/dmaengine: qce: introduce BAM locking and use DMA for register I/O
From: Bartosz Golaszewski @ 2026-05-22 13:39 UTC (permalink / raw)
To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
Neil Armstrong
Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski,
Dmitry Baryshkov, Konrad Dybcio
Some more fixes for issues pointed out by sashiko.
Merging strategy: there are build-time dependencies between the crypto
and DMA patches so the best approach is for Vinod to create an immutable
branch with the DMA part pulled in by the crypto tree.
This iteration continues to build on top of v12 but uses the BAM's NWD
bit on data descriptors as suggested by Stephan. To that end, there are
some more changes like reversing the order of command and data
descriptors queuedy by the QCE driver.
Currently the QCE crypto driver accesses the crypto engine registers
directly via CPU. Trust Zone may perform crypto operations simultaneously
resulting in a race condition. To remedy that, let's introduce support
for BAM locking/unlocking to the driver. The BAM driver will now wrap
any existing issued descriptor chains with additional descriptors
performing the locking when the client starts the transaction
(dmaengine_issue_pending()). The client wanting to profit from locking
needs to switch to performing register I/O over DMA and communicate the
address to which to perform the dummy writes via a call to
dmaengine_desc_attach_metadata().
In the specific case of the BAM DMA this translates to sending command
descriptors performing dummy writes with the relevant flags set. The BAM
will then lock all other pipes not related to the current pipe group, and
keep handling the current pipe only until it sees the the unlock bit.
In order for the locking to work correctly, we also need to switch to
using DMA for all register I/O.
On top of this, the series contains some additional tweaks and
refactoring.
The goal of this is not to improve the performance but to prepare the
driver for supporting decryption into secure buffers in the future.
Tested with tcrypt.ko, kcapi and cryptsetup.
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
Changes in v18:
- Free the BAM interrupt before disabling the clock in remove() path too
- convert the size assigned to command descriptors to little endian
- don't pass DMA mapping attributes to dma_map_sg() in bam_dma when
setting up command descriptors
- Cancel the QCE workqueue *after* any outstanding DMA transfer
completes
- When mapping the scatterlist for command descriptors: use the actual
number of mapped segments for dmaengine_prep_slave_sg()
- Drop the leftover read_buf field from struct qce_device
- Unmap command descriptors only after terminating the RX transfer
- Pass the actual size of the metadata struct to
dmaengine_desc_attach_metadata(), this is not really required for our
use-case but let's do this for correctness and make sashiko happy
- Drop double assignment of bam_ce_idx in qce_clear_bam_transaction()
- Remove unused QCE_MAX_REG_READ
- Link to v17: https://patch.msgid.link/20260519-qcom-qce-cmd-descr-v17-0-53a595414b79@oss.qualcomm.com
Changes in v17:
- New patch: free the interrupt before disabling the clock in error path
in probe()
- New patch: cancel the QCE work on device detach
- Hold the channel lock when attaching the metadata
- Reorder the operations in devm_qce_dma_request() to avoid freeing
memory that may still be used by the DMA channel
- Register algorithms as the last step in QCE's probe() to avoid making
the resources available to the system before the DMA is fully set up
- Fix error paths in algo request handlers
- Don't pass dmaengine attributes to map_sg_attrs() as it expects
dma-mapping attribute flags
- Fix a dma mapping leak for command descriptors
- Rebase on top of v7.1-rc4
- Link to v16: https://patch.msgid.link/20260427-qcom-qce-cmd-descr-v16-0-945fd1cafbbc@oss.qualcomm.com
Changes in v16:
- Fix a reported race between dma_map_sg() called with spinlock taken
and the corresponding dma_unmap_sg() called without it by moving the
descriptor locking data into the descriptor struct
- Also queue the TX data descriptors before the command descriptors to
match what downstream is doing
- Tweak commit messages
- Rebase on top of v7.1-rc1
- Link to v15: https://patch.msgid.link/20260402-qcom-qce-cmd-descr-v15-0-98b5361f7ed7@oss.qualcomm.com
Changes in v15:
- Extend the descriptor metadata struct to also carry the channel's
transfer direction and stop using dmaengine_slave_config() for that
- Link to v14: https://patch.msgid.link/20260323-qcom-qce-cmd-descr-v14-0-f323af411274@oss.qualcomm.com
Changes in v14:
- Don't return an error to a client which wants to use locking on BAM
that doesn't support it
- Add a comment describing the DMA descriptor metadata structure
- Fix memory leaks
- Remove leftovers from previous iterations
- Propagate errors from dma_cookie_assign() when setting up lock
descriptors
- Link to v13: https://patch.msgid.link/20260317-qcom-qce-cmd-descr-v13-0-0968eb4f8c40@oss.qualcomm.com
Changes in v13:
- As part of the DMA changes in the QCE driver: reverse the order of
queueing the descriptors in the QCE driver: queue command descriptors
with all the register writes first, followed by all the data descriptors,
this is in line with the recommandations from the BAM HPG
- Set the NWD (notify-when-done) bit (DMA_PREP_FENCE in dmaengine
parlance) on the data descriptors to ensure that the UNLOCK descriptor
will not be processed until after they have been processed by the
engine. While technically the NWD bit is only needed on the final data
descriptor, it's hard to tell which one *will* be the last from the
driver's point-of-view and both the downstream driver as well as
the Qualcomm TZ against which we want to synchronize sets NWD on every
data descriptor,
- Revert to creating the LOCK/UNLOCK command descriptor pair in one
place now that the NWD bit is in place,
- Link to v12: https://patch.msgid.link/20260310-qcom-qce-cmd-descr-v12-0-398f37f26ef0@oss.qualcomm.com
Changes in v12:
- Wait until the transaction is done before queueing the UNLOCK command
descriptor
- Use descriptor metadata for communicating the scratchpad address to
the BAM driver
- To that end: reverse the order of the series (first BAM, then QCE) to
maintain bisectability
- Unmap buffers used for dummy writes after the transaction
- Link to v11: https://patch.msgid.link/20260302-qcom-qce-cmd-descr-v11-0-4bf1f5db4802@oss.qualcomm.com
Changes in v11:
- Use new approach, not requiring the client to be involved in locking.
- Add a patch constifying dma_descriptor_metadata_ops
- Rebase on top of v7.0-rc1
- Link to v10: https://lore.kernel.org/r/20251219-qcom-qce-cmd-descr-v10-0-ff7e4bf7dad4@oss.qualcomm.com
Changes in v10:
- Move DESC_FLAG_(UN)LOCK BIT definitions from patch 2 to 3
- Add a patch constifying the dma engine metadata as the first in the
series
- Use the VERSION register for dummy lock/unlock writes
- Link to v9: https://lore.kernel.org/r/20251128-qcom-qce-cmd-descr-v9-0-9a5f72b89722@linaro.org
Changes in v9:
- Drop the global, generic LOCK/UNLOCK flags and instead use DMA
descriptor metadata ops to pass BAM-specific information from the QCE
to the DMA engine
- Link to v8: https://lore.kernel.org/r/20251106-qcom-qce-cmd-descr-v8-0-ecddca23ca26@linaro.org
Changes in v8:
- Rework the command descriptor logic and drop a lot of unneeded code
- Use the physical address for BAM command descriptor access, not the
mapped DMA address
- Fix the problems with iommu faults on newer platforms
- Generalize the LOCK/UNLOCK flags in dmaengine and reword the docs and
commit messages
- Make the BAM locking logic stricter in the DMA engine driver
- Add some additional minor QCE driver refactoring changes to the series
- Lots of small reworks and tweaks to rebase on current mainline and fix
previous issues
- Link to v7: https://lore.kernel.org/all/20250311-qce-cmd-descr-v7-0-db613f5d9c9f@linaro.org/
Changes in v7:
- remove unused code: writing to multiple registers was not used in v6,
neither were the functions for reading registers over BAM DMA-
- remove
- don't read the SW_VERSION register needlessly in the BAM driver,
instead: encode the information on whether the IP supports BAM locking
in device match data
- shrink code where possible with logic modifications (for instance:
change the implementation of qce_write() instead of replacing it
everywhere with a new symbol)
- remove duplicated error messages
- rework commit messages
- a lot of shuffling code around for easier review and a more
streamlined series
- Link to v6: https://lore.kernel.org/all/20250115103004.3350561-1-quic_mdalam@quicinc.com/
Changes in v6:
- change "BAM" to "DMA"
- Ensured this series is compilable with the current Linux-next tip of
the tree (TOT).
Changes in v5:
- Added DMA_PREP_LOCK and DMA_PREP_UNLOCK flag support in separate patch
- Removed DMA_PREP_LOCK & DMA_PREP_UNLOCK flag
- Added FIELD_GET and GENMASK macro to extract major and minor version
Changes in v4:
- Added feature description and test hardware
with test command
- Fixed patch version numbering
- Dropped dt-binding patch
- Dropped device tree changes
- Added BAM_SW_VERSION register read
- Handled the error path for the api dma_map_resource()
in probe
- updated the commit messages for batter redability
- Squash the change where qce_bam_acquire_lock() and
qce_bam_release_lock() api got introduce to the change where
the lock/unlock flag get introced
- changed cover letter subject heading to
"dmaengine: qcom: bam_dma: add cmd descriptor support"
- Added the very initial post for BAM lock/unlock patch link
as v1 to track this feature
Changes in v3:
- https://lore.kernel.org/lkml/183d4f5e-e00a-8ef6-a589-f5704bc83d4a@quicinc.com/
- Addressed all the comments from v2
- Added the dt-binding
- Fix alignment issue
- Removed type casting from qce_write_reg_dma()
and qce_read_reg_dma()
- Removed qce_bam_txn = dma->qce_bam_txn; line from
qce_alloc_bam_txn() api and directly returning
dma->qce_bam_txn
Changes in v2:
- https://lore.kernel.org/lkml/20231214114239.2635325-1-quic_mdalam@quicinc.com/
- Initial set of patches for cmd descriptor support
- Add client driver to use BAM lock/unlock feature
- Added register read/write via BAM in QCE Crypto driver
to use BAM lock/unlock feature
---
Bartosz Golaszewski (14):
dmaengine: constify struct dma_descriptor_metadata_ops
dmaengine: qcom: bam_dma: free interrupt before the clock in error path
dmaengine: qcom: bam_dma: convert tasklet to a BH workqueue
dmaengine: qcom: bam_dma: Extend the driver's device match data
dmaengine: qcom: bam_dma: Add pipe_lock_supported flag support
dmaengine: qcom: bam_dma: add support for BAM locking
crypto: qce - Cancel work on device detach
crypto: qce - Include algapi.h in the core.h header
crypto: qce - Remove unused ignore_buf
crypto: qce - Simplify arguments of devm_qce_dma_request()
crypto: qce - Use existing devres APIs in devm_qce_dma_request()
crypto: qce - Map crypto memory for DMA
crypto: qce - Add BAM DMA support for crypto register I/O
crypto: qce - Communicate the base physical address to the dmaengine
drivers/crypto/qce/aead.c | 10 +-
drivers/crypto/qce/common.c | 20 ++--
drivers/crypto/qce/core.c | 39 ++++++-
drivers/crypto/qce/core.h | 7 ++
drivers/crypto/qce/dma.c | 165 ++++++++++++++++++++++------
drivers/crypto/qce/dma.h | 11 +-
drivers/crypto/qce/sha.c | 10 +-
drivers/crypto/qce/skcipher.c | 10 +-
drivers/dma/qcom/bam_dma.c | 229 +++++++++++++++++++++++++++++++++------
drivers/dma/ti/k3-udma.c | 2 +-
drivers/dma/xilinx/xilinx_dma.c | 2 +-
include/linux/dma/qcom_bam_dma.h | 14 +++
include/linux/dmaengine.h | 2 +-
13 files changed, 422 insertions(+), 99 deletions(-)
---
base-commit: b4a253871ac29e454a62b6746b0385d52cfe7b24
change-id: 20251103-qcom-qce-cmd-descr-c5e9b11fe609
Best regards,
--
Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
^ permalink raw reply
* [PATCH v3 16/16] ioctl_userfaultfd.2: Add read-write protect mode docs
From: Kiryl Shutsemau @ 2026-05-22 13:38 UTC (permalink / raw)
To: akpm, rppt, peterx, david
Cc: ljs, surenb, vbabka, Liam.Howlett, ziy, corbet, skhan, seanjc,
pbonzini, jthoughton, aarcange, sj, usama.arif, linux-mm,
linux-kernel, linux-doc, linux-kselftest, kvm, kernel-team,
linux-man, alx, Kiryl Shutsemau (Meta)
In-Reply-To: <20260522133857.552279-1-kirill@shutemov.name>
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Userfaultfd read-write protection (UFFDIO_REGISTER_MODE_RWP) is
supported starting from Linux 7.2. It traps every access -- read or
write -- to a present page within a registered range. The new UAPI
documented here:
- UFFD_FEATURE_RWP / UFFD_FEATURE_RWP_ASYNC capability bits
- UFFDIO_REGISTER_MODE_RWP registration-mode bit
- 1 << _UFFDIO_RWPROTECT / _UFFDIO_SET_MODE available-ioctls bits
- UFFDIO_RWPROTECT install / remove RWP
- UFFDIO_SET_MODE runtime sync/async toggle
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
---
man2/ioctl_userfaultfd.2 | 209 ++++++++++++++++++++++++++++++++++++++-
1 file changed, 208 insertions(+), 1 deletion(-)
diff --git a/man2/ioctl_userfaultfd.2 b/man2/ioctl_userfaultfd.2
index 504f61d4b0cd..0a24a77ca32b 100644
--- a/man2/ioctl_userfaultfd.2
+++ b/man2/ioctl_userfaultfd.2
@@ -25,7 +25,7 @@
.\" %%%LICENSE_END
.\"
.\"
-.TH IOCTL_USERFAULTFD 2 2021-03-22 "Linux" "Linux Programmer's Manual"
+.TH IOCTL_USERFAULTFD 2 2026-05-22 "Linux" "Linux Programmer's Manual"
.SH NAME
ioctl_userfaultfd \- create a file descriptor for handling page faults in user
space
@@ -214,6 +214,33 @@ memory accesses to the regions registered with userfaultfd.
If this feature bit is set,
.I uffd_msg.pagefault.feat.ptid
will be set to the faulted thread ID for each page-fault message.
+.TP
+.BR UFFD_FEATURE_RWP " (since Linux 7.2)"
+If this feature bit is set,
+the kernel supports read-write protection tracking, and the
+.B UFFDIO_REGISTER_MODE_RWP
+registration mode and the
+.B UFFDIO_RWPROTECT
+ioctl described below become available.
+On kernels or architectures that cannot support this mode, the bit is
+masked out from
+.I uffdio_api.features
+on return; callers should inspect the returned features and fall back
+to another tracking mechanism when the bit is absent.
+.TP
+.BR UFFD_FEATURE_RWP_ASYNC " (since Linux 7.2)"
+If this feature bit is set,
+the kernel will resolve read-write protect faults in place without
+delivering a notification, automatically restoring page permissions and
+letting the faulted thread continue.
+This bit requires
+.B UFFD_FEATURE_RWP
+to be set in the same
+.B UFFDIO_API
+call.
+The async mode can also be toggled at runtime using the
+.B UFFDIO_SET_MODE
+ioctl described below.
.PP
The returned
.I ioctls
@@ -240,6 +267,21 @@ operation is supported.
The
.B UFFDIO_WRITEPROTECT
operation is supported.
+.TP
+.BR "1 << _UFFDIO_RWPROTECT" " (since Linux 7.2)"
+The
+.B UFFDIO_RWPROTECT
+operation is supported.
+This bit is reported only when
+.B UFFD_FEATURE_RWP
+was negotiated successfully.
+.TP
+.BR "1 << _UFFDIO_SET_MODE" " (since Linux 7.2)"
+The
+.B UFFDIO_SET_MODE
+operation is supported.
+This is a file-descriptor-level ioctl and is reported once per
+userfaultfd, independent of any registered range.
.PP
This
.BR ioctl (2)
@@ -327,6 +369,16 @@ Track page faults on missing pages.
.TP
.B UFFDIO_REGISTER_MODE_WP
Track page faults on write-protected pages.
+.TP
+.BR UFFDIO_REGISTER_MODE_RWP " (since Linux 7.2)"
+Track page faults on read-write-protected pages.
+Every access (read or write) to a present page within the registered
+range generates a notification once the range has been protected with
+.BR UFFDIO_RWPROTECT .
+This mode cannot be combined with
+.BR UFFDIO_REGISTER_MODE_WP ;
+attempting to do so returns
+.BR EINVAL .
.PP
If the operation is successful, the kernel modifies the
.I ioctls
@@ -735,6 +787,161 @@ or not registered with userfaultfd write-protect mode.
.TP
.B EFAULT
Encountered a generic fault during processing.
+.SS UFFDIO_RWPROTECT (Since Linux 7.2)
+Read-write-protect or un-protect a userfaultfd-registered memory range
+registered with mode
+.BR UFFDIO_REGISTER_MODE_RWP .
+.PP
+The
+.I argp
+argument is a pointer to a
+.I uffdio_rwprotect
+structure as shown below:
+.PP
+.in +4n
+.EX
+struct uffdio_rwprotect {
+ struct uffdio_range range; /* Range to change RWP on */
+ __u64 mode; /* Mode flags */
+};
+.EE
+.in
+.PP
+The following mode bits are supported:
+.TP
+.B UFFDIO_RWPROTECT_MODE_RWP
+When this mode bit is set,
+the ioctl installs read-write protection on every present page in the
+range specified by
+.IR range .
+Otherwise the ioctl removes read-write protection from the range, which
+is also how a faulted handler resolves an
+.B UFFD_PAGEFAULT_FLAG_RWP
+notification.
+.TP
+.B UFFDIO_RWPROTECT_MODE_DONTWAKE
+When this mode bit is set,
+do not wake up any thread that waits for page-fault resolution after
+the operation.
+This can be specified only if
+.B UFFDIO_RWPROTECT_MODE_RWP
+is not specified.
+.PP
+Read-write protection only affects pages that are currently populated
+in the range; unmapped addresses are left untouched.
+Protection is preserved across page reclaim and migration; callers must
+re-arm a range with
+.B UFFDIO_RWPROTECT
+after any operation that drops the underlying page
+.RB ( "MADV_DONTNEED " "on anonymous memory, hole-punch on shmem,"
+truncation of a file mapping).
+.PP
+This
+.BR ioctl (2)
+operation returns 0 on success.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+Possible errors include:
+.TP
+.B EINVAL
+The
+.I start
+or the
+.I len
+field of the
+.I uffdio_range
+structure was not a multiple of the system page size; or
+.I len
+was zero; or the specified range was otherwise invalid; or an invalid
+mode bit was specified; or
+.B UFFDIO_RWPROTECT_MODE_DONTWAKE
+was specified together with
+.BR UFFDIO_RWPROTECT_MODE_RWP .
+.TP
+.B EAGAIN
+The process was interrupted; retry this call.
+.TP
+.B ENOENT
+The range specified in
+.I range
+is not valid.
+For example, the virtual address does not exist,
+or part of the range is not registered with
+.BR UFFDIO_REGISTER_MODE_RWP .
+.TP
+.B EFAULT
+Encountered a generic fault during processing.
+.\"
+.SS UFFDIO_SET_MODE (Since Linux 7.2)
+Toggle userfaultfd features that may be flipped at runtime.
+.PP
+The
+.I argp
+argument is a pointer to a
+.I uffdio_set_mode
+structure as shown below:
+.PP
+.in +4n
+.EX
+struct uffdio_set_mode {
+ __u64 enable; /* Feature bits to set */
+ __u64 disable; /* Feature bits to clear */
+};
+.EE
+.in
+.PP
+Bits set in
+.I enable
+turn the named features on; bits set in
+.I disable
+turn them off.
+The two fields must not overlap.
+Today only
+.B UFFD_FEATURE_RWP_ASYNC
+is a valid bit in either field; any other bit causes the ioctl to
+return
+.BR EINVAL .
+Enabling
+.B UFFD_FEATURE_RWP_ASYNC
+also requires
+.B UFFD_FEATURE_RWP
+to have been negotiated at
+.B UFFDIO_API
+time.
+.PP
+The toggle takes the per-process
+.I mmap_lock
+in write mode, ensuring that all in-flight fault handlers complete
+before the new mode takes effect.
+This allows a single userfaultfd to switch between lightweight async
+detection and synchronous eviction without re-registering its ranges.
+.PP
+This
+.BR ioctl (2)
+operation returns 0 on success.
+On error, \-1 is returned and
+.I errno
+is set to indicate the error.
+Possible errors include:
+.TP
+.B EINVAL
+A bit other than
+.B UFFD_FEATURE_RWP_ASYNC
+was specified in
+.I enable
+or
+.IR disable ;
+the two fields overlap; or
+.B UFFD_FEATURE_RWP_ASYNC
+was requested without
+.B UFFD_FEATURE_RWP
+having been negotiated.
+.TP
+.B EFAULT
+.I argp
+refers to an address that is outside the calling process's accessible
+address space.
.SH RETURN VALUE
See descriptions of the individual operations, above.
.SH ERRORS
--
2.51.2
^ permalink raw reply related
* [PATCH v3 15/16] userfaultfd.2: Add read-write protect mode
From: Kiryl Shutsemau @ 2026-05-22 13:38 UTC (permalink / raw)
To: akpm, rppt, peterx, david
Cc: ljs, surenb, vbabka, Liam.Howlett, ziy, corbet, skhan, seanjc,
pbonzini, jthoughton, aarcange, sj, usama.arif, linux-mm,
linux-kernel, linux-doc, linux-kselftest, kvm, kernel-team,
linux-man, alx, Kiryl Shutsemau (Meta)
In-Reply-To: <20260522133857.552279-1-kirill@shutemov.name>
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Read-write protect mode (UFFDIO_REGISTER_MODE_RWP) is supported starting
from Linux 7.2. It traps every access -- read or write -- to a present
page within a registered range. The matching UAPI consists of:
- UFFDIO_REGISTER_MODE_RWP registration-mode bit
- UFFD_FEATURE_RWP capability bit
- UFFD_FEATURE_RWP_ASYNC async (in-kernel) fault resolution
- UFFDIO_RWPROTECT install / remove RWP on a range
- UFFDIO_SET_MODE runtime sync/async toggle
- UFFD_PAGEFAULT_FLAG_RWP new pagefault.flags bit
Document the new registration-mode entry, the "Userfaultfd read-write
protect mode" section, the new pagefault flag, and a VERSIONS line.
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
---
man2/userfaultfd.2 | 147 ++++++++++++++++++++++++++++++++++++++++++++-
1 file changed, 146 insertions(+), 1 deletion(-)
diff --git a/man2/userfaultfd.2 b/man2/userfaultfd.2
index cee7c01d2512..0e702f2f4969 100644
--- a/man2/userfaultfd.2
+++ b/man2/userfaultfd.2
@@ -24,7 +24,7 @@
.\" the source, must acknowledge the copyright and authors of this work.
.\" %%%LICENSE_END
.\"
-.TH USERFAULTFD 2 2021-03-22 "Linux" "Linux Programmer's Manual"
+.TH USERFAULTFD 2 2026-05-22 "Linux" "Linux Programmer's Manual"
.SH NAME
userfaultfd \- create a file descriptor for handling page faults in user space
.SH SYNOPSIS
@@ -105,6 +105,28 @@ The faulted thread will be stopped from execution
until user-space write-unprotects the page using an
.B UFFDIO_WRITEPROTECT
ioctl.
+.TP
+.BR UFFDIO_REGISTER_MODE_RWP " (since Linux 7.2)"
+When registered with
+.B UFFDIO_REGISTER_MODE_RWP
+mode, user-space will receive a page-fault notification
+on any access \(em read or write \(em to a present page within the range.
+By default the faulted thread will be stopped from execution until
+user-space removes the protection using a
+.B UFFDIO_RWPROTECT
+ioctl;
+if
+.B UFFD_FEATURE_RWP_ASYNC
+was negotiated, the kernel restores access in place and the faulted
+thread continues without blocking.
+.IP
+.B UFFDIO_REGISTER_MODE_RWP
+and
+.B UFFDIO_REGISTER_MODE_WP
+cannot be combined on the same range; attempting to register with both
+bits set returns
+.BR EINVAL .
+See the "Userfaultfd read-write protect mode" section below.
.PP
Multiple modes can be enabled at the same time for the same memory range.
.PP
@@ -186,6 +208,21 @@ The user needs to resolve the page fault by unprotecting the faulted page and
kicking the faulted thread to continue.
For more information,
please refer to the "Userfaultfd write-protect mode" section.
+.PP
+Since Linux 7.2, userfaultfd can do read-write protection tracking, which
+traps every access (read or write) to a present page within a registered
+range.
+One should check against the feature bit
+.B UFFD_FEATURE_RWP
+before using this feature, and optionally negotiate
+.B UFFD_FEATURE_RWP_ASYNC
+to have the kernel auto-restore page permissions on fault without
+delivering a notification.
+This mode is intended for working-set tracking by VM memory managers and
+similar callers; cold pages can then be evicted using independent kernel
+interfaces.
+For more information,
+please refer to the "Userfaultfd read-write protect mode" section.
.\"
.SS Userfaultfd operation
After the userfaultfd object is created with
@@ -322,6 +359,98 @@ should have the flag
cleared upon the faulted page or range.
.PP
Write-protect mode supports only private anonymous memory.
+.SS Userfaultfd read-write protect mode (since Linux 7.2)
+Since Linux 7.2, userfaultfd supports read-write protect mode.
+Unlike write-protect mode, every access \(em read or write \(em to a
+protected present page generates a userfaultfd notification.
+It works on anonymous, shmem, and hugetlbfs mappings.
+.PP
+The user needs to first check availability of this feature using the
+.B UFFDIO_API
+ioctl against the feature bit
+.B UFFD_FEATURE_RWP
+before using this mode.
+On kernels or architectures that cannot support read-write protection,
+the bit is masked out from
+.I uffdio_api.features
+on return from
+.BR UFFDIO_API ;
+callers should inspect the returned features and fall back to another
+tracking mechanism when the bit is absent.
+.PP
+To register with userfaultfd read-write protect mode, the user needs to
+initiate the
+.B UFFDIO_REGISTER
+ioctl with mode
+.B UFFDIO_REGISTER_MODE_RWP
+set.
+.B UFFDIO_REGISTER_MODE_RWP
+cannot be combined with
+.BR UFFDIO_REGISTER_MODE_WP ;
+however it can be combined with
+.B UFFDIO_REGISTER_MODE_MISSING
+when the caller also wants notifications for fresh page populations.
+.PP
+After registration, the user can read-write-protect any existing memory
+within the range using the
+.B UFFDIO_RWPROTECT
+ioctl where
+.I uffdio_rwprotect.mode
+is set to
+.BR UFFDIO_RWPROTECT_MODE_RWP .
+Read-write protection only affects pages that are currently populated
+in the range; unpopulated addresses remain unpopulated and fall through
+to the normal missing-page path on first access.
+.PP
+Protection is preserved across page reclaim and migration; it is
+.I not
+preserved across operations that drop the underlying page
+.RB ( "MADV_DONTNEED " "on anonymous memory, hole-punch on shmem,"
+truncation of a file mapping).
+Callers must re-arm the range with
+.B UFFDIO_RWPROTECT
+after any such operation.
+.PP
+When an access fault happens against a protected page, user-space will
+receive a page-fault notification whose
+.I uffd_msg.pagefault.flags
+field has the
+.B UFFD_PAGEFAULT_FLAG_RWP
+bit set.
+.PP
+To resolve a read-write-protect page fault, the user initiates another
+.B UFFDIO_RWPROTECT
+ioctl whose
+.I uffdio_rwprotect.mode
+has the
+.B UFFDIO_RWPROTECT_MODE_RWP
+flag cleared.
+This restores the original VMA permissions on the affected pages and
+wakes any blocked threads (unless
+.B UFFDIO_RWPROTECT_MODE_DONTWAKE
+is also set).
+.PP
+If
+.B UFFD_FEATURE_RWP_ASYNC
+was negotiated alongside
+.BR UFFD_FEATURE_RWP ,
+the kernel resolves access faults in place without delivering a
+notification: page permissions are restored automatically and the
+faulting thread continues.
+Callers can later reconstruct which pages were touched by inspecting the
+.B PAGE_IS_ACCESSED
+bit returned by the
+.B PAGEMAP_SCAN
+ioctl described in
+.BR ioctl_userfaultfd (2)
+and
+.IR Documentation/admin\-guide/mm/pagemap.rst
+in the Linux kernel source.
+.PP
+The async mode can be toggled at runtime using the
+.B UFFDIO_SET_MODE
+ioctl, which lets a single userfaultfd switch between async detection
+and synchronous eviction without re-registering the range.
.SS Reading from the userfaultfd structure
Each
.BR read (2)
@@ -473,6 +602,12 @@ If the address is in a range that was registered with the
.B UFFDIO_REGISTER_MODE_WP
flag, when this bit is set, it means it is a write-protect fault.
Otherwise it is a page-missing fault.
+.TP
+.BR UFFD_PAGEFAULT_FLAG_RWP " (since Linux 7.2)"
+If the address is in a range that was registered with the
+.B UFFDIO_REGISTER_MODE_RWP
+flag, this bit indicates that the fault was triggered by an access to a
+read-write-protected page (either a read or a write).
.RE
.TP
.I pagefault.feat.pid
@@ -574,6 +709,16 @@ system call first appeared in Linux 4.3.
.PP
The support for hugetlbfs and shared memory areas and
non-page-fault events was added in Linux 4.11
+.PP
+Read-write protect mode
+.RB ( UFFDIO_REGISTER_MODE_RWP ", " UFFD_FEATURE_RWP ", "
+.BR UFFDIO_RWPROTECT )
+was added in Linux 7.2,
+together with
+.B UFFD_FEATURE_RWP_ASYNC
+and the
+.B UFFDIO_SET_MODE
+runtime mode toggle.
.SH CONFORMING TO
.BR userfaultfd ()
is Linux-specific and should not be used in programs intended to be
--
2.51.2
^ permalink raw reply related
* [PATCH v3 14/16] Documentation/userfaultfd: document RWP working set tracking
From: Kiryl Shutsemau @ 2026-05-22 13:38 UTC (permalink / raw)
To: akpm, rppt, peterx, david
Cc: ljs, surenb, vbabka, Liam.Howlett, ziy, corbet, skhan, seanjc,
pbonzini, jthoughton, aarcange, sj, usama.arif, linux-mm,
linux-kernel, linux-doc, linux-kselftest, kvm, kernel-team,
linux-man, alx, Kiryl Shutsemau (Meta)
In-Reply-To: <20260522133857.552279-1-kirill@shutemov.name>
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Add an admin-guide section covering UFFDIO_REGISTER_MODE_RWP:
- sync and async fault models;
- UFFDIO_RWPROTECT semantics;
- UFFD_FEATURE_RWP_ASYNC;
- UFFDIO_SET_MODE runtime mode flips.
It also covers typical VMM working-set-tracking workflow from detection
loop through sync-mode eviction and back to async.
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
Documentation/admin-guide/mm/userfaultfd.rst | 226 ++++++++++++++++++-
1 file changed, 220 insertions(+), 6 deletions(-)
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index 1e533639fd50..cb5d0e0c9fff 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -275,16 +275,16 @@ tracking and it can be different in a few ways:
- Dirty information will not get lost if the pte was zapped due to
various reasons (e.g. during split of a shmem transparent huge page).
- - Due to a reverted meaning of soft-dirty (page clean when uffd-wp bit
- set; dirty when uffd-wp bit cleared), it has different semantics on
- some of the memory operations. For example: ``MADV_DONTNEED`` on
+ - Due to a reverted meaning of soft-dirty (page clean when the uffd bit
+ is set; dirty when the uffd bit is cleared), it has different semantics
+ on some of the memory operations. For example: ``MADV_DONTNEED`` on
anonymous (or ``MADV_REMOVE`` on a file mapping) will be treated as
- dirtying of memory by dropping uffd-wp bit during the procedure.
+ dirtying of memory by dropping the uffd bit during the procedure.
The user app can collect the "written/dirty" status by looking up the
-uffd-wp bit for the pages being interested in /proc/pagemap.
+uffd bit for the pages being interested in /proc/pagemap.
-The page will not be under track of uffd-wp async mode until the page is
+The page will not be under track of userfaultfd-wp async mode until the page is
explicitly write-protected by ``ioctl(UFFDIO_WRITEPROTECT)`` with the mode
flag ``UFFDIO_WRITEPROTECT_MODE_WP`` set. Trying to resolve a page fault
that was tracked by async mode userfaultfd-wp is invalid.
@@ -307,6 +307,220 @@ transparent to the guest, we want that same address range to act as if it was
still poisoned, even though it's on a new physical host which ostensibly
doesn't have a memory error in the exact same spot.
+Read-Write Protection
+---------------------
+
+``UFFDIO_REGISTER_MODE_RWP`` enables read-write protection tracking on a
+memory range. It is similar to (but faster than) ``mprotect(PROT_NONE)``
+combined with a signal handler; unlike ``mprotect(PROT_NONE)``, RWP only
+traps accesses to *present* PTEs, so accesses to unpopulated addresses in a
+protected range fall through to the normal missing-page path. It uses the
+PROT_NONE hinting mechanism (same as NUMA balancing) to make pages
+inaccessible while keeping them resident in memory. Works on anonymous,
+shmem, and hugetlbfs memory.
+
+RWP is designed for VM memory managers that need to track the working set
+of guest memory for cold page eviction to tiered or remote storage.
+
+**Setup:**
+
+1. Open a userfaultfd and enable ``UFFD_FEATURE_RWP`` via ``UFFDIO_API``.
+ Optionally request ``UFFD_FEATURE_RWP_ASYNC`` as well — it requires
+ ``UFFD_FEATURE_RWP`` to be set in the same ``UFFDIO_API`` call.
+
+2. Register the guest memory range with ``UFFDIO_REGISTER_MODE_RWP``
+ (and ``UFFDIO_REGISTER_MODE_MISSING`` if evicted pages will need to be
+ fetched back from storage).
+
+**Feature availability:**
+
+RWP is built on top of two kernel primitives: a spare PTE bit owned by
+userfaultfd (``CONFIG_HAVE_ARCH_USERFAULTFD_WP``) and architecture support
+for present-but-inaccessible PTEs (``CONFIG_ARCH_HAS_PTE_PROTNONE``). When both
+are available on a 64-bit kernel, the build selects
+``CONFIG_USERFAULTFD_RWP=y`` and the ``VM_UFFD_RWP`` VMA flag becomes
+available.
+
+``UFFD_FEATURE_RWP`` and ``UFFD_FEATURE_RWP_ASYNC`` are masked out of the
+features returned by ``UFFDIO_API`` when the running kernel or architecture
+cannot support them — for example 32-bit kernels (where ``VM_UFFD_RWP`` is
+unavailable), kernels built without ``CONFIG_USERFAULTFD_RWP``, and
+architectures whose ptes cannot carry the uffd bit at runtime (e.g. riscv
+without the ``SVRSW60T59B`` extension). ``UFFDIO_API`` does not fail;
+unsupported bits are simply absent from ``uffdio_api.features`` on return.
+Callers should inspect the returned ``features`` after ``UFFDIO_API`` and
+fall back to another tracking method when RWP is unavailable.
+
+**Protecting and Unprotecting:**
+
+Use ``UFFDIO_RWPROTECT`` to protect or unprotect a range, mirroring the
+``UFFDIO_WRITEPROTECT`` interface::
+
+ struct uffdio_rwprotect rwp = {
+ .range = { .start = addr, .len = len },
+ .mode = UFFDIO_RWPROTECT_MODE_RWP, /* protect */
+ };
+ ioctl(uffd, UFFDIO_RWPROTECT, &rwp);
+
+Setting ``UFFDIO_RWPROTECT_MODE_RWP`` sets PROT_NONE on present PTEs in the
+range. Pages stay resident and their physical frames are preserved — only
+access permissions are removed.
+
+Clearing ``UFFDIO_RWPROTECT_MODE_RWP`` restores normal VMA permissions and
+wakes any faulting threads (unless ``UFFDIO_RWPROTECT_MODE_DONTWAKE`` is set).
+
+**Scope of protection:**
+
+RWP protection is a property of *present* PTEs. ``UFFDIO_RWPROTECT`` only
+affects entries that are already populated. Unpopulated addresses within
+the range remain unpopulated; when first accessed they fault through the
+normal missing path (``do_anonymous_page()``, ``do_swap_page()``,
+``finish_fault()``) and the resulting PTE is not RWP-protected. To observe
+the population itself, co-register the range with
+``UFFDIO_REGISTER_MODE_MISSING``.
+
+Protection is preserved across page reclaim: a page swapped out while
+RWP-protected carries the marker on its swap entry, and swap-in restores
+the PROT_NONE state so the first access after swap-in still faults. The
+same applies to pages temporarily replaced by migration entries.
+
+Operations that drop the PTE entirely — ``MADV_DONTNEED`` on anonymous
+memory, hole-punch on shmem, truncation of a file mapping — also drop the
+RWP marker: the next access re-populates the range without protection.
+Unlike WP (which persists via ``PTE_MARKER_UFFD_WP``), there is no
+persistent RWP marker today. The user needs to re-arm the range with
+``UFFDIO_RWPROTECT`` after any operation that explicitly frees PTEs.
+
+**Fault Handling:**
+
+When a protected page is accessed:
+
+- **Sync mode** (default): The faulting thread blocks and a
+ ``UFFD_PAGEFAULT_FLAG_RWP`` message is delivered to the userfaultfd
+ handler. The handler resolves the fault with ``UFFDIO_RWPROTECT``
+ (clearing ``MODE_RWP``), which restores the PTE permissions and wakes
+ the faulting thread.
+
+- **Async mode** (``UFFD_FEATURE_RWP_ASYNC``): The kernel automatically
+ restores PTE permissions and the thread continues without blocking. No
+ message is delivered to the handler.
+
+**Runtime Mode Switching:**
+
+``UFFDIO_SET_MODE`` toggles ``UFFD_FEATURE_RWP_ASYNC`` at runtime, allowing
+the VMM to switch between lightweight async detection and safe sync
+eviction without re-registering. The toggle takes ``mmap_write_lock()`` to
+ensure all in-flight faults complete before the mode change takes effect.
+
+**Cold Page Detection with PAGEMAP_SCAN:**
+
+RWP-protected PTEs carry the uffd PTE bit; the fault-resolution path
+clears it. ``PAGEMAP_SCAN`` reports ``PAGE_IS_ACCESSED`` once the bit is
+clear on a ``VM_UFFD_RWP`` VMA, so inverting it efficiently reports the
+still-protected (cold) pages::
+
+ struct pm_scan_arg arg = {
+ .size = sizeof(arg),
+ .start = guest_mem_start,
+ .end = guest_mem_end,
+ .vec = (uint64_t)regions,
+ .vec_len = regions_len,
+ .category_mask = PAGE_IS_ACCESSED,
+ .category_inverted = PAGE_IS_ACCESSED,
+ .return_mask = PAGE_IS_ACCESSED,
+ };
+ long n = ioctl(pagemap_fd, PAGEMAP_SCAN, &arg);
+
+The returned ``page_region`` array contains contiguous cold ranges that can
+then be evicted.
+
+**Cleanup:**
+
+When the userfaultfd is closed or the range is unregistered, all PROT_NONE
+PTEs are automatically restored to their normal VMA permissions. This
+prevents pages from becoming permanently inaccessible.
+
+**VMM Working Set Tracking Workflow:**
+
+A typical VMM lifecycle for cold page eviction to tiered storage. Two
+mappings of the same shmem (or hugetlbfs) file are used: ``guest_mem`` is
+the RWP-registered mapping that vCPUs access through, and ``io_mem`` is a
+private mapping for VMM-side I/O. Reading ``io_mem`` does not go through
+the RWP-protected PTEs of ``guest_mem``, so the VMM's own ``pwrite()``
+never traps on its own ::
+
+ /* One-time setup */
+ fd = memfd_create("guest", MFD_CLOEXEC);
+ ftruncate(fd, guest_size);
+ guest_mem = mmap(NULL, guest_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0); /* vCPU view, RWP-registered */
+ io_mem = mmap(NULL, guest_size, PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0); /* VMM I/O view, unprotected */
+
+ uffd = userfaultfd(O_CLOEXEC | O_NONBLOCK);
+ ioctl(uffd, UFFDIO_API, &(struct uffdio_api){
+ .api = UFFD_API,
+ .features = UFFD_FEATURE_RWP | UFFD_FEATURE_RWP_ASYNC,
+ });
+ ioctl(uffd, UFFDIO_REGISTER, &(struct uffdio_register){
+ .range = { guest_mem, guest_size },
+ .mode = UFFDIO_REGISTER_MODE_RWP |
+ UFFDIO_REGISTER_MODE_MISSING,
+ });
+
+ /* Tracking loop */
+ while (vm_running) {
+ /* 1. Detection phase (async — no vCPU stalls) */
+ ioctl(uffd, UFFDIO_RWPROTECT, &(struct uffdio_rwprotect){
+ .range = full_range,
+ .mode = UFFDIO_RWPROTECT_MODE_RWP });
+ sleep(tracking_interval);
+
+ /* 2. Find cold pages (uffd bit still set) */
+ ioctl(pagemap_fd, PAGEMAP_SCAN, &(struct pm_scan_arg){
+ .category_mask = PAGE_IS_ACCESSED,
+ .category_inverted = PAGE_IS_ACCESSED,
+ .return_mask = PAGE_IS_ACCESSED,
+ ...
+ });
+
+ /* 3. Switch to sync for safe eviction */
+ ioctl(uffd, UFFDIO_SET_MODE,
+ &(struct uffdio_set_mode){
+ .disable = UFFD_FEATURE_RWP_ASYNC });
+
+ /* 4. Evict cold pages (vCPU faults block on guest_mem) */
+ for each cold range:
+ /* Read from io_mem -- bypasses RWP, no fault. */
+ pwrite(storage_fd, io_mem + cold_offset, len, offset);
+ /* Drop the page from the shared file. */
+ fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+ cold_offset, len);
+ /*
+ * Wake any vCPU blocked on the RWP fault for this range:
+ * fallocate() does not iterate ctx->fault_pending_wqh.
+ */
+ ioctl(uffd, UFFDIO_WAKE, &(struct uffdio_range){
+ .start = (uintptr_t)guest_mem + cold_offset,
+ .len = len });
+
+ /* 5. Resume async tracking */
+ ioctl(uffd, UFFDIO_SET_MODE,
+ &(struct uffdio_set_mode){
+ .enable = UFFD_FEATURE_RWP_ASYNC });
+ }
+
+During step 4, a vCPU that accesses ``guest_mem + cold_offset`` blocks
+with a ``UFFD_PAGEFAULT_FLAG_RWP`` fault while the eviction is in
+progress. After ``fallocate()`` punches the page out and ``UFFDIO_WAKE``
+fires, the vCPU retries the access, faults as ``MISSING``, and the
+handler resolves it with ``UFFDIO_COPY`` from storage.
+
+This workflow targets shmem and hugetlbfs (both support a private
+``io_mem`` mapping over the same fd). Anonymous-memory backings need a
+different inner-loop strategy because the VMM has no way to read the
+page without going through the RWP-protected mapping.
+
QEMU/KVM
========
--
2.51.2
^ permalink raw reply related
* [PATCH v3 13/16] selftests/mm: add userfaultfd RWP tests
From: Kiryl Shutsemau @ 2026-05-22 13:38 UTC (permalink / raw)
To: akpm, rppt, peterx, david
Cc: ljs, surenb, vbabka, Liam.Howlett, ziy, corbet, skhan, seanjc,
pbonzini, jthoughton, aarcange, sj, usama.arif, linux-mm,
linux-kernel, linux-doc, linux-kselftest, kvm, kernel-team,
linux-man, alx, Kiryl Shutsemau (Meta)
In-Reply-To: <20260522133857.552279-1-kirill@shutemov.name>
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Coverage for UFFDIO_REGISTER_MODE_RWP and UFFDIO_RWPROTECT:
rwp-async async mode — touch pages, verify permissions are
auto-restored without a message
rwp-sync sync mode — access blocks, handler resolves via
UFFDIO_RWPROTECT
rwp-pagemap PAGEMAP_SCAN reports still-cold pages via
inverted PAGE_IS_ACCESSED
rwp-mprotect RWP survives mprotect(PROT_NONE) ->
mprotect(PROT_READ|PROT_WRITE) round-trip
rwp-gup GUP walks through a protnone RWP PTE (pipe
write/read drives the GUP path)
rwp-async-toggle UFFDIO_SET_MODE flips between sync and async
without re-registering
rwp-close closing the uffd restores page permissions
rwp-fork RWP survives fork() with EVENT_FORK; child's
PTEs keep the uffd bit
rwp-fork-pin RWP survives fork() on an RO-longterm-pinned
anon page (forces copy_present_page()); child
read auto-resolves and clears the bit, proving
PAGE_NONE was in place
rwp-wp-exclusive register with MODE_WP|MODE_RWP returns -EINVAL
All tests run against anon, shmem, shmem-private, hugetlb, and
hugetlb-private memory, except rwp-fork-pin which is anon-only —
copy_present_page() is the private-anon pinned-exclusive fork path.
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
---
tools/testing/selftests/mm/uffd-unit-tests.c | 766 +++++++++++++++++++
1 file changed, 766 insertions(+)
diff --git a/tools/testing/selftests/mm/uffd-unit-tests.c b/tools/testing/selftests/mm/uffd-unit-tests.c
index 6f5e404a446c..234d3ac0adfb 100644
--- a/tools/testing/selftests/mm/uffd-unit-tests.c
+++ b/tools/testing/selftests/mm/uffd-unit-tests.c
@@ -7,6 +7,8 @@
#include "uffd-common.h"
+#include <linux/fs.h>
+#include <sys/uio.h>
#include "../../../../mm/gup_test.h"
#ifdef __NR_userfaultfd
@@ -128,6 +130,11 @@ static void uffd_test_skip(const char *message)
*/
static int test_uffd_api(bool use_dev)
{
+ const uint64_t expected_ioctls =
+ BIT_ULL(_UFFDIO_REGISTER) |
+ BIT_ULL(_UFFDIO_UNREGISTER) |
+ BIT_ULL(_UFFDIO_API) |
+ BIT_ULL(_UFFDIO_SET_MODE);
struct uffdio_api uffdio_api;
int uffd;
@@ -167,6 +174,15 @@ static int test_uffd_api(bool use_dev)
goto out;
}
+ /* Verify returned fd-level ioctls bitmask */
+ if ((uffdio_api.ioctls & expected_ioctls) != expected_ioctls) {
+ uffd_test_fail("UFFDIO_API missing expected ioctls: "
+ "got=0x%"PRIx64", expected=0x%"PRIx64,
+ (uint64_t)uffdio_api.ioctls,
+ expected_ioctls);
+ goto out;
+ }
+
/* Test double requests of UFFDIO_API with a random feature set */
uffdio_api.features = BIT_ULL(0);
if (ioctl(uffd, UFFDIO_API, &uffdio_api) == 0) {
@@ -623,6 +639,685 @@ void uffd_minor_collapse_test(uffd_global_test_opts_t *gopts, uffd_test_args_t *
uffd_minor_test_common(gopts, true, false);
}
+static int uffd_register_rwp(int uffd, void *addr, uint64_t len)
+{
+ struct uffdio_register reg = {
+ .range = { .start = (unsigned long)addr, .len = len },
+ .mode = UFFDIO_REGISTER_MODE_RWP,
+ };
+
+ if (ioctl(uffd, UFFDIO_REGISTER, ®) == -1)
+ return -errno;
+ return 0;
+}
+
+static void rwprotect_range(int uffd, __u64 start, __u64 len, bool protect)
+{
+ struct uffdio_rwprotect rwp = {
+ .range = { .start = start, .len = len },
+ .mode = protect ? UFFDIO_RWPROTECT_MODE_RWP : 0,
+ };
+
+ if (ioctl(uffd, UFFDIO_RWPROTECT, &rwp))
+ err("UFFDIO_RWPROTECT failed");
+}
+
+static void set_async_mode(int uffd, bool enable)
+{
+ struct uffdio_set_mode mode = { };
+
+ if (enable)
+ mode.enable = UFFD_FEATURE_RWP_ASYNC;
+ else
+ mode.disable = UFFD_FEATURE_RWP_ASYNC;
+
+ if (ioctl(uffd, UFFDIO_SET_MODE, &mode))
+ err("UFFDIO_SET_MODE failed");
+}
+
+/*
+ * Test async RWP faults on anonymous memory.
+ * Populate pages, register MODE_RWP with RWP_ASYNC,
+ * RW-protect, re-access, verify content preserved and no faults delivered.
+ */
+static void uffd_rwp_async_test(uffd_global_test_opts_t *gopts,
+ uffd_test_args_t *args)
+{
+ unsigned long nr_pages = gopts->nr_pages;
+ unsigned long page_size = gopts->page_size;
+ unsigned long p;
+
+ /* Populate all pages with known content */
+ for (p = 0; p < nr_pages; p++)
+ memset(gopts->area_dst + p * page_size, p % 255 + 1, page_size);
+
+ /* Register MODE_RWP */
+ if (uffd_register_rwp(gopts->uffd, gopts->area_dst,
+ nr_pages * page_size))
+ err("register failure");
+
+ /* RW-protect all pages (sets protnone) */
+ rwprotect_range(gopts->uffd, (uint64_t)gopts->area_dst,
+ nr_pages * page_size, true);
+
+ /* Access all pages — should auto-resolve, no faults */
+ for (p = 0; p < nr_pages; p++) {
+ unsigned char *page = (unsigned char *)gopts->area_dst +
+ p * page_size;
+ unsigned char expected = p % 255 + 1;
+
+ if (page[0] != expected) {
+ uffd_test_fail("page %lu content mismatch: %u != %u",
+ p, page[0], expected);
+ return;
+ }
+ }
+
+ uffd_test_pass();
+}
+
+/*
+ * Fault handler for RWP — unprotect the page via UFFDIO_RWPROTECT.
+ */
+static void uffd_handle_rwp_fault(uffd_global_test_opts_t *gopts,
+ struct uffd_msg *msg,
+ struct uffd_args *uargs)
+{
+ if (!(msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_RWP))
+ err("expected RWP fault, got 0x%llx",
+ msg->arg.pagefault.flags);
+
+ rwprotect_range(gopts->uffd, msg->arg.pagefault.address,
+ gopts->page_size, false);
+ uargs->minor_faults++;
+}
+
+/*
+ * Test sync RWP faults on anonymous memory.
+ * Populate pages, register MODE_RWP (sync), RW-protect,
+ * access from worker thread, verify fault delivered, UFFDIO_RWPROTECT resolves.
+ */
+static void uffd_rwp_sync_test(uffd_global_test_opts_t *gopts,
+ uffd_test_args_t *args)
+{
+ unsigned long nr_pages = gopts->nr_pages;
+ unsigned long page_size = gopts->page_size;
+ pthread_t uffd_mon;
+ struct uffd_args uargs = { };
+ bool failed = false;
+ char c = '\0';
+ unsigned long p;
+
+ uargs.gopts = gopts;
+ uargs.handle_fault = uffd_handle_rwp_fault;
+
+ /* Populate all pages */
+ for (p = 0; p < nr_pages; p++)
+ memset(gopts->area_dst + p * page_size, p % 255 + 1, page_size);
+
+ /* Register MODE_RWP */
+ if (uffd_register_rwp(gopts->uffd, gopts->area_dst,
+ nr_pages * page_size))
+ err("register failure");
+
+ /* RW-protect all pages */
+ rwprotect_range(gopts->uffd, (uint64_t)gopts->area_dst,
+ nr_pages * page_size, true);
+
+ /* Start fault handler thread */
+ if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &uargs))
+ err("uffd_poll_thread create");
+
+ /* Access all pages — triggers sync RWP faults, handler unprotects */
+ for (p = 0; p < nr_pages; p++) {
+ unsigned char *page = (unsigned char *)gopts->area_dst +
+ p * page_size;
+
+ if (page[0] != (p % 255 + 1)) {
+ uffd_test_fail("page %lu content mismatch", p);
+ failed = true;
+ goto out;
+ }
+ }
+
+out:
+ /*
+ * Stop the handler before reading minor_faults: the last fault
+ * resolution rwprotect_range()s before incrementing the counter,
+ * so the main thread can race ahead of the increment.
+ */
+ if (write(gopts->pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, NULL))
+ err("join() failed");
+
+ if (failed)
+ return;
+ if (uargs.minor_faults == 0)
+ uffd_test_fail("expected RWP faults, got 0");
+ else
+ uffd_test_pass();
+}
+
+/*
+ * Test PAGEMAP_SCAN detection of RW-protected (cold) pages.
+ */
+static void uffd_rwp_pagemap_test(uffd_global_test_opts_t *gopts,
+ uffd_test_args_t *args)
+{
+ unsigned long nr_pages = gopts->nr_pages;
+ unsigned long page_size = gopts->page_size;
+ unsigned long p;
+ struct page_region regions[16];
+ struct pm_scan_arg pm_arg;
+ int pagemap_fd;
+ long ret;
+
+ /* Need at least 4 pages */
+ if (nr_pages < 4) {
+ uffd_test_skip("need at least 4 pages");
+ return;
+ }
+
+ /* Populate all pages */
+ for (p = 0; p < nr_pages; p++)
+ memset(gopts->area_dst + p * page_size, 0xab, page_size);
+
+ /* Register and RW-protect */
+ if (uffd_register_rwp(gopts->uffd, gopts->area_dst,
+ nr_pages * page_size))
+ err("register failure");
+
+ rwprotect_range(gopts->uffd, (uint64_t)gopts->area_dst,
+ nr_pages * page_size, true);
+
+ /* Touch first half of pages to re-activate them (async auto-resolve) */
+ for (p = 0; p < nr_pages / 2; p++) {
+ volatile char *page = gopts->area_dst + p * page_size;
+ (void)*page;
+ }
+
+ /* Scan for cold (still RW-protected) pages */
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ err("open pagemap");
+
+ /*
+ * PAGE_IS_ACCESSED is set once the uffd-wp bit has been cleared
+ * (access happened, or the user resolved). Invert it to select
+ * still-protected (cold) pages.
+ */
+ memset(&pm_arg, 0, sizeof(pm_arg));
+ pm_arg.size = sizeof(pm_arg);
+ pm_arg.start = (uint64_t)gopts->area_dst;
+ pm_arg.end = (uint64_t)gopts->area_dst + nr_pages * page_size;
+ pm_arg.vec = (uint64_t)regions;
+ pm_arg.vec_len = ARRAY_SIZE(regions);
+ pm_arg.category_mask = PAGE_IS_ACCESSED;
+ pm_arg.category_inverted = PAGE_IS_ACCESSED;
+ pm_arg.return_mask = PAGE_IS_ACCESSED;
+
+ ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &pm_arg);
+ close(pagemap_fd);
+
+ if (ret < 0) {
+ uffd_test_fail("PAGEMAP_SCAN failed: %s", strerror(errno));
+ return;
+ }
+
+ /*
+ * The second half of pages should be reported as RW-protected.
+ * They may be coalesced into one region.
+ */
+ if (ret < 1) {
+ uffd_test_fail("expected cold pages, got %ld regions", ret);
+ return;
+ }
+
+ /* Verify the cold region covers the second half */
+ uint64_t cold_start = regions[0].start;
+ uint64_t expected_start = (uint64_t)gopts->area_dst +
+ (nr_pages / 2) * page_size;
+
+ if (cold_start != expected_start) {
+ uffd_test_fail("cold region starts at 0x%lx, expected 0x%lx",
+ (unsigned long)cold_start,
+ (unsigned long)expected_start);
+ return;
+ }
+
+ uffd_test_pass();
+}
+
+/*
+ * Test that RWP protection survives a mprotect(PROT_NONE) ->
+ * mprotect(PROT_READ|PROT_WRITE) round-trip. The uffd-wp bit on a
+ * VM_UFFD_RWP VMA must continue to carry PROT_NONE semantics after
+ * mprotect() changes the base protection; otherwise accesses would
+ * silently succeed and the pagemap bit would stick without a fault
+ * ever clearing it.
+ */
+static void uffd_rwp_mprotect_test(uffd_global_test_opts_t *gopts,
+ uffd_test_args_t *args)
+{
+ unsigned long nr_pages = gopts->nr_pages;
+ unsigned long page_size = gopts->page_size;
+ unsigned long p;
+ struct page_region regions[16];
+ struct pm_scan_arg pm_arg;
+ int pagemap_fd;
+ long ret;
+
+ /* Populate all pages */
+ for (p = 0; p < nr_pages; p++)
+ memset(gopts->area_dst + p * page_size, 0xab, page_size);
+
+ /* Register and RW-protect the whole range */
+ if (uffd_register_rwp(gopts->uffd, gopts->area_dst,
+ nr_pages * page_size))
+ err("register failure");
+ rwprotect_range(gopts->uffd, (uint64_t)gopts->area_dst,
+ nr_pages * page_size, true);
+
+ /* Round-trip mprotect(): PROT_NONE -> PROT_READ|PROT_WRITE */
+ if (mprotect(gopts->area_dst, nr_pages * page_size, PROT_NONE))
+ err("mprotect() PROT_NONE");
+ if (mprotect(gopts->area_dst, nr_pages * page_size,
+ PROT_READ | PROT_WRITE))
+ err("mprotect() PROT_READ|PROT_WRITE");
+
+ /* Touch every page. Async RWP must auto-resolve each fault. */
+ for (p = 0; p < nr_pages; p++) {
+ volatile char *page = gopts->area_dst + p * page_size;
+ (void)*page;
+ }
+
+ /*
+ * After touching, no page should remain RW-protected. A stuck
+ * uffd-wp bit would mean mprotect() silently dropped PROT_NONE and
+ * the access never faulted.
+ */
+ pagemap_fd = open("/proc/self/pagemap", O_RDONLY);
+ if (pagemap_fd < 0)
+ err("open pagemap");
+
+ memset(&pm_arg, 0, sizeof(pm_arg));
+ pm_arg.size = sizeof(pm_arg);
+ pm_arg.start = (uint64_t)gopts->area_dst;
+ pm_arg.end = (uint64_t)gopts->area_dst + nr_pages * page_size;
+ pm_arg.vec = (uint64_t)regions;
+ pm_arg.vec_len = ARRAY_SIZE(regions);
+ pm_arg.category_mask = PAGE_IS_ACCESSED;
+ pm_arg.category_inverted = PAGE_IS_ACCESSED;
+ pm_arg.return_mask = PAGE_IS_ACCESSED;
+
+ ret = ioctl(pagemap_fd, PAGEMAP_SCAN, &pm_arg);
+ close(pagemap_fd);
+
+ if (ret < 0) {
+ uffd_test_fail("PAGEMAP_SCAN failed: %s", strerror(errno));
+ return;
+ }
+ if (ret != 0) {
+ uffd_test_fail("expected no cold pages after mprotect()+touch, got %ld regions",
+ ret);
+ return;
+ }
+
+ uffd_test_pass();
+}
+
+/*
+ * Test that GUP resolves through protnone PTEs (async mode).
+ * vmsplice() into a pipe pins user pages via get_user_pages_fast() --
+ * unlike write(), which goes through copy_from_user() and ordinary
+ * hardware page faults -- so it exercises gup_can_follow_protnone() on
+ * the RW-protected PTE. In async mode the kernel auto-restores
+ * permissions and GUP returns the page.
+ */
+static void uffd_rwp_gup_test(uffd_global_test_opts_t *gopts,
+ uffd_test_args_t *args)
+{
+ struct iovec iov;
+ char buf;
+ int pipefd[2];
+
+ /* Populate first page with known content */
+ memset(gopts->area_dst, 0xCD, gopts->page_size);
+
+ if (uffd_register_rwp(gopts->uffd, gopts->area_dst, gopts->page_size))
+ err("register failure");
+
+ rwprotect_range(gopts->uffd, (uint64_t)gopts->area_dst,
+ gopts->page_size, true);
+
+ if (pipe(pipefd))
+ err("pipe");
+
+ /*
+ * One byte's worth of iov is enough to GUP the containing page and
+ * keeps the pipe transfer well under any pipe-capacity limit even on
+ * hugetlb-backed runs.
+ */
+ iov.iov_base = gopts->area_dst;
+ iov.iov_len = 1;
+ if (vmsplice(pipefd[1], &iov, 1, 0) != 1) {
+ uffd_test_fail("vmsplice from RW-protected page failed: %s",
+ strerror(errno));
+ goto out;
+ }
+
+ if (read(pipefd[0], &buf, 1) != 1) {
+ uffd_test_fail("read from pipe failed");
+ goto out;
+ }
+
+ if (buf != (char)0xCD) {
+ uffd_test_fail("content mismatch: got 0x%02x, expected 0xCD",
+ (unsigned char)buf);
+ goto out;
+ }
+
+ uffd_test_pass();
+out:
+ close(pipefd[0]);
+ close(pipefd[1]);
+}
+
+/*
+ * Test runtime toggle between async and sync modes.
+ * Start in async mode (detection), flip to sync (eviction), verify faults
+ * block, resolve them, flip back to async.
+ */
+static void uffd_rwp_async_toggle_test(uffd_global_test_opts_t *gopts,
+ uffd_test_args_t *args)
+{
+ unsigned long nr_pages = gopts->nr_pages;
+ unsigned long page_size = gopts->page_size;
+ struct uffd_args uargs = { };
+ pthread_t uffd_mon;
+ char c = '\0';
+ unsigned long p;
+
+ uargs.gopts = gopts;
+ uargs.handle_fault = uffd_handle_rwp_fault;
+
+ /* Populate */
+ for (p = 0; p < nr_pages; p++)
+ memset(gopts->area_dst + p * page_size, p % 255 + 1, page_size);
+
+ if (uffd_register_rwp(gopts->uffd, gopts->area_dst,
+ nr_pages * page_size))
+ err("register failure");
+
+ /* Phase 1: async detection — RW-protect, access first half */
+ rwprotect_range(gopts->uffd, (uint64_t)gopts->area_dst,
+ nr_pages * page_size, true);
+
+ for (p = 0; p < nr_pages / 2; p++) {
+ volatile char *page = gopts->area_dst + p * page_size;
+ (void)*page; /* auto-resolves in async mode */
+ }
+
+ /* Phase 2: flip to sync for eviction */
+ set_async_mode(gopts->uffd, false);
+
+ /* Start handler — will receive faults for cold pages */
+ if (pthread_create(&uffd_mon, NULL, uffd_poll_thread, &uargs))
+ err("uffd_poll_thread create");
+
+ /* Access second half (cold pages) — should trigger sync faults */
+ for (p = nr_pages / 2; p < nr_pages; p++) {
+ unsigned char *page = (unsigned char *)gopts->area_dst +
+ p * page_size;
+ if (page[0] != (p % 255 + 1)) {
+ uffd_test_fail("page %lu content mismatch", p);
+ goto out;
+ }
+ }
+
+ /*
+ * Stop the handler before reading minor_faults: the last fault
+ * resolution rwprotect_range()s before incrementing the counter,
+ * so the main thread can race ahead of the increment. Stopping
+ * here also makes Phase 3 a clean async-only test -- with the
+ * handler still running it would silently resolve any sync fault
+ * the kernel erroneously delivers, masking a regression.
+ */
+ if (write(gopts->pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, NULL))
+ err("join() failed");
+
+ if (uargs.minor_faults == 0) {
+ uffd_test_fail("expected sync faults, got 0");
+ return;
+ }
+
+ /* Phase 3: flip back to async */
+ set_async_mode(gopts->uffd, true);
+
+ /* RW-protect and access again — should auto-resolve */
+ rwprotect_range(gopts->uffd, (uint64_t)gopts->area_dst,
+ nr_pages * page_size, true);
+
+ for (p = 0; p < nr_pages; p++) {
+ volatile char *page = gopts->area_dst + p * page_size;
+ (void)*page;
+ }
+
+ uffd_test_pass();
+ return;
+out:
+ if (write(gopts->pipefd[1], &c, sizeof(c)) != sizeof(c))
+ err("pipe write");
+ if (pthread_join(uffd_mon, NULL))
+ err("join() failed");
+}
+
+/*
+ * Test that RW-protected pages become accessible after closing uffd.
+ */
+static void uffd_rwp_close_test(uffd_global_test_opts_t *gopts,
+ uffd_test_args_t *args)
+{
+ unsigned long nr_pages = gopts->nr_pages;
+ unsigned long page_size = gopts->page_size;
+ unsigned long p;
+
+ /* Populate */
+ for (p = 0; p < nr_pages; p++)
+ memset(gopts->area_dst + p * page_size, p % 255 + 1, page_size);
+
+ if (uffd_register_rwp(gopts->uffd, gopts->area_dst,
+ nr_pages * page_size))
+ err("register failure");
+
+ rwprotect_range(gopts->uffd, (uint64_t)gopts->area_dst,
+ nr_pages * page_size, true);
+
+ /* Close uffd — should restore protnone PTEs */
+ close(gopts->uffd);
+ gopts->uffd = -1;
+
+ /* All pages should be accessible with original content */
+ for (p = 0; p < nr_pages; p++) {
+ unsigned char *page = (unsigned char *)gopts->area_dst +
+ p * page_size;
+ unsigned char expected = p % 255 + 1;
+
+ if (page[0] != expected) {
+ uffd_test_fail("page %lu not accessible after close", p);
+ return;
+ }
+ }
+
+ uffd_test_pass();
+}
+
+/*
+ * Test that RWP protection is preserved across fork() when
+ * UFFD_FEATURE_EVENT_FORK is enabled. Without preservation, the child's
+ * PTEs would lose the uffd-wp marker and RWP-protected accesses would
+ * silently fall through to do_numa_page().
+ */
+static void uffd_rwp_fork_test(uffd_global_test_opts_t *gopts,
+ uffd_test_args_t *args)
+{
+ unsigned long nr_pages = gopts->nr_pages;
+ unsigned long page_size = gopts->page_size;
+ int pagemap_fd;
+ uint64_t value;
+
+ if (uffd_register_rwp(gopts->uffd, gopts->area_dst,
+ nr_pages * page_size))
+ err("register failed");
+
+ /* Populate + RWP-protect */
+ *gopts->area_dst = 1;
+ rwprotect_range(gopts->uffd, (uint64_t)gopts->area_dst,
+ page_size, true);
+
+ /* Parent: verify uffd-wp bit is set before fork */
+ pagemap_fd = pagemap_open();
+ value = pagemap_get_entry(pagemap_fd, gopts->area_dst);
+ pagemap_check_wp(value, true);
+
+ /*
+ * Fork with EVENT_FORK: child inherits VM_UFFD_RWP. Child reads
+ * its own pagemap and must still see the uffd-wp bit set.
+ */
+ if (pagemap_test_fork(gopts, true, false)) {
+ uffd_test_fail("RWP marker lost in child after fork");
+ goto out;
+ }
+
+ uffd_test_pass();
+out:
+ close(pagemap_fd);
+}
+
+/*
+ * Test that RWP protection on a pinned anon page is preserved across fork().
+ * Pinning forces copy_present_page() in the child path, which must restore
+ * PAGE_NONE on top of the uffd bit. Using async mode, a read in the child
+ * auto-resolves if — and only if — the PTE was actually protnone+uffd; the
+ * cleared uffd bit afterward proves the fault path ran.
+ */
+static void uffd_rwp_fork_pin_test(uffd_global_test_opts_t *gopts,
+ uffd_test_args_t *args)
+{
+ unsigned long page_size = gopts->page_size;
+ fork_event_args fevent_args = { .gopts = gopts, .child_uffd = -1 };
+ pin_args pin_args = {};
+ int pagemap_fd, status;
+ pthread_t fevent_thread;
+ uint64_t value;
+ pid_t child;
+
+ if (uffd_register_rwp(gopts->uffd, gopts->area_dst, page_size))
+ err("register failed");
+
+ /* Populate. */
+ *gopts->area_dst = 1;
+
+ /* RO-longterm pin so fork() takes copy_present_page() for this PTE. */
+ if (pin_pages(&pin_args, gopts->area_dst, page_size)) {
+ uffd_test_skip("Possibly CONFIG_GUP_TEST missing or unprivileged");
+ uffd_unregister(gopts->uffd, gopts->area_dst, page_size);
+ return;
+ }
+
+ /* RWP-protect: PTE is now PAGE_NONE + uffd bit. */
+ rwprotect_range(gopts->uffd, (uint64_t)gopts->area_dst, page_size, true);
+
+ pagemap_fd = pagemap_open();
+ value = pagemap_get_entry(pagemap_fd, gopts->area_dst);
+ pagemap_check_wp(value, true);
+
+ /*
+ * UFFD_FEATURE_EVENT_FORK is required so the child inherits
+ * VM_UFFD_RWP and the marker; without it dup_userfaultfd() resets
+ * the child VMA and the test would pass for the wrong reason.
+ * dup_userfaultfd() blocks until the EVENT_FORK message is consumed,
+ * so spawn a reader before the fork().
+ */
+ gopts->ready_for_fork = false;
+ if (pthread_create(&fevent_thread, NULL, fork_event_consumer,
+ &fevent_args))
+ err("pthread_create() for fork event consumer");
+ while (!gopts->ready_for_fork)
+ ; /* Wait for consumer to start polling. */
+
+ child = fork();
+ if (child < 0)
+ err("fork");
+ if (child == 0) {
+ volatile char c;
+ int cfd;
+
+ /*
+ * Read the pinned page. Only reaches the fault path if the
+ * child PTE is protnone + uffd; async mode auto-resolves and
+ * clears the uffd bit. If copy_present_page() dropped
+ * PAGE_NONE, the read would silently succeed and the bit
+ * would still be set.
+ */
+ c = *(volatile char *)gopts->area_dst;
+ (void)c;
+
+ cfd = pagemap_open();
+ value = pagemap_get_entry(cfd, gopts->area_dst);
+ close(cfd);
+ _exit((value & PM_UFFD_WP) ? 1 : 0);
+ }
+ if (waitpid(child, &status, 0) < 0)
+ err("waitpid");
+ if (pthread_join(fevent_thread, NULL))
+ err("pthread_join() for fork event consumer");
+ if (fevent_args.child_uffd >= 0)
+ close(fevent_args.child_uffd);
+
+ unpin_pages(&pin_args);
+ close(pagemap_fd);
+ if (uffd_unregister(gopts->uffd, gopts->area_dst, page_size))
+ err("unregister failed");
+
+ if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) {
+ uffd_test_fail("RWP not enforced in child after pinned fork");
+ return;
+ }
+
+ uffd_test_pass();
+}
+
+/*
+ * WP and RWP share the uffd-wp PTE bit and cannot coexist in the same VMA.
+ * Registration requesting both modes must be rejected.
+ */
+static void uffd_rwp_wp_exclusive_test(uffd_global_test_opts_t *gopts,
+ uffd_test_args_t *args)
+{
+ unsigned long nr_pages = gopts->nr_pages;
+ unsigned long page_size = gopts->page_size;
+ struct uffdio_register reg = { };
+
+ reg.range.start = (unsigned long)gopts->area_dst;
+ reg.range.len = nr_pages * page_size;
+ reg.mode = UFFDIO_REGISTER_MODE_WP | UFFDIO_REGISTER_MODE_RWP;
+
+ if (ioctl(gopts->uffd, UFFDIO_REGISTER, ®) == 0) {
+ uffd_test_fail("register with WP|RWP unexpectedly succeeded");
+ return;
+ }
+ if (errno != EINVAL) {
+ uffd_test_fail("register with WP|RWP: expected EINVAL, got %d",
+ errno);
+ return;
+ }
+ uffd_test_pass();
+}
+
static sigjmp_buf jbuf, *sigbuf;
static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
@@ -1625,6 +2320,77 @@ uffd_test_case_t uffd_tests[] = {
/* We can't test MADV_COLLAPSE, so try our luck */
.uffd_feature_required = UFFD_FEATURE_MINOR_SHMEM,
},
+ {
+ .name = "rwp-async",
+ .uffd_fn = uffd_rwp_async_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required =
+ UFFD_FEATURE_RWP | UFFD_FEATURE_RWP_ASYNC,
+ },
+ {
+ .name = "rwp-sync",
+ .uffd_fn = uffd_rwp_sync_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_RWP,
+ },
+ {
+ .name = "rwp-pagemap",
+ .uffd_fn = uffd_rwp_pagemap_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required =
+ UFFD_FEATURE_RWP | UFFD_FEATURE_RWP_ASYNC,
+ },
+ {
+ .name = "rwp-mprotect",
+ .uffd_fn = uffd_rwp_mprotect_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required =
+ UFFD_FEATURE_RWP | UFFD_FEATURE_RWP_ASYNC,
+ },
+ {
+ .name = "rwp-gup",
+ .uffd_fn = uffd_rwp_gup_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required =
+ UFFD_FEATURE_RWP | UFFD_FEATURE_RWP_ASYNC,
+ },
+ {
+ .name = "rwp-async-toggle",
+ .uffd_fn = uffd_rwp_async_toggle_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required =
+ UFFD_FEATURE_RWP | UFFD_FEATURE_RWP_ASYNC,
+ },
+ {
+ .name = "rwp-close",
+ .uffd_fn = uffd_rwp_close_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required = UFFD_FEATURE_RWP,
+ },
+ {
+ .name = "rwp-fork",
+ .uffd_fn = uffd_rwp_fork_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required =
+ UFFD_FEATURE_RWP | UFFD_FEATURE_EVENT_FORK,
+ },
+ {
+ .name = "rwp-fork-pin",
+ .uffd_fn = uffd_rwp_fork_pin_test,
+ .mem_targets = MEM_ANON,
+ .uffd_feature_required =
+ UFFD_FEATURE_RWP | UFFD_FEATURE_RWP_ASYNC |
+ UFFD_FEATURE_EVENT_FORK,
+ },
+ {
+ .name = "rwp-wp-exclusive",
+ .uffd_fn = uffd_rwp_wp_exclusive_test,
+ .mem_targets = MEM_ALL,
+ .uffd_feature_required =
+ UFFD_FEATURE_RWP |
+ UFFD_FEATURE_PAGEFAULT_FLAG_WP |
+ UFFD_FEATURE_WP_HUGETLBFS_SHMEM,
+ },
{
.name = "sigbus",
.uffd_fn = uffd_sigbus_test,
--
2.51.2
^ permalink raw reply related
* [PATCH v3 12/16] userfaultfd: add UFFDIO_SET_MODE for runtime sync/async toggle
From: Kiryl Shutsemau @ 2026-05-22 13:38 UTC (permalink / raw)
To: akpm, rppt, peterx, david
Cc: ljs, surenb, vbabka, Liam.Howlett, ziy, corbet, skhan, seanjc,
pbonzini, jthoughton, aarcange, sj, usama.arif, linux-mm,
linux-kernel, linux-doc, linux-kselftest, kvm, kernel-team,
linux-man, alx, Kiryl Shutsemau (Meta)
In-Reply-To: <20260522133857.552279-1-kirill@shutemov.name>
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Add an ioctl to toggle async mode at runtime without re-registering
the userfaultfd. This allows a VMM to switch between sync and async
RWP modes on-the-fly -- for example, starting in async mode for
working set scanning, then switching to sync mode to intercept faults
during page eviction.
UFFDIO_SET_MODE takes an enable/disable bitmask of UFFD_FEATURE_*
flags. Only UFFD_FEATURE_RWP_ASYNC is toggleable today; the ioctl
rejects any other bit with -EINVAL. Enabling RWP_ASYNC also requires
RWP to have been negotiated at UFFDIO_API time, mirroring the
UFFDIO_API invariant.
Fault-path readers of ctx->features run under mmap_read_lock or a
per-VMA lock; the RMW takes mmap_write_lock and calls
vma_start_write() on every UFFD-armed VMA, so those readers are fully
excluded. userfaultfd_show_fdinfo(), however, reads ctx->features
without any lock, so the RMW is written as a single WRITE_ONCE and
fdinfo reads it with READ_ONCE. That keeps the lockless observer from
seeing a mid-RMW intermediate and removes the audit burden when new
toggleable bits are added later.
When switching to async, pending sync waiters are woken so they retry
and auto-resolve under the new mode.
Signed-off-by: Kiryl Shutsemau (Meta) <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
fs/userfaultfd.c | 150 +++++++++++++++++++++++++------
include/uapi/linux/userfaultfd.h | 14 +++
2 files changed, 136 insertions(+), 28 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index bb0ea60dc3e6..7eacaa20baec 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -79,19 +79,29 @@ struct userfaultfd_wake_range {
/* internal indication that UFFD_API ioctl was successfully executed */
#define UFFD_FEATURE_INITIALIZED (1u << 31)
+/*
+ * UFFDIO_SET_MODE updates ctx->features under mmap_write_lock with
+ * WRITE_ONCE; readers that run outside mmap_read_lock or the per-VMA
+ * lock (poll/read_iter/ioctl, fdinfo) must pair with READ_ONCE.
+ */
+static unsigned int userfaultfd_features(struct userfaultfd_ctx *ctx)
+{
+ return READ_ONCE(ctx->features);
+}
+
static bool userfaultfd_is_initialized(struct userfaultfd_ctx *ctx)
{
- return ctx->features & UFFD_FEATURE_INITIALIZED;
+ return userfaultfd_features(ctx) & UFFD_FEATURE_INITIALIZED;
}
static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
{
- return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
+ return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_WP_ASYNC);
}
static bool userfaultfd_rwp_async_ctx(struct userfaultfd_ctx *ctx)
{
- return ctx && (ctx->features & UFFD_FEATURE_RWP_ASYNC);
+ return ctx && (userfaultfd_features(ctx) & UFFD_FEATURE_RWP_ASYNC);
}
/*
@@ -106,7 +116,7 @@ bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma)
if (!ctx)
return false;
- return ctx->features & UFFD_FEATURE_WP_UNPOPULATED;
+ return userfaultfd_features(ctx) & UFFD_FEATURE_WP_UNPOPULATED;
}
static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
@@ -1870,6 +1880,109 @@ static int userfaultfd_rwprotect(struct userfaultfd_ctx *ctx,
return ret;
}
+/* Subset of UFFD_API_FEATURES actually supported by this kernel/arch */
+static __u64 uffd_api_available_features(void)
+{
+ __u64 f = UFFD_API_FEATURES;
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_USERFAULTFD_MINOR))
+ f &= ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
+ if (!pgtable_supports_uffd())
+ f &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
+ if (!uffd_supports_wp_marker())
+ f &= ~(UFFD_FEATURE_WP_HUGETLBFS_SHMEM |
+ UFFD_FEATURE_WP_UNPOPULATED |
+ UFFD_FEATURE_WP_ASYNC);
+ /*
+ * RWP needs both PROT_NONE support and the uffd PTE bit. The
+ * VM_UFFD_RWP check covers compile-time unavailability; the
+ * pgtable_supports_uffd() check covers runtime (e.g. riscv
+ * without the SVRSW60T59B extension) where the PTE bit is declared
+ * but not actually usable.
+ */
+ if (VM_UFFD_RWP == VM_NONE || !pgtable_supports_uffd())
+ f &= ~(UFFD_FEATURE_RWP | UFFD_FEATURE_RWP_ASYNC);
+ return f;
+}
+
+/* Async features that can be toggled at runtime via UFFDIO_SET_MODE */
+#define UFFD_FEATURE_TOGGLEABLE UFFD_FEATURE_RWP_ASYNC
+
+static int userfaultfd_set_mode(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ struct uffdio_set_mode mode;
+ struct mm_struct *mm = ctx->mm;
+
+ if (copy_from_user(&mode, (void __user *)arg, sizeof(mode)))
+ return -EFAULT;
+
+ /* enable and disable must not overlap */
+ if (mode.enable & mode.disable)
+ return -EINVAL;
+
+ /* only toggleable features that this kernel/arch actually supports */
+ if ((mode.enable | mode.disable) &
+ ~(uffd_api_available_features() & UFFD_FEATURE_TOGGLEABLE))
+ return -EINVAL;
+
+ /* RWP_ASYNC can only be enabled on contexts that negotiated RWP */
+ if ((mode.enable & UFFD_FEATURE_RWP_ASYNC) &&
+ !(ctx->features & UFFD_FEATURE_RWP))
+ return -EINVAL;
+
+ if (!mmget_not_zero(mm))
+ return -ESRCH;
+
+ /*
+ * Drain in-flight faults before flipping features. mmap_write_lock()
+ * blocks new mmap_read_lock() callers, but per-VMA locked faults
+ * (lock_vma_under_rcu() + FAULT_FLAG_VMA_LOCK) that acquired before
+ * this point keep running. Calling vma_start_write() on each UFFD-
+ * armed VMA waits for those readers to drop, so no in-flight fault
+ * can observe the old features after mmap_write_unlock().
+ */
+ mmap_write_lock(mm);
+ {
+ struct vm_area_struct *vma;
+ VMA_ITERATOR(vmi, mm, 0);
+
+ for_each_vma(vmi, vma) {
+ if (vma->vm_userfaultfd_ctx.ctx == ctx)
+ vma_start_write(vma);
+ }
+ }
+ /*
+ * Single WRITE_ONCE so lockless readers (fdinfo, poll/read_iter
+ * via userfaultfd_is_initialized(), and the userfaultfd_features()
+ * helper used elsewhere) can't observe a mid-RMW intermediate
+ * value. Hot-path readers already serialise through the mmap lock
+ * + vma_start_write() drain above, so their load doesn't need an
+ * annotation.
+ */
+ WRITE_ONCE(ctx->features,
+ (ctx->features | mode.enable) & ~mode.disable);
+ mmap_write_unlock(mm);
+
+ /*
+ * If switching to async, wake threads blocked in handle_userfault().
+ * They will retry the fault and auto-resolve under the new mode.
+ * len=0 means wake all pending faults on this context.
+ */
+ if (mode.enable & UFFD_FEATURE_RWP_ASYNC) {
+ struct userfaultfd_wake_range range = { .len = 0 };
+
+ spin_lock_irq(&ctx->fault_pending_wqh.lock);
+ __wake_up_locked_key(&ctx->fault_pending_wqh, TASK_NORMAL,
+ &range);
+ __wake_up(&ctx->fault_wqh, TASK_NORMAL, 1, &range);
+ spin_unlock_irq(&ctx->fault_pending_wqh.lock);
+ }
+
+ mmput(mm);
+ return 0;
+}
+
static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
__s64 ret;
@@ -2108,29 +2221,7 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
goto err_out;
/* report all available features and ioctls to userland */
- uffdio_api.features = UFFD_API_FEATURES;
-#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
- uffdio_api.features &=
- ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM);
-#endif
- if (!pgtable_supports_uffd())
- uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP;
-
- if (!uffd_supports_wp_marker()) {
- uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM;
- uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
- uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
- }
- /*
- * RWP needs both PROT_NONE support and the uffd-wp PTE bit. The
- * VM_UFFD_RWP check covers compile-time unavailability; the
- * pgtable_supports_uffd() check covers runtime (e.g. riscv
- * without the SVRSW60T59B extension) where the PTE bit is declared
- * but not actually usable.
- */
- if (VM_UFFD_RWP == VM_NONE || !pgtable_supports_uffd())
- uffdio_api.features &=
- ~(UFFD_FEATURE_RWP | UFFD_FEATURE_RWP_ASYNC);
+ uffdio_api.features = uffd_api_available_features();
ret = -EINVAL;
if (features & ~uffdio_api.features)
@@ -2200,6 +2291,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_RWPROTECT:
ret = userfaultfd_rwprotect(ctx, arg);
break;
+ case UFFDIO_SET_MODE:
+ ret = userfaultfd_set_mode(ctx, arg);
+ break;
}
return ret;
}
@@ -2227,7 +2321,7 @@ static void userfaultfd_show_fdinfo(struct seq_file *m, struct file *f)
* protocols: aa:... bb:...
*/
seq_printf(m, "pending:\t%lu\ntotal:\t%lu\nAPI:\t%Lx:%x:%Lx\n",
- pending, total, UFFD_API, ctx->features,
+ pending, total, UFFD_API, userfaultfd_features(ctx),
UFFD_API_IOCTLS|UFFD_API_RANGE_IOCTLS);
}
#endif
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index c10f08f8a618..cea11aad6b54 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -49,6 +49,7 @@
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
+ (__u64)1 << _UFFDIO_SET_MODE | \
(__u64)1 << _UFFDIO_API)
#define UFFD_API_RANGE_IOCTLS \
((__u64)1 << _UFFDIO_WAKE | \
@@ -85,6 +86,7 @@
#define _UFFDIO_CONTINUE (0x07)
#define _UFFDIO_POISON (0x08)
#define _UFFDIO_RWPROTECT (0x09)
+#define _UFFDIO_SET_MODE (0x0A)
#define _UFFDIO_API (0x3F)
/* userfaultfd ioctl ids */
@@ -111,6 +113,8 @@
struct uffdio_poison)
#define UFFDIO_RWPROTECT _IOWR(UFFDIO, _UFFDIO_RWPROTECT, \
struct uffdio_rwprotect)
+#define UFFDIO_SET_MODE _IOW(UFFDIO, _UFFDIO_SET_MODE, \
+ struct uffdio_set_mode)
/* read() structure */
struct uffd_msg {
@@ -406,6 +410,16 @@ struct uffdio_move {
__s64 move;
};
+struct uffdio_set_mode {
+ /*
+ * Toggle async mode for features at runtime.
+ * Supported: UFFD_FEATURE_RWP_ASYNC.
+ * Setting a bit in both enable and disable is invalid.
+ */
+ __u64 enable;
+ __u64 disable;
+};
+
/*
* Flags for the userfaultfd(2) system call itself.
*/
--
2.51.2
^ permalink raw reply related
* [PATCH v3 11/16] userfaultfd: add UFFD_FEATURE_RWP_ASYNC for async fault resolution
From: Kiryl Shutsemau @ 2026-05-22 13:38 UTC (permalink / raw)
To: akpm, rppt, peterx, david
Cc: ljs, surenb, vbabka, Liam.Howlett, ziy, corbet, skhan, seanjc,
pbonzini, jthoughton, aarcange, sj, usama.arif, linux-mm,
linux-kernel, linux-doc, linux-kselftest, kvm, kernel-team,
linux-man, alx, Kiryl Shutsemau (Meta)
In-Reply-To: <20260522133857.552279-1-kirill@shutemov.name>
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Sync RWP delivers a message and blocks the faulting thread until the
handler resolves the fault. For working-set tracking the VMM does not
need the message: it just needs to know, at scan time, which pages
were touched. Async RWP serves that use case — the kernel restores
access in-place and the faulting thread continues without blocking.
The VMM reconstructs the access pattern after the fact via
PAGEMAP_SCAN: pages whose uffd bit is still set (inverted
PAGE_IS_ACCESSED) were not re-accessed since the last RWP cycle.
Worth calling out: async resolution upgrades writable private anon
PTEs via pte_mkwrite() when can_change_pte_writable() allows, mirroring
do_numa_page(). Without it, every re-access of an RWP'd writable page
would COW-fault a second time.
UFFD_FEATURE_RWP_ASYNC requires UFFD_FEATURE_RWP.
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
fs/userfaultfd.c | 19 ++++++++++++++++++-
include/linux/userfaultfd_k.h | 6 ++++++
include/uapi/linux/userfaultfd.h | 11 ++++++++++-
mm/huge_memory.c | 25 ++++++++++++++++++++++++-
mm/hugetlb.c | 32 +++++++++++++++++++++++++++++++-
mm/memory.c | 27 +++++++++++++++++++++++++--
6 files changed, 114 insertions(+), 6 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f8f1619f5183..bb0ea60dc3e6 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -89,6 +89,11 @@ static bool userfaultfd_wp_async_ctx(struct userfaultfd_ctx *ctx)
return ctx && (ctx->features & UFFD_FEATURE_WP_ASYNC);
}
+static bool userfaultfd_rwp_async_ctx(struct userfaultfd_ctx *ctx)
+{
+ return ctx && (ctx->features & UFFD_FEATURE_RWP_ASYNC);
+}
+
/*
* Whether WP_UNPOPULATED is enabled on the uffd context. It is only
* meaningful when userfaultfd_wp()==true on the vma and when it's
@@ -1988,6 +1993,11 @@ bool userfaultfd_wp_async(struct vm_area_struct *vma)
return userfaultfd_wp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
}
+bool userfaultfd_rwp_async(struct vm_area_struct *vma)
+{
+ return userfaultfd_rwp_async_ctx(vma->vm_userfaultfd_ctx.ctx);
+}
+
static inline unsigned int uffd_ctx_features(__u64 user_features)
{
/*
@@ -2091,6 +2101,12 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
if (features & UFFD_FEATURE_WP_ASYNC)
features |= UFFD_FEATURE_WP_UNPOPULATED;
+ ret = -EINVAL;
+ /* RWP_ASYNC requires RWP */
+ if ((features & UFFD_FEATURE_RWP_ASYNC) &&
+ !(features & UFFD_FEATURE_RWP))
+ goto err_out;
+
/* report all available features and ioctls to userland */
uffdio_api.features = UFFD_API_FEATURES;
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
@@ -2113,7 +2129,8 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
* but not actually usable.
*/
if (VM_UFFD_RWP == VM_NONE || !pgtable_supports_uffd())
- uffdio_api.features &= ~UFFD_FEATURE_RWP;
+ uffdio_api.features &=
+ ~(UFFD_FEATURE_RWP | UFFD_FEATURE_RWP_ASYNC);
ret = -EINVAL;
if (features & ~uffdio_api.features)
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 1beae4f2f479..4fd2a5ff3064 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -295,6 +295,7 @@ extern void userfaultfd_unmap_complete(struct mm_struct *mm,
struct list_head *uf);
extern bool userfaultfd_wp_unpopulated(struct vm_area_struct *vma);
extern bool userfaultfd_wp_async(struct vm_area_struct *vma);
+extern bool userfaultfd_rwp_async(struct vm_area_struct *vma);
void userfaultfd_reset_ctx(struct vm_area_struct *vma);
@@ -492,6 +493,11 @@ static inline bool userfaultfd_wp_async(struct vm_area_struct *vma)
return false;
}
+static inline bool userfaultfd_rwp_async(struct vm_area_struct *vma)
+{
+ return false;
+}
+
static inline bool vma_has_uffd_without_event_remap(struct vm_area_struct *vma)
{
return false;
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index d803e76d47ad..c10f08f8a618 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -44,7 +44,8 @@
UFFD_FEATURE_POISON | \
UFFD_FEATURE_WP_ASYNC | \
UFFD_FEATURE_MOVE | \
- UFFD_FEATURE_RWP)
+ UFFD_FEATURE_RWP | \
+ UFFD_FEATURE_RWP_ASYNC)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -243,6 +244,13 @@ struct uffdio_api {
* UFFDIO_REGISTER_MODE_RWP for read-write protection tracking.
* Pages are made inaccessible via UFFDIO_RWPROTECT and faults
* are delivered when the pages are re-accessed.
+ *
+ * UFFD_FEATURE_RWP_ASYNC indicates asynchronous mode for
+ * UFFDIO_REGISTER_MODE_RWP. When set, faults on read-write
+ * protected pages are auto-resolved by the kernel (PTE
+ * permissions restored immediately) without delivering a message
+ * to the userfaultfd handler. Use PAGEMAP_SCAN with inverted
+ * PAGE_IS_ACCESSED to find pages that were not re-accessed.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -262,6 +270,7 @@ struct uffdio_api {
#define UFFD_FEATURE_WP_ASYNC (1<<15)
#define UFFD_FEATURE_MOVE (1<<16)
#define UFFD_FEATURE_RWP (1<<17)
+#define UFFD_FEATURE_RWP_ASYNC (1<<18)
__u64 features;
__u64 ioctls;
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 76ca0fbaa802..985eb4e2b5c0 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2266,7 +2266,30 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf)
{
- return handle_userfault(vmf, VM_UFFD_RWP);
+ struct vm_area_struct *vma = vmf->vma;
+ pmd_t pmd;
+
+ if (!userfaultfd_rwp_async(vma))
+ return handle_userfault(vmf, VM_UFFD_RWP);
+
+ vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
+ if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
+ pmd = pmd_modify(vmf->orig_pmd, vma->vm_page_prot);
+ /* pmd_modify() preserves _PAGE_UFFD; drop it on resolution */
+ pmd = pmd_clear_uffd(pmd);
+ pmd = pmd_mkyoung(pmd);
+ if (!pmd_write(pmd) &&
+ vma_wants_manual_pte_write_upgrade(vma) &&
+ can_change_pmd_writable(vma, vmf->address, pmd))
+ pmd = pmd_mkwrite(pmd, vma);
+ set_pmd_at(vma->vm_mm, vmf->address & HPAGE_PMD_MASK,
+ vmf->pmd, pmd);
+ update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
+ spin_unlock(vmf->ptl);
+ return 0;
}
/* NUMA hinting page fault entry point for trans huge pmds */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 9fc31cbcba4b..84ed3784e3a0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6075,7 +6075,37 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (pte_protnone(vmf.orig_pte) && vma_is_accessible(vma) &&
userfaultfd_rwp(vma) && huge_pte_uffd(vmf.orig_pte)) {
- return hugetlb_handle_userfault(&vmf, mapping, VM_UFFD_RWP);
+ spinlock_t *ptl;
+ pte_t pte;
+
+ /* Sync: drop hugetlb locks before blocking in handle_userfault() */
+ if (!userfaultfd_rwp_async(vma))
+ return hugetlb_handle_userfault(&vmf, mapping, VM_UFFD_RWP);
+
+ ptl = huge_pte_lock(h, mm, vmf.pte);
+ pte = huge_ptep_get(mm, vmf.address, vmf.pte);
+ if (pte_protnone(pte) && huge_pte_uffd(pte)) {
+ unsigned int shift = huge_page_shift(h);
+
+ pte = huge_pte_modify(pte, vma->vm_page_prot);
+ pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
+ /* huge_pte_modify() preserves _PAGE_UFFD; drop it on resolution */
+ pte = huge_pte_clear_uffd(pte);
+ pte = pte_mkyoung(pte);
+ /*
+ * Unlike do_uffd_rwp(), do not upgrade to writable
+ * here. Hugetlb lacks a can_change_huge_pte_writable()
+ * equivalent, so a write access will take a separate
+ * COW fault — acceptable for the rare private hugetlb
+ * case.
+ */
+ set_huge_pte_at(mm, vmf.address, vmf.pte, pte,
+ huge_page_size(h));
+ update_mmu_cache(vma, vmf.address, vmf.pte);
+ }
+ spin_unlock(ptl);
+ ret = 0;
+ goto out_mutex;
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index e0dcf2c28d9d..bfe6f218fb16 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6174,8 +6174,31 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru
static vm_fault_t do_uffd_rwp(struct vm_fault *vmf)
{
- pte_unmap(vmf->pte);
- return handle_userfault(vmf, VM_UFFD_RWP);
+ pte_t pte;
+
+ if (!userfaultfd_rwp_async(vmf->vma)) {
+ /* Sync mode: unmap PTE and deliver to userfaultfd handler */
+ pte_unmap(vmf->pte);
+ return handle_userfault(vmf, VM_UFFD_RWP);
+ }
+
+ spin_lock(vmf->ptl);
+ if (unlikely(!pte_same(ptep_get(vmf->pte), vmf->orig_pte))) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
+ }
+ pte = pte_modify(vmf->orig_pte, vmf->vma->vm_page_prot);
+ /* pte_modify() preserves _PAGE_UFFD; drop it on resolution */
+ pte = pte_clear_uffd(pte);
+ pte = pte_mkyoung(pte);
+ if (!pte_write(pte) &&
+ vma_wants_manual_pte_write_upgrade(vmf->vma) &&
+ can_change_pte_writable(vmf->vma, vmf->address, pte))
+ pte = pte_mkwrite(pte, vmf->vma);
+ set_pte_at(vmf->vma->vm_mm, vmf->address, vmf->pte, pte);
+ update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ return 0;
}
static vm_fault_t do_numa_page(struct vm_fault *vmf)
--
2.51.2
^ permalink raw reply related
* [PATCH v3 10/16] mm/pagemap: add PAGE_IS_ACCESSED for RWP tracking
From: Kiryl Shutsemau @ 2026-05-22 13:38 UTC (permalink / raw)
To: akpm, rppt, peterx, david
Cc: ljs, surenb, vbabka, Liam.Howlett, ziy, corbet, skhan, seanjc,
pbonzini, jthoughton, aarcange, sj, usama.arif, linux-mm,
linux-kernel, linux-doc, linux-kselftest, kvm, kernel-team,
linux-man, alx, Kiryl Shutsemau (Meta)
In-Reply-To: <20260522133857.552279-1-kirill@shutemov.name>
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
PAGEMAP_SCAN already reports PAGE_IS_WRITTEN from the inverted uffd
PTE bit, targeting the UFFDIO_WRITEPROTECT workflow. UFFDIO_RWPROTECT
reuses the same PTE bit as a marker for read-write protection, but
"has been written" and "has been accessed" are distinct semantic
signals — they happen to share one PTE bit today only because the two
implementations share infrastructure.
Give RWP its own pagemap category so the UAPI does not conflate them:
PAGE_IS_WRITTEN reported on VM_UFFD_WP VMAs, !pte_uffd(pte)
PAGE_IS_ACCESSED reported on VM_UFFD_RWP VMAs, !pte_uffd(pte)
Both still read the same PTE bit today, but each is scoped to the VMA
whose registered mode makes the bit meaningful. If a future
implementation moves RWP to a separate PTE bit, only PAGE_IS_ACCESSED
switches over.
This is a UAPI narrowing. Outside VM_UFFD_WP VMAs the uffd bit is
always clear, so PAGEMAP_SCAN used to flag PAGE_IS_WRITTEN on every
present PTE there — a meaningless duplicate of PAGE_IS_PRESENT. Now
PAGE_IS_WRITTEN fires only inside VM_UFFD_WP VMAs.
pagemap_hugetlb_category() now takes the vma like its PTE/PMD peers.
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
Documentation/admin-guide/mm/pagemap.rst | 13 ++++-
fs/proc/task_mmu.c | 73 ++++++++++++++++++------
include/uapi/linux/fs.h | 1 +
tools/include/uapi/linux/fs.h | 1 +
4 files changed, 67 insertions(+), 21 deletions(-)
diff --git a/Documentation/admin-guide/mm/pagemap.rst b/Documentation/admin-guide/mm/pagemap.rst
index c57e61b5d8aa..ffa690a171c8 100644
--- a/Documentation/admin-guide/mm/pagemap.rst
+++ b/Documentation/admin-guide/mm/pagemap.rst
@@ -19,8 +19,11 @@ There are four components to pagemap:
* Bit 55 pte is soft-dirty (see
Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped (since 4.2)
- * Bit 57 pte is uffd-wp write-protected (since 5.13) (see
- Documentation/admin-guide/mm/userfaultfd.rst)
+ * Bit 57 pte is tracked by userfaultfd (since 5.13) — in a
+ ``VM_UFFD_WP`` VMA this indicates a write-protected PTE; in a
+ ``VM_UFFD_RWP`` VMA it indicates an RWP-protected PTE. WP and
+ RWP are mutually exclusive per VMA, so the meaning is
+ unambiguous. See Documentation/admin-guide/mm/userfaultfd.rst.
* Bit 58 pte is a guard region (since 6.15) (see madvise (2) man page)
* Bits 59-60 zero
* Bit 61 page is file-page or shared-anon (since 3.5)
@@ -244,7 +247,8 @@ in this IOCTL:
Following flags about pages are currently supported:
- ``PAGE_IS_WPALLOWED`` - Page has async-write-protection enabled
-- ``PAGE_IS_WRITTEN`` - Page has been written to from the time it was write protected
+- ``PAGE_IS_WRITTEN`` - Page in a ``UFFDIO_REGISTER_MODE_WP`` VMA has been
+ written to since it was write-protected. Only reported inside such VMAs.
- ``PAGE_IS_FILE`` - Page is file backed
- ``PAGE_IS_PRESENT`` - Page is present in the memory
- ``PAGE_IS_SWAPPED`` - Page is in swapped
@@ -252,6 +256,9 @@ Following flags about pages are currently supported:
- ``PAGE_IS_HUGE`` - Page is PMD-mapped THP or Hugetlb backed
- ``PAGE_IS_SOFT_DIRTY`` - Page is soft-dirty
- ``PAGE_IS_GUARD`` - Page is a part of a guard region
+- ``PAGE_IS_ACCESSED`` - Page in a ``UFFDIO_REGISTER_MODE_RWP`` VMA has been
+ accessed since RWP was applied. Only reported inside such VMAs. See
+ Documentation/admin-guide/mm/userfaultfd.rst for the RWP workflow.
The ``struct pm_scan_arg`` is used as the argument of the IOCTL.
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index fbaede228201..4e207b6216b1 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -2197,7 +2197,7 @@ static const struct mm_walk_ops pagemap_ops = {
* Bits 5-54 swap offset if swapped
* Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst)
* Bit 56 page exclusively mapped
- * Bit 57 pte is uffd-wp write-protected
+ * Bit 57 pte is tracked by userfaultfd (uffd-wp or RWP)
* Bit 58 pte is a guard region
* Bits 59-60 zero
* Bit 61 page is file-page or shared-anon
@@ -2332,7 +2332,7 @@ static int pagemap_release(struct inode *inode, struct file *file)
PAGE_IS_FILE | PAGE_IS_PRESENT | \
PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \
PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \
- PAGE_IS_GUARD)
+ PAGE_IS_GUARD | PAGE_IS_ACCESSED)
#define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC)
struct pagemap_scan_private {
@@ -2357,8 +2357,12 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
categories = PAGE_IS_PRESENT;
- if (!pte_uffd(pte))
- categories |= PAGE_IS_WRITTEN;
+ if (!pte_uffd(pte)) {
+ if (userfaultfd_wp(vma))
+ categories |= PAGE_IS_WRITTEN;
+ if (userfaultfd_rwp(vma))
+ categories |= PAGE_IS_ACCESSED;
+ }
if (p->masks_of_interest & PAGE_IS_FILE) {
page = vm_normal_page(vma, addr, pte);
@@ -2375,8 +2379,12 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p,
categories = PAGE_IS_SWAPPED;
- if (!pte_swp_uffd_any(pte))
- categories |= PAGE_IS_WRITTEN;
+ if (!pte_swp_uffd_any(pte)) {
+ if (userfaultfd_wp(vma))
+ categories |= PAGE_IS_WRITTEN;
+ if (userfaultfd_rwp(vma))
+ categories |= PAGE_IS_ACCESSED;
+ }
entry = softleaf_from_pte(pte);
if (softleaf_is_guard_marker(entry))
@@ -2425,8 +2433,12 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
struct page *page;
categories |= PAGE_IS_PRESENT;
- if (!pmd_uffd(pmd))
- categories |= PAGE_IS_WRITTEN;
+ if (!pmd_uffd(pmd)) {
+ if (userfaultfd_wp(vma))
+ categories |= PAGE_IS_WRITTEN;
+ if (userfaultfd_rwp(vma))
+ categories |= PAGE_IS_ACCESSED;
+ }
if (p->masks_of_interest & PAGE_IS_FILE) {
page = vm_normal_page_pmd(vma, addr, pmd);
@@ -2440,8 +2452,12 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p,
categories |= PAGE_IS_SOFT_DIRTY;
} else {
categories |= PAGE_IS_SWAPPED;
- if (!pmd_swp_uffd(pmd))
- categories |= PAGE_IS_WRITTEN;
+ if (!pmd_swp_uffd(pmd)) {
+ if (userfaultfd_wp(vma))
+ categories |= PAGE_IS_WRITTEN;
+ if (userfaultfd_rwp(vma))
+ categories |= PAGE_IS_ACCESSED;
+ }
if (pmd_swp_soft_dirty(pmd))
categories |= PAGE_IS_SOFT_DIRTY;
@@ -2474,7 +2490,8 @@ static void make_uffd_wp_pmd(struct vm_area_struct *vma,
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#ifdef CONFIG_HUGETLB_PAGE
-static unsigned long pagemap_hugetlb_category(pte_t pte)
+static unsigned long pagemap_hugetlb_category(struct vm_area_struct *vma,
+ pte_t pte)
{
unsigned long categories = PAGE_IS_HUGE;
@@ -2489,8 +2506,12 @@ static unsigned long pagemap_hugetlb_category(pte_t pte)
if (pte_present(pte)) {
categories |= PAGE_IS_PRESENT;
- if (!huge_pte_uffd(pte))
- categories |= PAGE_IS_WRITTEN;
+ if (!huge_pte_uffd(pte)) {
+ if (userfaultfd_wp(vma))
+ categories |= PAGE_IS_WRITTEN;
+ if (userfaultfd_rwp(vma))
+ categories |= PAGE_IS_ACCESSED;
+ }
if (!PageAnon(pte_page(pte)))
categories |= PAGE_IS_FILE;
if (is_zero_pfn(pte_pfn(pte)))
@@ -2500,8 +2521,12 @@ static unsigned long pagemap_hugetlb_category(pte_t pte)
} else {
categories |= PAGE_IS_SWAPPED;
- if (!pte_swp_uffd_any(pte))
- categories |= PAGE_IS_WRITTEN;
+ if (!pte_swp_uffd_any(pte)) {
+ if (userfaultfd_wp(vma))
+ categories |= PAGE_IS_WRITTEN;
+ if (userfaultfd_rwp(vma))
+ categories |= PAGE_IS_ACCESSED;
+ }
if (pte_swp_soft_dirty(pte))
categories |= PAGE_IS_SOFT_DIRTY;
}
@@ -2586,6 +2611,16 @@ static int pagemap_scan_test_walk(unsigned long start, unsigned long end,
bool wp_allowed = userfaultfd_wp_async(vma) &&
userfaultfd_wp_use_markers(vma);
+ /*
+ * PM_SCAN_WP_MATCHING is the atomic read-and-reset flavour of the
+ * scan and is implemented for the WP marker only. Reject it on
+ * VM_UFFD_RWP VMAs explicitly so userspace gets a clear error
+ * instead of a silently-skipped range; re-arming is done with
+ * UFFDIO_RWPROTECT(MODE_RWP).
+ */
+ if (userfaultfd_rwp(vma) && (p->arg.flags & PM_SCAN_WP_MATCHING))
+ return -EINVAL;
+
if (!wp_allowed) {
/* User requested explicit failure over wp-async capability */
if (p->arg.flags & PM_SCAN_CHECK_WPASYNC)
@@ -2773,7 +2808,8 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start,
goto flush_and_return;
}
- if (!p->arg.category_anyof_mask && !p->arg.category_inverted &&
+ if (userfaultfd_wp(vma) && !p->arg.category_anyof_mask &&
+ !p->arg.category_inverted &&
p->arg.category_mask == PAGE_IS_WRITTEN &&
p->arg.return_mask == PAGE_IS_WRITTEN) {
for (addr = start; addr < end; pte++, addr += PAGE_SIZE) {
@@ -2848,7 +2884,8 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
/* Go the short route when not write-protecting pages. */
pte = huge_ptep_get(walk->mm, start, ptep);
- categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+ categories = p->cur_vma_category |
+ pagemap_hugetlb_category(vma, pte);
if (!pagemap_scan_is_interesting_page(categories, p))
return 0;
@@ -2860,7 +2897,7 @@ static int pagemap_scan_hugetlb_entry(pte_t *ptep, unsigned long hmask,
ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, ptep);
pte = huge_ptep_get(walk->mm, start, ptep);
- categories = p->cur_vma_category | pagemap_hugetlb_category(pte);
+ categories = p->cur_vma_category | pagemap_hugetlb_category(vma, pte);
if (!pagemap_scan_is_interesting_page(categories, p))
goto out_unlock;
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 13f71202845e..c4aeaa0c31c7 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -455,6 +455,7 @@ typedef int __bitwise __kernel_rwf_t;
#define PAGE_IS_HUGE (1 << 6)
#define PAGE_IS_SOFT_DIRTY (1 << 7)
#define PAGE_IS_GUARD (1 << 8)
+#define PAGE_IS_ACCESSED (1 << 9)
/*
* struct page_region - Page region with flags
diff --git a/tools/include/uapi/linux/fs.h b/tools/include/uapi/linux/fs.h
index 24ddf7bc4f25..f0a26309b6d5 100644
--- a/tools/include/uapi/linux/fs.h
+++ b/tools/include/uapi/linux/fs.h
@@ -364,6 +364,7 @@ typedef int __bitwise __kernel_rwf_t;
#define PAGE_IS_HUGE (1 << 6)
#define PAGE_IS_SOFT_DIRTY (1 << 7)
#define PAGE_IS_GUARD (1 << 8)
+#define PAGE_IS_ACCESSED (1 << 9)
/*
* struct page_region - Page region with flags
--
2.51.2
^ permalink raw reply related
* [PATCH v3 09/16] mm/userfaultfd: add RWP fault delivery and expose UFFDIO_REGISTER_MODE_RWP
From: Kiryl Shutsemau @ 2026-05-22 13:38 UTC (permalink / raw)
To: akpm, rppt, peterx, david
Cc: ljs, surenb, vbabka, Liam.Howlett, ziy, corbet, skhan, seanjc,
pbonzini, jthoughton, aarcange, sj, usama.arif, linux-mm,
linux-kernel, linux-doc, linux-kselftest, kvm, kernel-team,
linux-man, alx, Kiryl Shutsemau (Meta)
In-Reply-To: <20260522133857.552279-1-kirill@shutemov.name>
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Wire the fault side of read-write protection tracking and turn the
userspace interface on.
An RWP-protected PTE is PAGE_NONE with the uffd bit set. The
PROT_NONE triggers a fault on any access; the uffd bit distinguishes
it from plain mprotect(PROT_NONE) or NUMA hinting.
Fault dispatch, per level:
PTE handle_pte_fault() -> do_uffd_rwp()
PMD __handle_mm_fault() -> do_huge_pmd_uffd_rwp()
hugetlb hugetlb_fault() -> hugetlb_handle_userfault()
The RWP branches gate on userfaultfd_pte_rwp() / userfaultfd_huge_pmd_rwp()
(VM_UFFD_RWP plus the uffd bit) and fall through to do_numa_page() /
do_huge_pmd_numa_page() otherwise. Each delivers a
UFFD_PAGEFAULT_FLAG_RWP message through handle_userfault(); the handler
resolves it with UFFDIO_RWPROTECT clearing MODE_RWP.
userfaultfd_must_wait() and userfaultfd_huge_must_wait() add matching
protnone+uffd waiters so sync-mode fault handlers block correctly.
Expose the UAPI:
UFFDIO_REGISTER_MODE_RWP -> UFFD_API_REGISTER_MODES
UFFD_FEATURE_RWP -> UFFD_API_FEATURES
_UFFDIO_RWPROTECT -> UFFD_API_RANGE_IOCTLS
UFFD_API_RANGE_IOCTLS_BASIC
UFFD_FEATURE_RWP is masked out at UFFDIO_API time when PROT_NONE is
not available or VM_UFFD_RWP aliases VM_NONE (32-bit), so userspace
never sees an advertised-but-broken feature.
Works on anonymous, shmem, and hugetlb memory.
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
fs/userfaultfd.c | 32 ++++++++++++++++++++++++++++++--
include/linux/huge_mm.h | 7 +++++++
include/linux/userfaultfd_k.h | 24 ++++++++++++++++++++++++
include/uapi/linux/userfaultfd.h | 12 ++++++++----
mm/huge_memory.c | 5 +++++
mm/hugetlb.c | 11 +++++++++++
mm/memory.c | 21 +++++++++++++++++++--
7 files changed, 104 insertions(+), 8 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index f2097c558165..f8f1619f5183 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -261,6 +261,12 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx,
*/
if (!huge_pte_write(pte) && (reason & VM_UFFD_WP))
return true;
+ /*
+ * PTE is still RW-protected (protnone with uffd bit), wait for
+ * resolution. Plain PROT_NONE without the marker is not an RWP fault.
+ */
+ if (pte_protnone(pte) && huge_pte_uffd(pte) && (reason & VM_UFFD_RWP))
+ return true;
return false;
}
@@ -321,8 +327,14 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
if (!pmd_present(_pmd))
return false;
- if (pmd_trans_huge(_pmd))
- return !pmd_write(_pmd) && (reason & VM_UFFD_WP);
+ if (pmd_trans_huge(_pmd)) {
+ if (!pmd_write(_pmd) && (reason & VM_UFFD_WP))
+ return true;
+ if (pmd_protnone(_pmd) && pmd_uffd(_pmd) &&
+ (reason & VM_UFFD_RWP))
+ return true;
+ return false;
+ }
pte = pte_offset_map(pmd, address);
if (!pte)
@@ -347,6 +359,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx,
*/
if (!pte_write(ptent) && (reason & VM_UFFD_WP))
goto out;
+ /*
+ * PTE is still RW-protected (protnone with uffd bit), wait for
+ * userspace to resolve. Plain PROT_NONE without the marker is not
+ * an RWP fault.
+ */
+ if (pte_protnone(ptent) && pte_uffd(ptent) && (reason & VM_UFFD_RWP))
+ goto out;
ret = false;
out:
@@ -2086,6 +2105,15 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx,
uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED;
uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC;
}
+ /*
+ * RWP needs both PROT_NONE support and the uffd-wp PTE bit. The
+ * VM_UFFD_RWP check covers compile-time unavailability; the
+ * pgtable_supports_uffd() check covers runtime (e.g. riscv
+ * without the SVRSW60T59B extension) where the PTE bit is declared
+ * but not actually usable.
+ */
+ if (VM_UFFD_RWP == VM_NONE || !pgtable_supports_uffd())
+ uffdio_api.features &= ~UFFD_FEATURE_RWP;
ret = -EINVAL;
if (features & ~uffdio_api.features)
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h
index 2949e5acff35..e980909ee49e 100644
--- a/include/linux/huge_mm.h
+++ b/include/linux/huge_mm.h
@@ -520,6 +520,8 @@ static inline bool folio_test_pmd_mappable(struct folio *folio)
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf);
+vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf);
+
vm_fault_t do_huge_pmd_device_private(struct vm_fault *vmf);
extern struct folio *huge_zero_folio;
@@ -702,6 +704,11 @@ static inline spinlock_t *pud_trans_huge_lock(pud_t *pud,
return NULL;
}
+static inline vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf)
+{
+ return 0;
+}
+
static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
return 0;
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index d46974be864e..1beae4f2f479 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -247,6 +247,18 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
return userfaultfd_wp(vma) && pmd_uffd(pmd);
}
+static inline bool userfaultfd_pte_rwp(struct vm_area_struct *vma,
+ pte_t pte)
+{
+ return userfaultfd_rwp(vma) && pte_uffd(pte);
+}
+
+static inline bool userfaultfd_huge_pmd_rwp(struct vm_area_struct *vma,
+ pmd_t pmd)
+{
+ return userfaultfd_rwp(vma) && pmd_uffd(pmd);
+}
+
static inline bool userfaultfd_armed(struct vm_area_struct *vma)
{
return vma->vm_flags & __VM_UFFD_FLAGS;
@@ -399,6 +411,18 @@ static inline bool userfaultfd_huge_pmd_wp(struct vm_area_struct *vma,
return false;
}
+static inline bool userfaultfd_pte_rwp(struct vm_area_struct *vma,
+ pte_t pte)
+{
+ return false;
+}
+
+static inline bool userfaultfd_huge_pmd_rwp(struct vm_area_struct *vma,
+ pmd_t pmd)
+{
+ return false;
+}
+
static inline bool userfaultfd_armed(struct vm_area_struct *vma)
{
return false;
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 7b78aa3b5318..d803e76d47ad 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -25,7 +25,8 @@
#define UFFD_API ((__u64)0xAA)
#define UFFD_API_REGISTER_MODES (UFFDIO_REGISTER_MODE_MISSING | \
UFFDIO_REGISTER_MODE_WP | \
- UFFDIO_REGISTER_MODE_MINOR)
+ UFFDIO_REGISTER_MODE_MINOR | \
+ UFFDIO_REGISTER_MODE_RWP)
#define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
UFFD_FEATURE_EVENT_FORK | \
UFFD_FEATURE_EVENT_REMAP | \
@@ -42,7 +43,8 @@
UFFD_FEATURE_WP_UNPOPULATED | \
UFFD_FEATURE_POISON | \
UFFD_FEATURE_WP_ASYNC | \
- UFFD_FEATURE_MOVE)
+ UFFD_FEATURE_MOVE | \
+ UFFD_FEATURE_RWP)
#define UFFD_API_IOCTLS \
((__u64)1 << _UFFDIO_REGISTER | \
(__u64)1 << _UFFDIO_UNREGISTER | \
@@ -54,13 +56,15 @@
(__u64)1 << _UFFDIO_MOVE | \
(__u64)1 << _UFFDIO_WRITEPROTECT | \
(__u64)1 << _UFFDIO_CONTINUE | \
- (__u64)1 << _UFFDIO_POISON)
+ (__u64)1 << _UFFDIO_POISON | \
+ (__u64)1 << _UFFDIO_RWPROTECT)
#define UFFD_API_RANGE_IOCTLS_BASIC \
((__u64)1 << _UFFDIO_WAKE | \
(__u64)1 << _UFFDIO_COPY | \
(__u64)1 << _UFFDIO_WRITEPROTECT | \
(__u64)1 << _UFFDIO_CONTINUE | \
- (__u64)1 << _UFFDIO_POISON)
+ (__u64)1 << _UFFDIO_POISON | \
+ (__u64)1 << _UFFDIO_RWPROTECT)
/*
* Valid ioctl command number range with this API is from 0x00 to
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 189192ea45cf..76ca0fbaa802 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -2264,6 +2264,11 @@ static inline bool can_change_pmd_writable(struct vm_area_struct *vma,
return pmd_dirty(pmd);
}
+vm_fault_t do_huge_pmd_uffd_rwp(struct vm_fault *vmf)
+{
+ return handle_userfault(vmf, VM_UFFD_RWP);
+}
+
/* NUMA hinting page fault entry point for trans huge pmds */
vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf)
{
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index eee32a325481..9fc31cbcba4b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6067,6 +6067,17 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_mutex;
}
+ /*
+ * Protnone hugetlb PTEs with the uffd bit are used by
+ * userfaultfd RWP for access tracking. Plain PROT_NONE (without the
+ * marker) is not an RWP fault and is not expected on hugetlb (no
+ * NUMA hinting), so let normal hugetlb fault handling proceed.
+ */
+ if (pte_protnone(vmf.orig_pte) && vma_is_accessible(vma) &&
+ userfaultfd_rwp(vma) && huge_pte_uffd(vmf.orig_pte)) {
+ return hugetlb_handle_userfault(&vmf, mapping, VM_UFFD_RWP);
+ }
+
/*
* If we are going to COW/unshare the mapping later, we examine the
* pending reservations for this page now. This will ensure that any
diff --git a/mm/memory.c b/mm/memory.c
index ea9616e3dbaf..e0dcf2c28d9d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -6172,6 +6172,12 @@ static void numa_rebuild_large_mapping(struct vm_fault *vmf, struct vm_area_stru
}
}
+static vm_fault_t do_uffd_rwp(struct vm_fault *vmf)
+{
+ pte_unmap(vmf->pte);
+ return handle_userfault(vmf, VM_UFFD_RWP);
+}
+
static vm_fault_t do_numa_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
@@ -6446,8 +6452,16 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
if (!pte_present(vmf->orig_pte))
return do_swap_page(vmf);
- if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma))
+ if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) {
+ /*
+ * RWP-protected PTEs are protnone plus the uffd bit. On a
+ * VM_UFFD_RWP VMA, a protnone PTE without the uffd bit is
+ * NUMA hinting and must still fall through to do_numa_page().
+ */
+ if (userfaultfd_pte_rwp(vmf->vma, vmf->orig_pte))
+ return do_uffd_rwp(vmf);
return do_numa_page(vmf);
+ }
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
@@ -6561,8 +6575,11 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
return 0;
}
if (pmd_trans_huge(vmf.orig_pmd)) {
- if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma))
+ if (pmd_protnone(vmf.orig_pmd) && vma_is_accessible(vma)) {
+ if (userfaultfd_huge_pmd_rwp(vma, vmf.orig_pmd))
+ return do_huge_pmd_uffd_rwp(&vmf);
return do_huge_pmd_numa_page(&vmf);
+ }
if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) &&
!pmd_write(vmf.orig_pmd)) {
--
2.51.2
^ permalink raw reply related
* [PATCH v3 08/16] userfaultfd: add UFFDIO_REGISTER_MODE_RWP and UFFDIO_RWPROTECT plumbing
From: Kiryl Shutsemau @ 2026-05-22 13:38 UTC (permalink / raw)
To: akpm, rppt, peterx, david
Cc: ljs, surenb, vbabka, Liam.Howlett, ziy, corbet, skhan, seanjc,
pbonzini, jthoughton, aarcange, sj, usama.arif, linux-mm,
linux-kernel, linux-doc, linux-kselftest, kvm, kernel-team,
linux-man, alx, Kiryl Shutsemau (Meta)
In-Reply-To: <20260522133857.552279-1-kirill@shutemov.name>
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Add the userspace interface for read-write protection tracking:
- UFFDIO_REGISTER_MODE_RWP register a range for RWP tracking
- UFFD_FEATURE_RWP capability bit
- UFFDIO_RWPROTECT install / remove RWP on a range
Introduce CONFIG_USERFAULTFD_RWP, auto-selected on 64-bit kernels with
ARCH_HAS_PTE_PROTNONE and HAVE_ARCH_USERFAULTFD_WP. The symbol gates
VM_UFFD_RWP (previously aliased to VM_NONE) and the smaps/trace-flag
hooks added in the preparatory patches; without it the UAPI bits added
here have nothing to drive and would be unreachable.
Registration sets VM_UFFD_RWP on the VMA. Combining MODE_WP with
MODE_RWP is rejected because both modes claim the uffd PTE bit.
UFFDIO_RWPROTECT is the bidirectional counterpart of
UFFDIO_WRITEPROTECT:
- MODE_RWP change_protection() with MM_CP_UFFD_RWP
installs PAGE_NONE and sets the uffd bit on
present PTEs
- !MODE_RWP change_protection() with MM_CP_UFFD_RWP_RESOLVE
restores vma->vm_page_prot and clears the bit
userfaultfd_clear_vma() runs the same resolve pass on unregister so
RWP state cannot outlive the uffd.
Re-registering a range must not drop a mode that installs per-PTE
markers (WP or RWP); doing so returns -EBUSY. This also closes a
pre-existing window where re-registering without MODE_WP would strand
uffd-wp markers: before, those caused extra write-faults but were
otherwise benign; with RWP preservation in place, a subsequent
mprotect() on a VM_UFFD_RWP VMA would silently promote the stale
markers to RWP.
The feature is not yet advertised. UFFDIO_REGISTER_MODE_RWP,
UFFD_FEATURE_RWP, and _UFFDIO_RWPROTECT are intentionally absent from
UFFD_API_REGISTER_MODES, UFFD_API_FEATURES, and UFFD_API_RANGE_IOCTLS,
so UFFDIO_API masks them out and the register-mode validator rejects
the bit. The follow-up patch adds fault dispatch and exposes the UAPI.
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
Reviewed-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
Documentation/admin-guide/mm/userfaultfd.rst | 10 ++
fs/userfaultfd.c | 84 +++++++++++++++++
include/linux/userfaultfd_k.h | 2 +
include/uapi/linux/userfaultfd.h | 19 ++++
mm/Kconfig | 9 ++
mm/userfaultfd.c | 96 +++++++++++++++++++-
6 files changed, 217 insertions(+), 3 deletions(-)
diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst
index e5cc8848dcb3..1e533639fd50 100644
--- a/Documentation/admin-guide/mm/userfaultfd.rst
+++ b/Documentation/admin-guide/mm/userfaultfd.rst
@@ -131,6 +131,16 @@ userfaults on the range registered. Not all ioctls will necessarily be
supported for all memory types (e.g. anonymous memory vs. shmem vs.
hugetlbfs), or all types of intercepted faults.
+.. note::
+
+ Re-registering an already-registered range must not drop any of the
+ modes that install per-PTE markers — currently
+ ``UFFDIO_REGISTER_MODE_WP`` and ``UFFDIO_REGISTER_MODE_RWP``. Doing
+ so would strand markers with no flag to describe them, so the call
+ is rejected with ``-EBUSY``; userspace must issue
+ ``UFFDIO_UNREGISTER`` first. This differs from older kernels, which
+ silently replaced the mode bits on re-registration.
+
Userland can use the ``uffdio_register.ioctls`` to manage the virtual
address space in the background (to add or potentially also remove
memory from the ``userfaultfd`` registered range). This means a userfault
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 0fdf28f62702..f2097c558165 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -215,6 +215,8 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WRITE;
if (reason & VM_UFFD_WP)
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_WP;
+ if (reason & VM_UFFD_RWP)
+ msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_RWP;
if (reason & VM_UFFD_MINOR)
msg.arg.pagefault.flags |= UFFD_PAGEFAULT_FLAG_MINOR;
if (features & UFFD_FEATURE_THREAD_ID)
@@ -1292,6 +1294,22 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
vm_flags |= VM_UFFD_WP;
}
+ if (uffdio_register.mode & UFFDIO_REGISTER_MODE_RWP) {
+ if (!pgtable_supports_uffd() || VM_UFFD_RWP == VM_NONE)
+ goto out;
+ if (!(ctx->features & UFFD_FEATURE_RWP))
+ goto out;
+ vm_flags |= VM_UFFD_RWP;
+ }
+
+ /*
+ * WP and RWP share the uffd PTE bit and
+ * cannot coexist in the same VMA — the bit would carry ambiguous
+ * semantics. Reject the combination up front.
+ */
+ if ((vm_flags & VM_UFFD_WP) && (vm_flags & VM_UFFD_RWP))
+ goto out;
+
if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) {
#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_MINOR
goto out;
@@ -1385,6 +1403,16 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
cur->vm_userfaultfd_ctx.ctx != ctx)
goto out_unlock;
+ /*
+ * Mode switches that drop VM_UFFD_WP or VM_UFFD_RWP would
+ * leave PTE markers without the flag that describes them;
+ * subsequent mprotect() would then promote stale markers
+ * into the other mode. Require an unregister first.
+ */
+ if (cur->vm_userfaultfd_ctx.ctx == ctx &&
+ cur->vm_flags & (VM_UFFD_WP | VM_UFFD_RWP) & ~vm_flags)
+ goto out_unlock;
+
/*
* Note vmas containing huge pages
*/
@@ -1418,6 +1446,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
+ /* RWPROTECT is only supported for RWP ranges */
+ if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_RWP))
+ ioctls_out &= ~((__u64)1 << _UFFDIO_RWPROTECT);
+
/*
* Now that we scanned all vmas we can already tell
* userland which ioctls methods are guaranteed to
@@ -1765,6 +1797,55 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
return ret;
}
+static int userfaultfd_rwprotect(struct userfaultfd_ctx *ctx,
+ unsigned long arg)
+{
+ int ret;
+ struct uffdio_rwprotect uffdio_rwp;
+ struct userfaultfd_wake_range range;
+ bool mode_rwp, mode_dontwake;
+
+ if (atomic_read(&ctx->mmap_changing))
+ return -EAGAIN;
+
+ if (copy_from_user(&uffdio_rwp, (void __user *)arg,
+ sizeof(uffdio_rwp)))
+ return -EFAULT;
+
+ ret = validate_range(ctx->mm, uffdio_rwp.range.start,
+ uffdio_rwp.range.len);
+ if (ret)
+ return ret;
+
+ if (uffdio_rwp.mode & ~(UFFDIO_RWPROTECT_MODE_DONTWAKE |
+ UFFDIO_RWPROTECT_MODE_RWP))
+ return -EINVAL;
+
+ mode_rwp = uffdio_rwp.mode & UFFDIO_RWPROTECT_MODE_RWP;
+ mode_dontwake = uffdio_rwp.mode & UFFDIO_RWPROTECT_MODE_DONTWAKE;
+
+ if (mode_rwp && mode_dontwake)
+ return -EINVAL;
+
+ if (mmget_not_zero(ctx->mm)) {
+ ret = mrwprotect_range(ctx, uffdio_rwp.range.start,
+ uffdio_rwp.range.len, mode_rwp);
+ mmput(ctx->mm);
+ } else {
+ return -ESRCH;
+ }
+
+ if (ret)
+ return ret;
+
+ if (!mode_rwp && !mode_dontwake) {
+ range.start = uffdio_rwp.range.start;
+ range.len = uffdio_rwp.range.len;
+ wake_userfault(ctx, &range);
+ }
+ return ret;
+}
+
static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
{
__s64 ret;
@@ -2071,6 +2152,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
case UFFDIO_POISON:
ret = userfaultfd_poison(ctx, arg);
break;
+ case UFFDIO_RWPROTECT:
+ ret = userfaultfd_rwprotect(ctx, arg);
+ break;
}
return ret;
}
diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h
index 07766398d592..d46974be864e 100644
--- a/include/linux/userfaultfd_k.h
+++ b/include/linux/userfaultfd_k.h
@@ -162,6 +162,8 @@ extern int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
unsigned long len, bool enable_wp);
extern long uffd_wp_range(struct vm_area_struct *vma,
unsigned long start, unsigned long len, bool enable_wp);
+extern int mrwprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, bool enable_rwp);
/* move_pages */
void double_pt_lock(spinlock_t *ptl1, spinlock_t *ptl2);
diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h
index 2841e4ea8f2c..7b78aa3b5318 100644
--- a/include/uapi/linux/userfaultfd.h
+++ b/include/uapi/linux/userfaultfd.h
@@ -79,6 +79,7 @@
#define _UFFDIO_WRITEPROTECT (0x06)
#define _UFFDIO_CONTINUE (0x07)
#define _UFFDIO_POISON (0x08)
+#define _UFFDIO_RWPROTECT (0x09)
#define _UFFDIO_API (0x3F)
/* userfaultfd ioctl ids */
@@ -103,6 +104,8 @@
struct uffdio_continue)
#define UFFDIO_POISON _IOWR(UFFDIO, _UFFDIO_POISON, \
struct uffdio_poison)
+#define UFFDIO_RWPROTECT _IOWR(UFFDIO, _UFFDIO_RWPROTECT, \
+ struct uffdio_rwprotect)
/* read() structure */
struct uffd_msg {
@@ -158,6 +161,7 @@ struct uffd_msg {
#define UFFD_PAGEFAULT_FLAG_WRITE (1<<0) /* If this was a write fault */
#define UFFD_PAGEFAULT_FLAG_WP (1<<1) /* If reason is VM_UFFD_WP */
#define UFFD_PAGEFAULT_FLAG_MINOR (1<<2) /* If reason is VM_UFFD_MINOR */
+#define UFFD_PAGEFAULT_FLAG_RWP (1<<3) /* If reason is VM_UFFD_RWP */
struct uffdio_api {
/* userland asks for an API number and the features to enable */
@@ -230,6 +234,11 @@ struct uffdio_api {
*
* UFFD_FEATURE_MOVE indicates that the kernel supports moving an
* existing page contents from userspace.
+ *
+ * UFFD_FEATURE_RWP indicates that the kernel supports
+ * UFFDIO_REGISTER_MODE_RWP for read-write protection tracking.
+ * Pages are made inaccessible via UFFDIO_RWPROTECT and faults
+ * are delivered when the pages are re-accessed.
*/
#define UFFD_FEATURE_PAGEFAULT_FLAG_WP (1<<0)
#define UFFD_FEATURE_EVENT_FORK (1<<1)
@@ -248,6 +257,7 @@ struct uffdio_api {
#define UFFD_FEATURE_POISON (1<<14)
#define UFFD_FEATURE_WP_ASYNC (1<<15)
#define UFFD_FEATURE_MOVE (1<<16)
+#define UFFD_FEATURE_RWP (1<<17)
__u64 features;
__u64 ioctls;
@@ -263,6 +273,7 @@ struct uffdio_register {
#define UFFDIO_REGISTER_MODE_MISSING ((__u64)1<<0)
#define UFFDIO_REGISTER_MODE_WP ((__u64)1<<1)
#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1<<2)
+#define UFFDIO_REGISTER_MODE_RWP ((__u64)1<<3)
__u64 mode;
/*
@@ -356,6 +367,14 @@ struct uffdio_poison {
__s64 updated;
};
+struct uffdio_rwprotect {
+ struct uffdio_range range;
+ /* !RWP means undo RWP-protection */
+#define UFFDIO_RWPROTECT_MODE_RWP ((__u64)1<<0)
+#define UFFDIO_RWPROTECT_MODE_DONTWAKE ((__u64)1<<1)
+ __u64 mode;
+};
+
struct uffdio_move {
__u64 dst;
__u64 src;
diff --git a/mm/Kconfig b/mm/Kconfig
index e8bf1e9e6ad9..ccf534a8cbc9 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -1347,6 +1347,15 @@ config HAVE_ARCH_USERFAULTFD_MINOR
help
Arch has userfaultfd minor fault support
+config USERFAULTFD_RWP
+ def_bool y
+ depends on 64BIT && ARCH_HAS_PTE_PROTNONE && HAVE_ARCH_USERFAULTFD_WP
+ help
+ Userfaultfd read-write protection (UFFDIO_RWPROTECT) delivers a
+ userfaultfd notification on every access -- read or write -- to a
+ protected range, letting userspace observe the working set of a
+ process.
+
menuconfig USERFAULTFD
bool "Enable userfaultfd() system call"
depends on MMU
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index d4a1d340dab3..c13452cb092b 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1072,6 +1072,75 @@ int mwriteprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
return err;
}
+int mrwprotect_range(struct userfaultfd_ctx *ctx, unsigned long start,
+ unsigned long len, bool enable_rwp)
+{
+ struct mm_struct *dst_mm = ctx->mm;
+ unsigned long end = start + len;
+ struct vm_area_struct *dst_vma;
+ unsigned int mm_cp_flags;
+ struct mmu_gather tlb;
+ bool found = false;
+ VMA_ITERATOR(vmi, dst_mm, start);
+
+ VM_WARN_ON_ONCE(start & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(len & ~PAGE_MASK);
+ VM_WARN_ON_ONCE(start + len <= start);
+
+ guard(mmap_read_lock)(dst_mm);
+ guard(rwsem_read)(&ctx->map_changing_lock);
+
+ if (atomic_read(&ctx->mmap_changing))
+ return -EAGAIN;
+
+ if (enable_rwp)
+ mm_cp_flags = MM_CP_UFFD_RWP;
+ else
+ mm_cp_flags = MM_CP_UFFD_RWP_RESOLVE;
+
+ /*
+ * Pre-scan the range: validate every spanned VMA before applying
+ * any change_protection() so a partial failure cannot leave the
+ * process with only a prefix of the range re-protected.
+ */
+ for_each_vma_range(vmi, dst_vma, end) {
+ if (!userfaultfd_rwp(dst_vma))
+ return -ENOENT;
+
+ if (is_vm_hugetlb_page(dst_vma)) {
+ unsigned long page_mask;
+
+ page_mask = vma_kernel_pagesize(dst_vma) - 1;
+ if ((start & page_mask) || (len & page_mask))
+ return -EINVAL;
+ }
+ found = true;
+ }
+ if (!found)
+ return -ENOENT;
+
+ vma_iter_set(&vmi, start);
+ tlb_gather_mmu(&tlb, dst_mm);
+ for_each_vma_range(vmi, dst_vma, end) {
+ unsigned long vma_start = max(dst_vma->vm_start, start);
+ unsigned long vma_end = min(dst_vma->vm_end, end);
+ unsigned int flags = mm_cp_flags;
+
+ /*
+ * On resolve, try to upgrade writability per-VMA --
+ * MM_CP_TRY_CHANGE_WRITABLE WARNs in
+ * maybe_change_pte_writable() if the VMA is not VM_WRITE,
+ * and RWP can be registered on PROT_READ-only mappings.
+ */
+ if (!enable_rwp && vma_wants_manual_pte_write_upgrade(dst_vma))
+ flags |= MM_CP_TRY_CHANGE_WRITABLE;
+
+ change_protection(&tlb, dst_vma, vma_start, vma_end, flags);
+ }
+ tlb_finish_mmu(&tlb);
+
+ return 0;
+}
void double_pt_lock(spinlock_t *ptl1,
spinlock_t *ptl2)
@@ -2109,9 +2178,22 @@ struct vm_area_struct *userfaultfd_clear_vma(struct vma_iterator *vmi,
if (start == vma->vm_start && end == vma->vm_end)
give_up_on_oom = true;
- /* Reset ptes for the whole vma range if wr-protected */
- if (userfaultfd_wp(vma))
- uffd_wp_range(vma, start, end - start, false);
+ /* Clear the uffd bit and/or restore protnone PTEs */
+ if (userfaultfd_protected(vma)) {
+ unsigned int mm_cp_flags = 0;
+ struct mmu_gather tlb;
+
+ if (userfaultfd_wp(vma))
+ mm_cp_flags |= MM_CP_UFFD_WP_RESOLVE;
+ if (userfaultfd_rwp(vma))
+ mm_cp_flags |= MM_CP_UFFD_RWP_RESOLVE;
+ if (vma_wants_manual_pte_write_upgrade(vma))
+ mm_cp_flags |= MM_CP_TRY_CHANGE_WRITABLE;
+
+ tlb_gather_mmu(&tlb, vma->vm_mm);
+ change_protection(&tlb, vma, start, end, mm_cp_flags);
+ tlb_finish_mmu(&tlb);
+ }
ret = vma_modify_flags_uffd(vmi, prev, vma, start, end,
&new_vma_flags, NULL_VM_UFFD_CTX,
@@ -2160,6 +2242,14 @@ int userfaultfd_register_range(struct userfaultfd_ctx *ctx,
vma_test_all_mask(vma, vma_flags))
goto skip;
+ /*
+ * Pre-scan in userfaultfd_register() already rejected mode
+ * switches that would drop VM_UFFD_WP or VM_UFFD_RWP, so a
+ * stray bit here is a bug.
+ */
+ VM_WARN_ON_ONCE(vma->vm_userfaultfd_ctx.ctx == ctx &&
+ vma->vm_flags & (VM_UFFD_WP | VM_UFFD_RWP) & ~vm_flags);
+
if (vma->vm_start > start)
start = vma->vm_start;
vma_end = min(end, vma->vm_end);
--
2.51.2
^ permalink raw reply related
* [PATCH v3 07/16] mm: handle VM_UFFD_RWP in khugepaged, rmap, and GUP
From: Kiryl Shutsemau @ 2026-05-22 13:38 UTC (permalink / raw)
To: akpm, rppt, peterx, david
Cc: ljs, surenb, vbabka, Liam.Howlett, ziy, corbet, skhan, seanjc,
pbonzini, jthoughton, aarcange, sj, usama.arif, linux-mm,
linux-kernel, linux-doc, linux-kselftest, kvm, kernel-team,
linux-man, alx, Kiryl Shutsemau (Meta)
In-Reply-To: <20260522133857.552279-1-kirill@shutemov.name>
From: "Kiryl Shutsemau (Meta)" <kas@kernel.org>
Three mm paths outside the fault handler gate on the uffd PTE bit
today: khugepaged (skip collapse on ranges carrying markers), rmap
(cap unmap batching), and GUP (force a fault through
gup_can_follow_protnone). Extend each to treat VM_UFFD_RWP the same
as VM_UFFD_WP; otherwise per-PTE RWP state is silently destroyed or
bypassed.
khugepaged: try_collapse_pte_mapped_thp() and
file_backed_vma_is_retractable() already refuse to collapse or
retract page tables on ranges carrying the uffd PTE bit. Broaden the
VMA predicate from userfaultfd_wp() to userfaultfd_protected() so
VM_UFFD_RWP ranges get the same protection. hpage_collapse_scan_pmd()
needs no change — its existing pte_uffd() check already catches an
RWP PTE because it carries the uffd bit.
rmap: folio_unmap_pte_batch() caps batching at 1 for VM_UFFD_RWP so
the restore path handles each PTE with its own marker.
GUP: gup_can_follow_protnone() forces a fault on VM_UFFD_RWP VMAs
regardless of FOLL_HONOR_NUMA_FAULT. RWP uses protnone as an
access-tracking marker, not for NUMA hinting, so any GUP — read or
write — must go through the userfaultfd fault path.
Signed-off-by: Kiryl Shutsemau <kas@kernel.org>
Assisted-by: Claude:claude-opus-4-6
Acked-by: Mike Rapoport (Microsoft) <rppt@kernel.org>
---
include/linux/mm.h | 10 +++++++++-
mm/khugepaged.c | 18 +++++++++++-------
mm/rmap.c | 2 +-
3 files changed, 21 insertions(+), 9 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 0598b1482aeb..0ecfb0973b01 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -4605,11 +4605,19 @@ static inline int vm_fault_to_errno(vm_fault_t vm_fault, int foll_flags)
/*
* Indicates whether GUP can follow a PROT_NONE mapped page, or whether
- * a (NUMA hinting) fault is required.
+ * a (NUMA hinting or userfaultfd RWP) fault is required.
*/
static inline bool gup_can_follow_protnone(const struct vm_area_struct *vma,
unsigned int flags)
{
+ /*
+ * VM_UFFD_RWP uses protnone as an access-tracking marker, not for
+ * NUMA hinting. GUP must always take a fault so the access is
+ * delivered to userfaultfd, regardless of FOLL_HONOR_NUMA_FAULT.
+ */
+ if (vma->vm_flags & VM_UFFD_RWP)
+ return false;
+
/*
* If callers don't want to honor NUMA hinting faults, no need to
* determine if we would actually have to trigger a NUMA hinting fault.
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index de0644bde400..a798c542c849 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1532,8 +1532,11 @@ static enum scan_result try_collapse_pte_mapped_thp(struct mm_struct *mm, unsign
if (!thp_vma_allowable_order(vma, vma->vm_flags, TVA_FORCED_COLLAPSE, PMD_ORDER))
return SCAN_VMA_CHECK;
- /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */
- if (userfaultfd_wp(vma))
+ /*
+ * Keep pmd pgtable while the uffd bit is in use; see comment in
+ * retract_page_tables().
+ */
+ if (userfaultfd_protected(vma))
return SCAN_PTE_UFFD;
folio = filemap_lock_folio(vma->vm_file->f_mapping,
@@ -1746,13 +1749,14 @@ static bool file_backed_vma_is_retractable(struct vm_area_struct *vma)
return false;
/*
- * When a vma is registered with uffd-wp, we cannot recycle
+ * When a vma is registered with uffd-wp or RWP, we cannot recycle
* the page table because there may be pte markers installed.
- * Other vmas can still have the same file mapped hugely, but
- * skip this one: it will always be mapped in small page size
- * for uffd-wp registered ranges.
+ * VM_UFFD_RWP ranges similarly rely on per-PTE uffd state
+ * and cannot be recycled to a shared PMD. Other vmas can still
+ * have the same file mapped hugely, but skip this one: it will
+ * always be mapped in small page size for these registrations.
*/
- if (userfaultfd_wp(vma))
+ if (userfaultfd_protected(vma))
return false;
/*
diff --git a/mm/rmap.c b/mm/rmap.c
index 05056c213203..1426d1ece917 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1965,7 +1965,7 @@ static inline unsigned int folio_unmap_pte_batch(struct folio *folio,
if (pte_unused(pte))
return 1;
- if (userfaultfd_wp(vma))
+ if (userfaultfd_protected(vma))
return 1;
/*
--
2.51.2
^ permalink raw reply related
page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox