Linux-ARM-Kernel Archive on lore.kernel.org

Linux-ARM-Kernel Archive on lore.kernel.org
 help / color / mirror / Atom feed

* [PATCH v20 06/14] dmaengine: qcom: bam_dma: add support for BAM locking
From: Bartosz Golaszewski @ 2026-06-29 10:01 UTC (permalink / raw)
  To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
	David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
	Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
	Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
	Neil Armstrong
  Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
	linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260629-qcom-qce-cmd-descr-v20-0-56f67da84c05@oss.qualcomm.com>

Add support for BAM pipe locking. To that end: when starting DMA on an RX
channel - prepend the existing queue of issued descriptors with an
additional "dummy" command descriptor with the LOCK bit set. Once the
transaction is done (no more issued descriptors), issue one more dummy
descriptor with the UNLOCK bit.

We *must* wait until the transaction is signalled as done because we
must not perform any writes into config registers while the engine is
busy.

The dummy writes must be issued into a scratchpad register of the client
so provide a mechanism to communicate the right address via descriptor
metadata.

Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/dma/qcom/bam_dma.c       | 189 +++++++++++++++++++++++++++++++++++++--
 include/linux/dma/qcom_bam_dma.h |  14 +++
 2 files changed, 196 insertions(+), 7 deletions(-)

diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index f3e713a5259c2c7c24cfdcec094814eb1202971a..f4f258994264a234f60debd3e66e31a6b35d1dc5 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -28,11 +28,13 @@
 #include <linux/clk.h>
 #include <linux/device.h>
 #include <linux/dma-mapping.h>
+#include <linux/dma/qcom_bam_dma.h>
 #include <linux/dmaengine.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
+#include <linux/lockdep.h>
 #include <linux/module.h>
 #include <linux/of_address.h>
 #include <linux/of_dma.h>
@@ -60,6 +62,8 @@ struct bam_desc_hw {
 #define DESC_FLAG_EOB BIT(13)
 #define DESC_FLAG_NWD BIT(12)
 #define DESC_FLAG_CMD BIT(11)
+#define DESC_FLAG_LOCK BIT(10)
+#define DESC_FLAG_UNLOCK BIT(9)
 
 struct bam_async_desc {
 	struct virt_dma_desc vd;
@@ -72,6 +76,10 @@ struct bam_async_desc {
 
 	struct bam_desc_hw *curr_desc;
 
+	/* BAM locking infrastructure */
+	struct scatterlist lock_sg;
+	struct bam_cmd_element lock_ce;
+
 	/* list node for the desc in the bam_chan list of descriptors */
 	struct list_head desc_node;
 	enum dma_transfer_direction dir;
@@ -425,6 +433,11 @@ struct bam_chan {
 	struct list_head desc_list;
 
 	struct list_head node;
+
+	/* BAM locking infrastructure */
+	phys_addr_t scratchpad_addr;
+	enum dma_transfer_direction direction;
+	bool bam_locked;
 };
 
 static inline struct bam_chan *to_bam_chan(struct dma_chan *common)
@@ -638,8 +651,10 @@ static void bam_free_chan(struct dma_chan *chan)
 		goto err;
 	}
 
-	scoped_guard(spinlock_irqsave, &bchan->vc.lock)
+	scoped_guard(spinlock_irqsave, &bchan->vc.lock) {
 		bam_reset_channel(bchan);
+		bchan->bam_locked = false;
+	}
 
 	dma_free_wc(bdev->dev, BAM_DESC_FIFO_SIZE, bchan->fifo_virt,
 		    bchan->fifo_phys);
@@ -686,6 +701,35 @@ static int bam_slave_config(struct dma_chan *chan,
 	return 0;
 }
 
+static int bam_metadata_attach(struct dma_async_tx_descriptor *desc, void *data, size_t len)
+{
+	struct bam_chan *bchan = to_bam_chan(desc->chan);
+	const struct bam_device_data *bdata = bchan->bdev->dev_data;
+	struct bam_desc_metadata *metadata = data;
+
+	if (!data)
+		return -EINVAL;
+
+	if (!bdata->pipe_lock_supported)
+		/*
+		 * The client wants to use locking but this BAM version doesn't
+		 * support it. Don't return an error here as this will stop the
+		 * client from using DMA at all for no reason.
+		 */
+		return 0;
+
+	guard(spinlock_irqsave)(&bchan->vc.lock);
+
+	bchan->scratchpad_addr = metadata->scratchpad_addr;
+	bchan->direction = metadata->direction;
+
+	return 0;
+}
+
+static const struct dma_descriptor_metadata_ops bam_metadata_ops = {
+	.attach = bam_metadata_attach,
+};
+
 /**
  * bam_prep_slave_sg - Prep slave sg transaction
  *
@@ -702,6 +746,7 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
 	void *context)
 {
 	struct bam_chan *bchan = to_bam_chan(chan);
+	struct dma_async_tx_descriptor *tx_desc;
 	struct bam_device *bdev = bchan->bdev;
 	struct bam_async_desc *async_desc;
 	struct scatterlist *sg;
@@ -757,7 +802,10 @@ static struct dma_async_tx_descriptor *bam_prep_slave_sg(struct dma_chan *chan,
 		} while (remainder > 0);
 	}
 
-	return vchan_tx_prep(&bchan->vc, &async_desc->vd, flags);
+	tx_desc = vchan_tx_prep(&bchan->vc, &async_desc->vd, flags);
+	tx_desc->metadata_ops = &bam_metadata_ops;
+
+	return tx_desc;
 }
 
 /**
@@ -802,6 +850,7 @@ static int bam_dma_terminate_all(struct dma_chan *chan)
 		}
 
 		vchan_get_all_descriptors(&bchan->vc, &head);
+		bchan->bam_locked = false;
 	}
 
 	vchan_dma_desc_free_list(&bchan->vc, &head);
@@ -859,6 +908,15 @@ static int bam_resume(struct dma_chan *chan)
 	return 0;
 }
 
+static void bam_dma_free_lock_desc(struct virt_dma_desc *vd)
+{
+	struct bam_async_desc *async_desc = container_of(vd, struct bam_async_desc, vd);
+	struct dma_chan *chan = vd->tx.chan;
+
+	dma_unmap_sg(chan->slave, &async_desc->lock_sg, 1, DMA_TO_DEVICE);
+	kfree(async_desc);
+}
+
 /**
  * process_channel_irqs - processes the channel interrupts
  * @bdev: bam controller
@@ -919,13 +977,23 @@ static u32 process_channel_irqs(struct bam_device *bdev)
 			 * push back to front of desc_issued so that
 			 * it gets restarted by the work queue.
 			 */
+
+			list_del(&async_desc->desc_node);
 			if (!async_desc->num_desc) {
-				vchan_cookie_complete(&async_desc->vd);
+				struct bam_desc_hw *hdesc = async_desc->desc;
+				u16 flags = le16_to_cpu(hdesc->flags);
+
+				if (flags & (DESC_FLAG_LOCK | DESC_FLAG_UNLOCK)) {
+					if (flags & DESC_FLAG_UNLOCK)
+						bchan->bam_locked = false;
+					bam_dma_free_lock_desc(&async_desc->vd);
+				} else {
+					vchan_cookie_complete(&async_desc->vd);
+				}
 			} else {
 				list_add(&async_desc->vd.node,
 					 &bchan->vc.desc_issued);
 			}
-			list_del(&async_desc->desc_node);
 		}
 	}
 
@@ -1046,13 +1114,101 @@ static void bam_apply_new_config(struct bam_chan *bchan,
 	bchan->reconfigure = 0;
 }
 
+static struct bam_async_desc *
+bam_make_lock_desc(struct bam_chan *bchan, unsigned long flag)
+{
+	struct dma_chan *chan = &bchan->vc.chan;
+	struct bam_async_desc *async_desc;
+	struct bam_desc_hw *desc;
+	struct virt_dma_desc *vd;
+	struct virt_dma_chan *vc;
+	unsigned int mapped;
+
+	async_desc = kzalloc_flex(*async_desc, desc, 1, GFP_NOWAIT);
+	if (!async_desc) {
+		dev_err(bchan->bdev->dev, "failed to allocate the BAM lock descriptor\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	sg_init_table(&async_desc->lock_sg, 1);
+
+	async_desc->num_desc = 1;
+	async_desc->curr_desc = async_desc->desc;
+	async_desc->dir = DMA_MEM_TO_DEV;
+
+	desc = async_desc->desc;
+
+	bam_prep_ce_le32(&async_desc->lock_ce, bchan->scratchpad_addr, BAM_WRITE_COMMAND, 0);
+	sg_set_buf(&async_desc->lock_sg, &async_desc->lock_ce, sizeof(async_desc->lock_ce));
+
+	mapped = dma_map_sg(chan->slave, &async_desc->lock_sg, 1, DMA_TO_DEVICE);
+	if (!mapped) {
+		kfree(async_desc);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	desc->flags |= cpu_to_le16(DESC_FLAG_CMD | flag);
+	desc->addr = sg_dma_address(&async_desc->lock_sg);
+	desc->size = cpu_to_le16(sizeof(struct bam_cmd_element));
+
+	vc = &bchan->vc;
+	vd = &async_desc->vd;
+
+	dma_async_tx_descriptor_init(&vd->tx, &vc->chan);
+	vd->tx.flags = DMA_PREP_CMD;
+	vd->tx_result.result = DMA_TRANS_NOERROR;
+	vd->tx_result.residue = 0;
+
+	return async_desc;
+}
+
+static int bam_setup_pipe_lock(struct bam_chan *bchan)
+{
+	const struct bam_device_data *bdata = bchan->bdev->dev_data;
+	struct bam_async_desc *lock_desc, *unlock_desc;
+
+	lockdep_assert_held(&bchan->vc.lock);
+
+	if (!bdata->pipe_lock_supported || !bchan->scratchpad_addr ||
+	    bchan->direction != DMA_MEM_TO_DEV)
+		return 0;
+
+	/*
+	 * Allocate both the LOCK and the UNLOCK descriptors up-front so the
+	 * operation is all-or-nothing: if either allocation fails we free both
+	 * and run the sequence unlocked rather than leave the pipe locked with
+	 * no matching UNLOCK.
+	 *
+	 * Both are queued in-band around the currently issued work: the LOCK is
+	 * prepended so it enters the FIFO first, the UNLOCK is appended so it is
+	 * the last descriptor of the sequence. They are loaded together with the
+	 * payload in a single operation so the engine executes LOCK, the work
+	 * and UNLOCK as one ordered batch.
+	 */
+	lock_desc = bam_make_lock_desc(bchan, DESC_FLAG_LOCK);
+	if (IS_ERR(lock_desc))
+		return PTR_ERR(lock_desc);
+
+	unlock_desc = bam_make_lock_desc(bchan, DESC_FLAG_UNLOCK);
+	if (IS_ERR(unlock_desc)) {
+		bam_dma_free_lock_desc(&lock_desc->vd);
+		return PTR_ERR(unlock_desc);
+	}
+
+	list_add(&lock_desc->vd.node, &bchan->vc.desc_issued);
+	list_add_tail(&unlock_desc->vd.node, &bchan->vc.desc_issued);
+	bchan->bam_locked = true;
+
+	return 0;
+}
+
 /**
  * bam_start_dma - start next transaction
  * @bchan: bam dma channel
  */
 static void bam_start_dma(struct bam_chan *bchan)
 {
-	struct virt_dma_desc *vd = vchan_next_desc(&bchan->vc);
+	struct virt_dma_desc *vd;
 	struct bam_device *bdev = bchan->bdev;
 	struct bam_async_desc *async_desc = NULL;
 	struct bam_desc_hw *desc;
@@ -1064,9 +1220,23 @@ static void bam_start_dma(struct bam_chan *bchan)
 
 	lockdep_assert_held(&bchan->vc.lock);
 
+	vd = vchan_next_desc(&bchan->vc);
 	if (!vd)
 		return;
 
+	/*
+	 * Wrap the issued work with a LOCK/UNLOCK pair exactly once, at the
+	 * start of a fresh sequence and only when there is real work to lock
+	 * around. On a re-entry after a full FIFO, we see the BAM is locked
+	 * and must not add another pair; we simply continue loading the
+	 * remainder of the same locked sequence.
+	 */
+	if (!bchan->bam_locked) {
+		ret = bam_setup_pipe_lock(bchan);
+		if (ret == 0 && bchan->bam_locked)
+			vd = vchan_next_desc(&bchan->vc);
+	}
+
 	ret = pm_runtime_get_sync(bdev->dev);
 	if (ret < 0)
 		return;
@@ -1191,8 +1361,12 @@ static void bam_issue_pending(struct dma_chan *chan)
  */
 static void bam_dma_free_desc(struct virt_dma_desc *vd)
 {
-	struct bam_async_desc *async_desc = container_of(vd,
-			struct bam_async_desc, vd);
+	struct bam_async_desc *async_desc = container_of(vd, struct bam_async_desc, vd);
+	struct bam_desc_hw *desc = async_desc->desc;
+	struct dma_chan *chan = vd->tx.chan;
+
+	if (le16_to_cpu(desc->flags) & (DESC_FLAG_LOCK | DESC_FLAG_UNLOCK))
+		dma_unmap_sg(chan->slave, &async_desc->lock_sg, 1, DMA_TO_DEVICE);
 
 	kfree(async_desc);
 }
@@ -1384,6 +1558,7 @@ static int bam_dma_probe(struct platform_device *pdev)
 	bdev->common.device_terminate_all = bam_dma_terminate_all;
 	bdev->common.device_issue_pending = bam_issue_pending;
 	bdev->common.device_tx_status = bam_tx_status;
+	bdev->common.desc_metadata_modes = DESC_METADATA_CLIENT;
 	bdev->common.dev = bdev->dev;
 
 	ret = dma_async_device_register(&bdev->common);
diff --git a/include/linux/dma/qcom_bam_dma.h b/include/linux/dma/qcom_bam_dma.h
index 68fc0e643b1b97fe4520d5878daa322b81f4f559..a2594264b0f58c4b2b1c85e243cad0d5669c26dc 100644
--- a/include/linux/dma/qcom_bam_dma.h
+++ b/include/linux/dma/qcom_bam_dma.h
@@ -6,6 +6,8 @@
 #ifndef _QCOM_BAM_DMA_H
 #define _QCOM_BAM_DMA_H
 
+#include <linux/dmaengine.h>
+
 #include <asm/byteorder.h>
 
 /*
@@ -34,6 +36,18 @@ enum bam_command_type {
 	BAM_READ_COMMAND,
 };
 
+/**
+ * struct bam_desc_metadata - DMA descriptor metadata specific to the BAM driver.
+ *
+ * @scratchpad_addr: Physical address to use for dummy write operations when
+ *                   queuing command descriptors with LOCK/UNLOCK bits set.
+ * @direction: Transfer direction of this channel.
+ */
+struct bam_desc_metadata {
+	phys_addr_t scratchpad_addr;
+	enum dma_transfer_direction direction;
+};
+
 /*
  * prep_bam_ce_le32 - Wrapper function to prepare a single BAM command
  * element with the data already in le32 format.

-- 
2.47.3



^ permalink raw reply related

* [PATCH v20 02/14] dmaengine: qcom: bam_dma: free interrupt before the clock in error path
From: Bartosz Golaszewski @ 2026-06-29 10:01 UTC (permalink / raw)
  To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
	David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
	Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
	Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
	Neil Armstrong
  Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
	linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260629-qcom-qce-cmd-descr-v20-0-56f67da84c05@oss.qualcomm.com>

The BAM interrupt is requested with a devres helper, so on error it is
freed only after probe() returns. We disable the clock before freeing or
masking the interrupt, so it may still fire and we may end up reading
BAM registers with the clock disabled.

Stop using devres for interrupts as we free it in remove() manually
anyway. Add an appropriate label and free the interrupt before disabling
the clock in error path and in remove().

Fixes: e7c0fe2a5c84 ("dmaengine: add Qualcomm BAM dma driver")
Closes: https://sashiko.dev/#/patchset/20260427-qcom-qce-cmd-descr-v16-0-945fd1cafbbc%40oss.qualcomm.com?part=2
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/dma/qcom/bam_dma.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index 1bb26af0405f3a16f97e0d4b86c945c252d97f57..fc155e0d1870cbb7e099a2c4280f9f8fbdf6cf15 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -1332,8 +1332,7 @@ static int bam_dma_probe(struct platform_device *pdev)
 	for (i = 0; i < bdev->num_channels; i++)
 		bam_channel_init(bdev, &bdev->channels[i], i);
 
-	ret = devm_request_irq(bdev->dev, bdev->irq, bam_dma_irq,
-			IRQF_TRIGGER_HIGH, "bam_dma", bdev);
+	ret = request_irq(bdev->irq, bam_dma_irq, IRQF_TRIGGER_HIGH, "bam_dma", bdev);
 	if (ret)
 		goto err_bam_channel_exit;
 
@@ -1366,7 +1365,7 @@ static int bam_dma_probe(struct platform_device *pdev)
 	ret = dma_async_device_register(&bdev->common);
 	if (ret) {
 		dev_err(bdev->dev, "failed to register dma async device\n");
-		goto err_bam_channel_exit;
+		goto err_free_irq;
 	}
 
 	ret = of_dma_controller_register(pdev->dev.of_node, bam_dma_xlate,
@@ -1385,6 +1384,8 @@ static int bam_dma_probe(struct platform_device *pdev)
 
 err_unregister_dma:
 	dma_async_device_unregister(&bdev->common);
+err_free_irq:
+	free_irq(bdev->irq, bdev);
 err_bam_channel_exit:
 	for (i = 0; i < bdev->num_channels; i++)
 		tasklet_kill(&bdev->channels[i].vc.task);
@@ -1401,6 +1402,8 @@ static void bam_dma_remove(struct platform_device *pdev)
 	struct bam_device *bdev = platform_get_drvdata(pdev);
 	u32 i;
 
+	free_irq(bdev->irq, bdev);
+
 	pm_runtime_force_suspend(&pdev->dev);
 
 	of_dma_controller_free(pdev->dev.of_node);
@@ -1409,8 +1412,6 @@ static void bam_dma_remove(struct platform_device *pdev)
 	/* mask all interrupts for this execution environment */
 	writel_relaxed(0, bam_addr(bdev, 0,  BAM_IRQ_SRCS_MSK_EE));
 
-	devm_free_irq(bdev->dev, bdev->irq, bdev);
-
 	for (i = 0; i < bdev->num_channels; i++) {
 		bam_dma_terminate_all(&bdev->channels[i].vc.chan);
 		tasklet_kill(&bdev->channels[i].vc.task);

-- 
2.47.3



^ permalink raw reply related

* [PATCH v20 08/14] crypto: qce - Include algapi.h in the core.h header
From: Bartosz Golaszewski @ 2026-06-29 10:01 UTC (permalink / raw)
  To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
	David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
	Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
	Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
	Neil Armstrong
  Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
	linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260629-qcom-qce-cmd-descr-v20-0-56f67da84c05@oss.qualcomm.com>

From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>

The header defines a struct embedding struct crypto_queue whose size
needs to be known and which is defined in crypto/algapi.h. Move the
inclusion from core.c to core.h.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/crypto/qce/core.c | 1 -
 drivers/crypto/qce/core.h | 1 +
 2 files changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c
index f671946cf7351cd5f0c319909bafd87e3af701c7..ad37c2b8ae53a373bb248aff06c3b7946e8439a8 100644
--- a/drivers/crypto/qce/core.c
+++ b/drivers/crypto/qce/core.c
@@ -13,7 +13,6 @@
 #include <linux/mod_devicetable.h>
 #include <linux/platform_device.h>
 #include <linux/types.h>
-#include <crypto/algapi.h>
 #include <crypto/internal/hash.h>
 
 #include "core.h"
diff --git a/drivers/crypto/qce/core.h b/drivers/crypto/qce/core.h
index eb6fa7a8b64a81daf9ad5304a3ae4e5e597a70b8..f092ce2d3b04a936a37805c20ac5ba78d8fdd2df 100644
--- a/drivers/crypto/qce/core.h
+++ b/drivers/crypto/qce/core.h
@@ -8,6 +8,7 @@
 
 #include <linux/mutex.h>
 #include <linux/workqueue.h>
+#include <crypto/algapi.h>
 
 #include "dma.h"
 

-- 
2.47.3



^ permalink raw reply related

* [PATCH v20 11/14] crypto: qce - Use existing devres APIs in devm_qce_dma_request()
From: Bartosz Golaszewski @ 2026-06-29 10:01 UTC (permalink / raw)
  To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
	David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
	Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
	Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
	Neil Armstrong
  Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
	linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski,
	Konrad Dybcio
In-Reply-To: <20260629-qcom-qce-cmd-descr-v20-0-56f67da84c05@oss.qualcomm.com>

From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>

Switch to devm_kmalloc() and devm_dma_request_chan() in
devm_qce_dma_request(). This allows us to drop two labels and shrink the
function.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Konrad Dybcio <konrad.dybcio@oss.qualcomm.com>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/crypto/qce/dma.c | 37 +++++++++++--------------------------
 1 file changed, 11 insertions(+), 26 deletions(-)

diff --git a/drivers/crypto/qce/dma.c b/drivers/crypto/qce/dma.c
index d60efb5c26d88f8b0259b1dccc8724d0f75571c6..26347e9fc078adede712722107e74958538accdf 100644
--- a/drivers/crypto/qce/dma.c
+++ b/drivers/crypto/qce/dma.c
@@ -12,49 +12,34 @@
 
 #define QCE_IGNORE_BUF_SZ		(2 * QCE_BAM_BURST_SIZE)
 
-static void qce_dma_release(void *data)
+static void qce_dma_terminate(void *data)
 {
 	struct qce_dma_data *dma = data;
 
 	dmaengine_terminate_sync(dma->txchan);
 	dmaengine_terminate_sync(dma->rxchan);
-	dma_release_channel(dma->txchan);
-	dma_release_channel(dma->rxchan);
-	kfree(dma->result_buf);
 }
 
 int devm_qce_dma_request(struct qce_device *qce)
 {
 	struct qce_dma_data *dma = &qce->dma;
 	struct device *dev = qce->dev;
-	int ret;
 
-	dma->txchan = dma_request_chan(dev, "tx");
+	dma->result_buf = devm_kmalloc(dev, QCE_RESULT_BUF_SZ + QCE_IGNORE_BUF_SZ, GFP_KERNEL);
+	if (!dma->result_buf)
+		return -ENOMEM;
+
+	dma->txchan = devm_dma_request_chan(dev, "tx");
 	if (IS_ERR(dma->txchan))
 		return dev_err_probe(dev, PTR_ERR(dma->txchan),
 				     "Failed to get TX DMA channel\n");
 
-	dma->rxchan = dma_request_chan(dev, "rx");
-	if (IS_ERR(dma->rxchan)) {
-		ret = dev_err_probe(dev, PTR_ERR(dma->rxchan),
-				    "Failed to get RX DMA channel\n");
-		goto error_rx;
-	}
-
-	dma->result_buf = kmalloc(QCE_RESULT_BUF_SZ + QCE_IGNORE_BUF_SZ,
-				  GFP_KERNEL);
-	if (!dma->result_buf) {
-		ret = -ENOMEM;
-		goto error_nomem;
-	}
-
-	return devm_add_action_or_reset(dev, qce_dma_release, dma);
+	dma->rxchan = devm_dma_request_chan(dev, "rx");
+	if (IS_ERR(dma->rxchan))
+		return dev_err_probe(dev, PTR_ERR(dma->rxchan),
+				     "Failed to get RX DMA channel\n");
 
-error_nomem:
-	dma_release_channel(dma->rxchan);
-error_rx:
-	dma_release_channel(dma->txchan);
-	return ret;
+	return devm_add_action_or_reset(dev, qce_dma_terminate, dma);
 }
 
 struct scatterlist *

-- 
2.47.3



^ permalink raw reply related

* [PATCH v20 09/14] crypto: qce - Remove unused ignore_buf
From: Bartosz Golaszewski @ 2026-06-29 10:01 UTC (permalink / raw)
  To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
	David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
	Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
	Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
	Neil Armstrong
  Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
	linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260629-qcom-qce-cmd-descr-v20-0-56f67da84c05@oss.qualcomm.com>

From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>

It's unclear what the purpose of this field is. It has been here since
the initial commit but without any explanation. The driver works fine
without it. We still allocate the extra space in the result buffer; we
just don't need to store its address. While at it: move the
QCE_IGNORE_BUF_SZ definition into dma.c as it's not used outside of this
compilation unit.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/crypto/qce/dma.c | 4 ++--
 drivers/crypto/qce/dma.h | 2 --
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/drivers/crypto/qce/dma.c b/drivers/crypto/qce/dma.c
index 7ec9d72fd690fb17e03ade7efe3cc522fb47e1ac..d1daa229361aa74da5d3d7bfe1bc8ab189761e38 100644
--- a/drivers/crypto/qce/dma.c
+++ b/drivers/crypto/qce/dma.c
@@ -9,6 +9,8 @@
 
 #include "dma.h"
 
+#define QCE_IGNORE_BUF_SZ		(2 * QCE_BAM_BURST_SIZE)
+
 static void qce_dma_release(void *data)
 {
 	struct qce_dma_data *dma = data;
@@ -43,8 +45,6 @@ int devm_qce_dma_request(struct device *dev, struct qce_dma_data *dma)
 		goto error_nomem;
 	}
 
-	dma->ignore_buf = dma->result_buf + QCE_RESULT_BUF_SZ;
-
 	return devm_add_action_or_reset(dev, qce_dma_release, dma);
 
 error_nomem:
diff --git a/drivers/crypto/qce/dma.h b/drivers/crypto/qce/dma.h
index 31629185000e12242fa07c2cc08b95fcbd5d4b8c..fc337c435cd14917bdfb99febcf9119275afdeba 100644
--- a/drivers/crypto/qce/dma.h
+++ b/drivers/crypto/qce/dma.h
@@ -23,7 +23,6 @@ struct qce_result_dump {
 	u32 status2;
 };
 
-#define QCE_IGNORE_BUF_SZ	(2 * QCE_BAM_BURST_SIZE)
 #define QCE_RESULT_BUF_SZ	\
 		ALIGN(sizeof(struct qce_result_dump), QCE_BAM_BURST_SIZE)
 
@@ -31,7 +30,6 @@ struct qce_dma_data {
 	struct dma_chan *txchan;
 	struct dma_chan *rxchan;
 	struct qce_result_dump *result_buf;
-	void *ignore_buf;
 };
 
 int devm_qce_dma_request(struct device *dev, struct qce_dma_data *dma);

-- 
2.47.3



^ permalink raw reply related

* [PATCH v20 10/14] crypto: qce - Simplify arguments of devm_qce_dma_request()
From: Bartosz Golaszewski @ 2026-06-29 10:01 UTC (permalink / raw)
  To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
	David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
	Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
	Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
	Neil Armstrong
  Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
	linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260629-qcom-qce-cmd-descr-v20-0-56f67da84c05@oss.qualcomm.com>

From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>

This function can extract all the information it needs from struct
qce_device alone so simplify its arguments. This is done in preparation
for adding support for register I/O over DMA which will require
accessing even more fields from struct qce_device.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/crypto/qce/core.c | 2 +-
 drivers/crypto/qce/dma.c  | 5 ++++-
 drivers/crypto/qce/dma.h  | 4 +++-
 3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c
index ad37c2b8ae53a373bb248aff06c3b7946e8439a8..a0e2eadc3afd5f83e46724c8bc3e3690146b86ba 100644
--- a/drivers/crypto/qce/core.c
+++ b/drivers/crypto/qce/core.c
@@ -238,7 +238,7 @@ static int qce_crypto_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	ret = devm_qce_dma_request(qce->dev, &qce->dma);
+	ret = devm_qce_dma_request(qce);
 	if (ret)
 		return ret;
 
diff --git a/drivers/crypto/qce/dma.c b/drivers/crypto/qce/dma.c
index d1daa229361aa74da5d3d7bfe1bc8ab189761e38..d60efb5c26d88f8b0259b1dccc8724d0f75571c6 100644
--- a/drivers/crypto/qce/dma.c
+++ b/drivers/crypto/qce/dma.c
@@ -7,6 +7,7 @@
 #include <linux/dmaengine.h>
 #include <crypto/scatterwalk.h>
 
+#include "core.h"
 #include "dma.h"
 
 #define QCE_IGNORE_BUF_SZ		(2 * QCE_BAM_BURST_SIZE)
@@ -22,8 +23,10 @@ static void qce_dma_release(void *data)
 	kfree(dma->result_buf);
 }
 
-int devm_qce_dma_request(struct device *dev, struct qce_dma_data *dma)
+int devm_qce_dma_request(struct qce_device *qce)
 {
+	struct qce_dma_data *dma = &qce->dma;
+	struct device *dev = qce->dev;
 	int ret;
 
 	dma->txchan = dma_request_chan(dev, "tx");
diff --git a/drivers/crypto/qce/dma.h b/drivers/crypto/qce/dma.h
index fc337c435cd14917bdfb99febcf9119275afdeba..483789d9fa98e79d1283de8297bf2fc2a773f3a7 100644
--- a/drivers/crypto/qce/dma.h
+++ b/drivers/crypto/qce/dma.h
@@ -8,6 +8,8 @@
 
 #include <linux/dmaengine.h>
 
+struct qce_device;
+
 /* maximum data transfer block size between BAM and CE */
 #define QCE_BAM_BURST_SIZE		64
 
@@ -32,7 +34,7 @@ struct qce_dma_data {
 	struct qce_result_dump *result_buf;
 };
 
-int devm_qce_dma_request(struct device *dev, struct qce_dma_data *dma);
+int devm_qce_dma_request(struct qce_device *qce);
 int qce_dma_prep_sgs(struct qce_dma_data *dma, struct scatterlist *sg_in,
 		     int in_ents, struct scatterlist *sg_out, int out_ents,
 		     dma_async_tx_callback cb, void *cb_param);

-- 
2.47.3



^ permalink raw reply related

* [PATCH v20 12/14] crypto: qce - Map crypto memory for DMA
From: Bartosz Golaszewski @ 2026-06-29 10:01 UTC (permalink / raw)
  To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
	David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
	Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
	Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
	Neil Armstrong
  Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
	linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260629-qcom-qce-cmd-descr-v20-0-56f67da84c05@oss.qualcomm.com>

From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>

As the first step in converting the driver to using DMA for register
I/O, let's map the crypto memory range.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/crypto/qce/core.c | 23 ++++++++++++++++++++++-
 drivers/crypto/qce/core.h |  6 ++++++
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/qce/core.c b/drivers/crypto/qce/core.c
index a0e2eadc3afd5f83e46724c8bc3e3690146b86ba..d7b7a3dda464964afe6a6893bb329d5bd5759dcd 100644
--- a/drivers/crypto/qce/core.c
+++ b/drivers/crypto/qce/core.c
@@ -192,10 +192,19 @@ static void qce_cancel_work(void *data)
 	cancel_work_sync(work);
 }
 
+static void qce_crypto_unmap_dma(void *data)
+{
+	struct qce_device *qce = data;
+
+	dma_unmap_resource(qce->dev, qce->base_dma, qce->dma_size,
+			   DMA_BIDIRECTIONAL, 0);
+}
+
 static int qce_crypto_probe(struct platform_device *pdev)
 {
 	struct device *dev = &pdev->dev;
 	struct qce_device *qce;
+	struct resource *res;
 	int ret;
 
 	qce = devm_kzalloc(dev, sizeof(*qce), GFP_KERNEL);
@@ -205,7 +214,7 @@ static int qce_crypto_probe(struct platform_device *pdev)
 	qce->dev = dev;
 	platform_set_drvdata(pdev, qce);
 
-	qce->base = devm_platform_ioremap_resource(pdev, 0);
+	qce->base = devm_platform_get_and_ioremap_resource(pdev, 0, &res);
 	if (IS_ERR(qce->base))
 		return PTR_ERR(qce->base);
 
@@ -255,6 +264,18 @@ static int qce_crypto_probe(struct platform_device *pdev)
 	qce->async_req_enqueue = qce_async_request_enqueue;
 	qce->async_req_done = qce_async_request_done;
 
+	qce->dma_size = resource_size(res);
+	qce->base_dma = dma_map_resource(dev, res->start, qce->dma_size,
+					 DMA_BIDIRECTIONAL, 0);
+	qce->base_phys = res->start;
+	ret = dma_mapping_error(dev, qce->base_dma);
+	if (ret)
+		return ret;
+
+	ret = devm_add_action_or_reset(qce->dev, qce_crypto_unmap_dma, qce);
+	if (ret)
+		return ret;
+
 	return devm_qce_register_algs(qce);
 }
 
diff --git a/drivers/crypto/qce/core.h b/drivers/crypto/qce/core.h
index f092ce2d3b04a936a37805c20ac5ba78d8fdd2df..a80e12eac6c87e5321cce16c56a4bf5003474ef0 100644
--- a/drivers/crypto/qce/core.h
+++ b/drivers/crypto/qce/core.h
@@ -27,6 +27,9 @@
  * @dma: pointer to dma data
  * @burst_size: the crypto burst size
  * @pipe_pair_id: which pipe pair id the device using
+ * @base_dma: base DMA address
+ * @base_phys: base physical address
+ * @dma_size: size of memory mapped for DMA
  * @async_req_enqueue: invoked by every algorithm to enqueue a request
  * @async_req_done: invoked by every algorithm to finish its request
  */
@@ -43,6 +46,9 @@ struct qce_device {
 	struct qce_dma_data dma;
 	int burst_size;
 	unsigned int pipe_pair_id;
+	dma_addr_t base_dma;
+	phys_addr_t base_phys;
+	size_t dma_size;
 	int (*async_req_enqueue)(struct qce_device *qce,
 				 struct crypto_async_request *req);
 	void (*async_req_done)(struct qce_device *qce, int ret);

-- 
2.47.3



^ permalink raw reply related

* [PATCH v20 14/14] crypto: qce - Communicate the base physical address to the dmaengine
From: Bartosz Golaszewski @ 2026-06-29 10:01 UTC (permalink / raw)
  To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
	David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
	Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
	Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
	Neil Armstrong
  Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
	linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260629-qcom-qce-cmd-descr-v20-0-56f67da84c05@oss.qualcomm.com>

In order to communicate to the BAM DMA engine which address should be
used as a scratchpad for dummy writes related to BAM pipe locking,
fill out and attach the provided metadata struct to the descriptor.

Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/crypto/qce/dma.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/qce/dma.c b/drivers/crypto/qce/dma.c
index 1b43c56503334154be4b8000e5a9330b2005cb64..6410f8dc5bcf517223c768a3e8f87af245076c84 100644
--- a/drivers/crypto/qce/dma.c
+++ b/drivers/crypto/qce/dma.c
@@ -11,6 +11,7 @@
 
 #include "core.h"
 #include "dma.h"
+#include "regs-v5.h"
 
 #define QCE_IGNORE_BUF_SZ		(2 * QCE_BAM_BURST_SIZE)
 #define QCE_BAM_CMD_SGL_SIZE		128
@@ -41,6 +42,10 @@ void qce_clear_bam_transaction(struct qce_device *qce)
 
 int qce_submit_cmd_desc(struct qce_device *qce)
 {
+	struct bam_desc_metadata meta = {
+		.scratchpad_addr = qce->base_phys + REG_VERSION,
+		.direction = DMA_MEM_TO_DEV,
+	};
 	struct qce_desc_info *qce_desc = qce->dma.bam_txn->desc;
 	struct qce_bam_transaction *bam_txn = qce->dma.bam_txn;
 	struct dma_async_tx_descriptor *dma_desc;
@@ -60,15 +65,21 @@ int qce_submit_cmd_desc(struct qce_device *qce)
 		goto err_unmap_sg;
 	}
 
+	ret = dmaengine_desc_attach_metadata(dma_desc, &meta, sizeof(meta));
+	if (ret)
+		goto err_free_desc;
+
 	qce_desc->dma_desc = dma_desc;
 	cookie = dmaengine_submit(qce_desc->dma_desc);
 
 	ret = dma_submit_error(cookie);
 	if (ret)
-		goto err_unmap_sg;
+		goto err_free_desc;
 
 	return 0;
 
+err_free_desc:
+	dmaengine_desc_free(dma_desc);
 err_unmap_sg:
 	dma_unmap_sg(qce->dev, bam_txn->wr_sgl, bam_txn->wr_sgl_cnt, DMA_TO_DEVICE);
 	return ret;

-- 
2.47.3



^ permalink raw reply related

* [PATCH v20 13/14] crypto: qce - Add BAM DMA support for crypto register I/O
From: Bartosz Golaszewski @ 2026-06-29 10:01 UTC (permalink / raw)
  To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
	David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
	Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
	Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
	Neil Armstrong
  Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
	linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski
In-Reply-To: <20260629-qcom-qce-cmd-descr-v20-0-56f67da84c05@oss.qualcomm.com>

From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>

Switch to using BAM DMA for register I/O in addition to passing data. To
that end: provide the necessary infrastructure in the driver, modify the
ordering of operations as required and replace all direct register writes
with wrappers queueing DMA command descriptors.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/crypto/qce/aead.c     |  10 ++--
 drivers/crypto/qce/common.c   |  20 ++++---
 drivers/crypto/qce/dma.c      | 120 ++++++++++++++++++++++++++++++++++++++++--
 drivers/crypto/qce/dma.h      |   5 ++
 drivers/crypto/qce/sha.c      |  10 ++--
 drivers/crypto/qce/skcipher.c |  10 ++--
 6 files changed, 144 insertions(+), 31 deletions(-)

diff --git a/drivers/crypto/qce/aead.c b/drivers/crypto/qce/aead.c
index 1461a08e6c58b00e60aa35515f3392c096726f6a..544a3cf8709248a5f3eb2b669e30b09183d3a69d 100644
--- a/drivers/crypto/qce/aead.c
+++ b/drivers/crypto/qce/aead.c
@@ -463,17 +463,17 @@ qce_aead_async_req_handle(struct crypto_async_request *async_req)
 			src_nents = dst_nents - 1;
 	}
 
-	ret = qce_dma_prep_sgs(&qce->dma, rctx->src_sg, src_nents, rctx->dst_sg, dst_nents,
-			       qce_aead_done, async_req);
+	ret = qce_start(async_req, tmpl->crypto_alg_type);
 	if (ret)
 		goto error_unmap_src;
 
-	qce_dma_issue_pending(&qce->dma);
-
-	ret = qce_start(async_req, tmpl->crypto_alg_type);
+	ret = qce_dma_prep_sgs(&qce->dma, rctx->src_sg, src_nents, rctx->dst_sg, dst_nents,
+			       qce_aead_done, async_req);
 	if (ret)
 		goto error_terminate;
 
+	qce_dma_issue_pending(&qce->dma);
+
 	return 0;
 
 error_terminate:
diff --git a/drivers/crypto/qce/common.c b/drivers/crypto/qce/common.c
index 54a78a57f63028f01870a3edeb8e390f523bb190..37bb6f03244d317a887aeb0aa10cefe327b4ce05 100644
--- a/drivers/crypto/qce/common.c
+++ b/drivers/crypto/qce/common.c
@@ -25,7 +25,7 @@ static inline u32 qce_read(struct qce_device *qce, u32 offset)
 
 static inline void qce_write(struct qce_device *qce, u32 offset, u32 val)
 {
-	writel(val, qce->base + offset);
+	qce_write_dma(qce, offset, val);
 }
 
 static inline void qce_write_array(struct qce_device *qce, u32 offset,
@@ -82,6 +82,8 @@ static void qce_setup_config(struct qce_device *qce)
 {
 	u32 config;
 
+	qce_clear_bam_transaction(qce);
+
 	/* get big endianness */
 	config = qce_config_reg(qce, 0);
 
@@ -90,12 +92,14 @@ static void qce_setup_config(struct qce_device *qce)
 	qce_write(qce, REG_CONFIG, config);
 }
 
-static inline void qce_crypto_go(struct qce_device *qce, bool result_dump)
+static inline int qce_crypto_go(struct qce_device *qce, bool result_dump)
 {
 	if (result_dump)
 		qce_write(qce, REG_GOPROC, BIT(GO_SHIFT) | BIT(RESULTS_DUMP_SHIFT));
 	else
 		qce_write(qce, REG_GOPROC, BIT(GO_SHIFT));
+
+	return qce_submit_cmd_desc(qce);
 }
 
 #if defined(CONFIG_CRYPTO_DEV_QCE_SHA) || defined(CONFIG_CRYPTO_DEV_QCE_AEAD)
@@ -223,9 +227,7 @@ static int qce_setup_regs_ahash(struct crypto_async_request *async_req)
 	config = qce_config_reg(qce, 1);
 	qce_write(qce, REG_CONFIG, config);
 
-	qce_crypto_go(qce, true);
-
-	return 0;
+	return qce_crypto_go(qce, true);
 }
 #endif
 
@@ -386,9 +388,7 @@ static int qce_setup_regs_skcipher(struct crypto_async_request *async_req)
 	config = qce_config_reg(qce, 1);
 	qce_write(qce, REG_CONFIG, config);
 
-	qce_crypto_go(qce, true);
-
-	return 0;
+	return qce_crypto_go(qce, true);
 }
 #endif
 
@@ -535,9 +535,7 @@ static int qce_setup_regs_aead(struct crypto_async_request *async_req)
 	qce_write(qce, REG_CONFIG, config);
 
 	/* Start the process */
-	qce_crypto_go(qce, !IS_CCM(flags));
-
-	return 0;
+	return qce_crypto_go(qce, !IS_CCM(flags));
 }
 #endif
 
diff --git a/drivers/crypto/qce/dma.c b/drivers/crypto/qce/dma.c
index 26347e9fc078adede712722107e74958538accdf..1b43c56503334154be4b8000e5a9330b2005cb64 100644
--- a/drivers/crypto/qce/dma.c
+++ b/drivers/crypto/qce/dma.c
@@ -4,6 +4,8 @@
  */
 
 #include <linux/device.h>
+#include <linux/dma/qcom_bam_dma.h>
+#include <linux/dma-mapping.h>
 #include <linux/dmaengine.h>
 #include <crypto/scatterwalk.h>
 
@@ -11,6 +13,96 @@
 #include "dma.h"
 
 #define QCE_IGNORE_BUF_SZ		(2 * QCE_BAM_BURST_SIZE)
+#define QCE_BAM_CMD_SGL_SIZE		128
+#define QCE_BAM_CMD_ELEMENT_SIZE	128
+
+struct qce_desc_info {
+	struct dma_async_tx_descriptor *dma_desc;
+	enum dma_data_direction dir;
+};
+
+struct qce_bam_transaction {
+	struct bam_cmd_element bam_ce[QCE_BAM_CMD_ELEMENT_SIZE];
+	struct scatterlist wr_sgl[QCE_BAM_CMD_SGL_SIZE];
+	struct qce_desc_info *desc;
+	u32 bam_ce_idx;
+	u32 pre_bam_ce_idx;
+	u32 wr_sgl_cnt;
+};
+
+void qce_clear_bam_transaction(struct qce_device *qce)
+{
+	struct qce_bam_transaction *bam_txn = qce->dma.bam_txn;
+
+	bam_txn->bam_ce_idx = 0;
+	bam_txn->wr_sgl_cnt = 0;
+	bam_txn->pre_bam_ce_idx = 0;
+}
+
+int qce_submit_cmd_desc(struct qce_device *qce)
+{
+	struct qce_desc_info *qce_desc = qce->dma.bam_txn->desc;
+	struct qce_bam_transaction *bam_txn = qce->dma.bam_txn;
+	struct dma_async_tx_descriptor *dma_desc;
+	struct dma_chan *chan = qce->dma.rxchan;
+	unsigned long attrs = DMA_PREP_CMD;
+	dma_cookie_t cookie;
+	unsigned int mapped;
+	int ret;
+
+	mapped = dma_map_sg(qce->dev, bam_txn->wr_sgl, bam_txn->wr_sgl_cnt, DMA_TO_DEVICE);
+	if (!mapped)
+		return -ENOMEM;
+
+	dma_desc = dmaengine_prep_slave_sg(chan, bam_txn->wr_sgl, mapped, DMA_MEM_TO_DEV, attrs);
+	if (!dma_desc) {
+		ret = -ENOMEM;
+		goto err_unmap_sg;
+	}
+
+	qce_desc->dma_desc = dma_desc;
+	cookie = dmaengine_submit(qce_desc->dma_desc);
+
+	ret = dma_submit_error(cookie);
+	if (ret)
+		goto err_unmap_sg;
+
+	return 0;
+
+err_unmap_sg:
+	dma_unmap_sg(qce->dev, bam_txn->wr_sgl, bam_txn->wr_sgl_cnt, DMA_TO_DEVICE);
+	return ret;
+}
+
+static void qce_prep_dma_cmd_desc(struct qce_device *qce, struct qce_dma_data *dma,
+				  unsigned int addr, void *buf)
+{
+	struct qce_bam_transaction *bam_txn = dma->bam_txn;
+	struct bam_cmd_element *bam_ce_buf;
+	int bam_ce_size, cnt, idx;
+
+	idx = bam_txn->bam_ce_idx;
+	bam_ce_buf = &bam_txn->bam_ce[idx];
+	bam_prep_ce_le32(bam_ce_buf, addr, BAM_WRITE_COMMAND, *((__le32 *)buf));
+
+	bam_ce_buf = &bam_txn->bam_ce[bam_txn->pre_bam_ce_idx];
+	bam_txn->bam_ce_idx++;
+	bam_ce_size = (bam_txn->bam_ce_idx - bam_txn->pre_bam_ce_idx) * sizeof(*bam_ce_buf);
+
+	cnt = bam_txn->wr_sgl_cnt;
+
+	sg_set_buf(&bam_txn->wr_sgl[cnt], bam_ce_buf, bam_ce_size);
+
+	++bam_txn->wr_sgl_cnt;
+	bam_txn->pre_bam_ce_idx = bam_txn->bam_ce_idx;
+}
+
+void qce_write_dma(struct qce_device *qce, unsigned int offset, u32 val)
+{
+	unsigned int reg_addr = ((unsigned int)(qce->base_phys) + offset);
+
+	qce_prep_dma_cmd_desc(qce, &qce->dma, reg_addr, &val);
+}
 
 static void qce_dma_terminate(void *data)
 {
@@ -39,6 +131,16 @@ int devm_qce_dma_request(struct qce_device *qce)
 		return dev_err_probe(dev, PTR_ERR(dma->rxchan),
 				     "Failed to get RX DMA channel\n");
 
+	dma->bam_txn = devm_kzalloc(dev, sizeof(*dma->bam_txn), GFP_KERNEL);
+	if (!dma->bam_txn)
+		return -ENOMEM;
+
+	dma->bam_txn->desc = devm_kzalloc(dev, sizeof(*dma->bam_txn->desc), GFP_KERNEL);
+	if (!dma->bam_txn->desc)
+		return -ENOMEM;
+
+	sg_init_table(dma->bam_txn->wr_sgl, QCE_BAM_CMD_SGL_SIZE);
+
 	return devm_add_action_or_reset(dev, qce_dma_terminate, dma);
 }
 
@@ -98,28 +200,36 @@ int qce_dma_prep_sgs(struct qce_dma_data *dma, struct scatterlist *rx_sg,
 {
 	struct dma_chan *rxchan = dma->rxchan;
 	struct dma_chan *txchan = dma->txchan;
-	unsigned long flags = DMA_PREP_INTERRUPT | DMA_CTRL_ACK;
+	unsigned long txflags = DMA_PREP_INTERRUPT | DMA_CTRL_ACK;
+	unsigned long rxflags = txflags | DMA_PREP_FENCE;
 	int ret;
 
-	ret = qce_dma_prep_sg(rxchan, rx_sg, rx_nents, flags, DMA_MEM_TO_DEV,
+	ret = qce_dma_prep_sg(rxchan, rx_sg, rx_nents, rxflags, DMA_MEM_TO_DEV,
 			     NULL, NULL);
 	if (ret)
 		return ret;
 
-	return qce_dma_prep_sg(txchan, tx_sg, tx_nents, flags, DMA_DEV_TO_MEM,
+	return qce_dma_prep_sg(txchan, tx_sg, tx_nents, txflags, DMA_DEV_TO_MEM,
 			       cb, cb_param);
 }
 
 void qce_dma_issue_pending(struct qce_dma_data *dma)
 {
-	dma_async_issue_pending(dma->rxchan);
 	dma_async_issue_pending(dma->txchan);
+	dma_async_issue_pending(dma->rxchan);
 }
 
 int qce_dma_terminate_all(struct qce_dma_data *dma)
 {
+	struct qce_device *qce = container_of(dma, struct qce_device, dma);
+	struct qce_bam_transaction *bam_txn = dma->bam_txn;
 	int ret;
 
 	ret = dmaengine_terminate_all(dma->rxchan);
-	return ret ?: dmaengine_terminate_all(dma->txchan);
+	if (ret)
+		return ret;
+
+	dma_unmap_sg(qce->dev, bam_txn->wr_sgl, bam_txn->wr_sgl_cnt, DMA_TO_DEVICE);
+
+	return dmaengine_terminate_all(dma->txchan);
 }
diff --git a/drivers/crypto/qce/dma.h b/drivers/crypto/qce/dma.h
index 483789d9fa98e79d1283de8297bf2fc2a773f3a7..f05dfa9e6b25bd60e32f45079a8bc7e6a4cf81f9 100644
--- a/drivers/crypto/qce/dma.h
+++ b/drivers/crypto/qce/dma.h
@@ -8,6 +8,7 @@
 
 #include <linux/dmaengine.h>
 
+struct qce_bam_transaction;
 struct qce_device;
 
 /* maximum data transfer block size between BAM and CE */
@@ -32,6 +33,7 @@ struct qce_dma_data {
 	struct dma_chan *txchan;
 	struct dma_chan *rxchan;
 	struct qce_result_dump *result_buf;
+	struct qce_bam_transaction *bam_txn;
 };
 
 int devm_qce_dma_request(struct qce_device *qce);
@@ -43,5 +45,8 @@ int qce_dma_terminate_all(struct qce_dma_data *dma);
 struct scatterlist *
 qce_sgtable_add(struct sg_table *sgt, struct scatterlist *sg_add,
 		unsigned int max_len);
+void qce_write_dma(struct qce_device *qce, unsigned int offset, u32 val);
+int qce_submit_cmd_desc(struct qce_device *qce);
+void qce_clear_bam_transaction(struct qce_device *qce);
 
 #endif /* _DMA_H_ */
diff --git a/drivers/crypto/qce/sha.c b/drivers/crypto/qce/sha.c
index 5476d4d30fae7eb72bbcbcdd7d8be7a76f6732c2..5cfd769a59a791a79da42e2a5b0554ad974f7631 100644
--- a/drivers/crypto/qce/sha.c
+++ b/drivers/crypto/qce/sha.c
@@ -109,17 +109,17 @@ static int qce_ahash_async_req_handle(struct crypto_async_request *async_req)
 		goto error_unmap_src;
 	}
 
-	ret = qce_dma_prep_sgs(&qce->dma, req->src, rctx->src_nents,
-			       &rctx->result_sg, 1, qce_ahash_done, async_req);
+	ret = qce_start(async_req, tmpl->crypto_alg_type);
 	if (ret)
 		goto error_unmap_dst;
 
-	qce_dma_issue_pending(&qce->dma);
-
-	ret = qce_start(async_req, tmpl->crypto_alg_type);
+	ret = qce_dma_prep_sgs(&qce->dma, req->src, rctx->src_nents,
+			       &rctx->result_sg, 1, qce_ahash_done, async_req);
 	if (ret)
 		goto error_terminate;
 
+	qce_dma_issue_pending(&qce->dma);
+
 	return 0;
 
 error_terminate:
diff --git a/drivers/crypto/qce/skcipher.c b/drivers/crypto/qce/skcipher.c
index a9b59e68df4b6837805d45391f5a5fe43fd47709..b4ef3748fbb4dde542b0307f32d4c871b7c33ac2 100644
--- a/drivers/crypto/qce/skcipher.c
+++ b/drivers/crypto/qce/skcipher.c
@@ -142,18 +142,18 @@ qce_skcipher_async_req_handle(struct crypto_async_request *async_req)
 		src_nents = dst_nents - 1;
 	}
 
+	ret = qce_start(async_req, tmpl->crypto_alg_type);
+	if (ret)
+		goto error_unmap_src;
+
 	ret = qce_dma_prep_sgs(&qce->dma, rctx->src_sg, src_nents,
 			       rctx->dst_sg, dst_nents,
 			       qce_skcipher_done, async_req);
 	if (ret)
-		goto error_unmap_src;
+		goto error_terminate;
 
 	qce_dma_issue_pending(&qce->dma);
 
-	ret = qce_start(async_req, tmpl->crypto_alg_type);
-	if (ret)
-		goto error_terminate;
-
 	return 0;
 
 error_terminate:

-- 
2.47.3



^ permalink raw reply related

* [PATCH v20 03/14] dmaengine: qcom: bam_dma: convert tasklet to a BH workqueue
From: Bartosz Golaszewski @ 2026-06-29 10:01 UTC (permalink / raw)
  To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
	David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
	Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
	Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
	Neil Armstrong
  Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
	linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski,
	Dmitry Baryshkov
In-Reply-To: <20260629-qcom-qce-cmd-descr-v20-0-56f67da84c05@oss.qualcomm.com>

BH workqueues are a modern mechanism, aiming to replace legacy tasklets.
Let's convert the BAM DMA driver to using the high-priority variant of
the BH workqueue.

[Vinod: suggested using the BH workqueue instead of the regular one
running in process context]

Suggested-by: Vinod Koul <vkoul@kernel.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Reviewed-by: Bjorn Andersson <andersson@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/dma/qcom/bam_dma.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index fc155e0d1870cbb7e099a2c4280f9f8fbdf6cf15..ea3df28e777f99c0532761b6aee6807ab23ab4ca 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -42,6 +42,7 @@
 #include <linux/pm_runtime.h>
 #include <linux/scatterlist.h>
 #include <linux/slab.h>
+#include <linux/workqueue.h>
 
 #include "../dmaengine.h"
 #include "../virt-dma.h"
@@ -426,8 +427,8 @@ struct bam_device {
 	struct clk *bamclk;
 	int irq;
 
-	/* dma start transaction tasklet */
-	struct tasklet_struct task;
+	/* dma start transaction workqueue */
+	struct work_struct work;
 };
 
 /**
@@ -892,7 +893,7 @@ static u32 process_channel_irqs(struct bam_device *bdev)
 			/*
 			 * if complete, process cookie. Otherwise
 			 * push back to front of desc_issued so that
-			 * it gets restarted by the tasklet
+			 * it gets restarted by the work queue.
 			 */
 			if (!async_desc->num_desc) {
 				vchan_cookie_complete(&async_desc->vd);
@@ -922,9 +923,9 @@ static irqreturn_t bam_dma_irq(int irq, void *data)
 
 	srcs |= process_channel_irqs(bdev);
 
-	/* kick off tasklet to start next dma transfer */
+	/* kick off the work queue to start next dma transfer */
 	if (srcs & P_IRQ)
-		tasklet_schedule(&bdev->task);
+		queue_work(system_bh_highpri_wq, &bdev->work);
 
 	ret = pm_runtime_get_sync(bdev->dev);
 	if (ret < 0)
@@ -1120,14 +1121,14 @@ static void bam_start_dma(struct bam_chan *bchan)
 }
 
 /**
- * dma_tasklet - DMA IRQ tasklet
- * @t: tasklet argument (bam controller structure)
+ * bam_dma_work() - DMA interrupt work queue callback
+ * @work: work queue struct embedded in the BAM controller device struct
  *
  * Sets up next DMA operation and then processes all completed transactions
  */
-static void dma_tasklet(struct tasklet_struct *t)
+static void bam_dma_work(struct work_struct *work)
 {
-	struct bam_device *bdev = from_tasklet(bdev, t, task);
+	struct bam_device *bdev = from_work(bdev, work, work);
 	struct bam_chan *bchan;
 	unsigned int i;
 
@@ -1140,14 +1141,13 @@ static void dma_tasklet(struct tasklet_struct *t)
 		if (!list_empty(&bchan->vc.desc_issued) && !IS_BUSY(bchan))
 			bam_start_dma(bchan);
 	}
-
 }
 
 /**
  * bam_issue_pending - starts pending transactions
  * @chan: dma channel
  *
- * Calls tasklet directly which in turn starts any pending transactions
+ * Calls work queue directly which in turn starts any pending transactions
  */
 static void bam_issue_pending(struct dma_chan *chan)
 {
@@ -1316,14 +1316,14 @@ static int bam_dma_probe(struct platform_device *pdev)
 	if (ret)
 		goto err_disable_clk;
 
-	tasklet_setup(&bdev->task, dma_tasklet);
+	INIT_WORK(&bdev->work, bam_dma_work);
 
 	bdev->channels = devm_kcalloc(bdev->dev, bdev->num_channels,
 				sizeof(*bdev->channels), GFP_KERNEL);
 
 	if (!bdev->channels) {
 		ret = -ENOMEM;
-		goto err_tasklet_kill;
+		goto err_workqueue_cancel;
 	}
 
 	/* allocate and initialize channels */
@@ -1389,8 +1389,8 @@ static int bam_dma_probe(struct platform_device *pdev)
 err_bam_channel_exit:
 	for (i = 0; i < bdev->num_channels; i++)
 		tasklet_kill(&bdev->channels[i].vc.task);
-err_tasklet_kill:
-	tasklet_kill(&bdev->task);
+err_workqueue_cancel:
+	cancel_work_sync(&bdev->work);
 err_disable_clk:
 	clk_disable_unprepare(bdev->bamclk);
 
@@ -1424,7 +1424,7 @@ static void bam_dma_remove(struct platform_device *pdev)
 			    bdev->channels[i].fifo_phys);
 	}
 
-	tasklet_kill(&bdev->task);
+	cancel_work_sync(&bdev->work);
 
 	clk_disable_unprepare(bdev->bamclk);
 }

-- 
2.47.3



^ permalink raw reply related

* [PATCH v20 05/14] dmaengine: qcom: bam_dma: Add pipe_lock_supported flag support
From: Bartosz Golaszewski @ 2026-06-29 10:01 UTC (permalink / raw)
  To: Vinod Koul, Jonathan Corbet, Thara Gopinath, Herbert Xu,
	David S. Miller, Udit Tiwari, Md Sadre Alam, Dmitry Baryshkov,
	Manivannan Sadhasivam, Stephan Gerhold, Bjorn Andersson,
	Peter Ujfalusi, Michal Simek, Frank Li, Andy Gross,
	Neil Armstrong
  Cc: dmaengine, linux-doc, linux-kernel, linux-arm-msm, linux-crypto,
	linux-arm-kernel, brgl, Bartosz Golaszewski, Bartosz Golaszewski,
	Dmitry Baryshkov
In-Reply-To: <20260629-qcom-qce-cmd-descr-v20-0-56f67da84c05@oss.qualcomm.com>

From: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>

Extend the device match data with a flag indicating whether the IP
supports the BAM lock/unlock feature. Set it to true on BAM IP versions
1.4.0 and above.

Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@linaro.org>
Reviewed-by: Dmitry Baryshkov <dmitry.baryshkov@oss.qualcomm.com>
Acked-by: Manivannan Sadhasivam <mani@kernel.org>
Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
Signed-off-by: Bartosz Golaszewski <bartosz.golaszewski@oss.qualcomm.com>
---
 drivers/dma/qcom/bam_dma.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/drivers/dma/qcom/bam_dma.c b/drivers/dma/qcom/bam_dma.c
index 8ce0fe085c5fea6cc614edd692b5cfd264b94d5a..f3e713a5259c2c7c24cfdcec094814eb1202971a 100644
--- a/drivers/dma/qcom/bam_dma.c
+++ b/drivers/dma/qcom/bam_dma.c
@@ -115,6 +115,7 @@ struct reg_offset_data {
 
 struct bam_device_data {
 	const struct reg_offset_data *reg_info;
+	bool pipe_lock_supported;
 };
 
 static const struct reg_offset_data bam_v1_3_reg_info[] = {
@@ -181,6 +182,7 @@ static const struct reg_offset_data bam_v1_4_reg_info[] = {
 
 static const struct bam_device_data bam_v1_4_data = {
 	.reg_info = bam_v1_4_reg_info,
+	.pipe_lock_supported = true,
 };
 
 static const struct reg_offset_data bam_v1_7_reg_info[] = {
@@ -214,6 +216,7 @@ static const struct reg_offset_data bam_v1_7_reg_info[] = {
 
 static const struct bam_device_data bam_v1_7_data = {
 	.reg_info = bam_v1_7_reg_info,
+	.pipe_lock_supported = true,
 };
 
 static const struct reg_offset_data bam_v2_0_reg_info[] = {
@@ -247,6 +250,7 @@ static const struct reg_offset_data bam_v2_0_reg_info[] = {
 
 static const struct bam_device_data bam_v2_0_data = {
 	.reg_info = bam_v2_0_reg_info,
+	.pipe_lock_supported = true,
 };
 
 /* BAM CTRL */

-- 
2.47.3



^ permalink raw reply related

* Re: [PATCH v3] soc: samsung: exynos-pmu: fix of_node refcount leak in exynos_get_pmu_regmap()
From: Krzysztof Kozlowski @ 2026-06-29 10:09 UTC (permalink / raw)
  To: Weigang He
  Cc: Alim Akhtar, Marek Szyprowski, Tomasz Figa, linux-arm-kernel,
	linux-samsung-soc, linux-kernel
In-Reply-To: <20260609143852.1783558-1-geoffreyhe2@gmail.com>


On Wed, 10 Jun 2026 00:38:52 +1000, Weigang He wrote:
> exynos_get_pmu_regmap() obtains a device_node via of_find_matching_node()
> and passes it to exynos_get_pmu_regmap_by_phandle(np, NULL). With
> propname == NULL the callee uses np directly and does not drop a
> reference, so the reference taken by of_find_matching_node() is leaked on
> every call -- including on each -EPROBE_DEFER retry of the only in-tree
> caller, exynos_retention_init() in the Exynos pinctrl driver.
> 
> [...]

Applied, thanks!

[1/1] soc: samsung: exynos-pmu: fix of_node refcount leak in exynos_get_pmu_regmap()
      https://git.kernel.org/krzk/linux/c/fa476d53edd24e8105faace04e881b9c4179738f

Best regards,
-- 
Krzysztof Kozlowski <krzk@kernel.org>



^ permalink raw reply

* Re: [PATCH v2] ARM: dts: exynos: Add bluetooth support to manta
From: Krzysztof Kozlowski @ 2026-06-29 10:09 UTC (permalink / raw)
  To: Rob Herring, Krzysztof Kozlowski, Conor Dooley, Alim Akhtar,
	Lukas Timmermann
  Cc: devicetree, linux-arm-kernel, linux-samsung-soc, linux-kernel,
	Alexandre Marquet
In-Reply-To: <20260614-manta-bluetooth-v2-1-52de06cabf9d@timmermann.space>


On Sun, 14 Jun 2026 22:16:35 +0200, Lukas Timmermann wrote:
> Enable the bcm4330-bt device for manta boards on serial0.
> Also adds the necessary pin definitions and interrupt handling for
> wakeup.

Applied, thanks!

[1/1] ARM: dts: exynos: Add bluetooth support to manta
      https://git.kernel.org/krzk/linux/c/718b15471c2b13a4830e80efbb489c2a849060d1

Best regards,
-- 
Krzysztof Kozlowski <krzk@kernel.org>



^ permalink raw reply

* Re: [PATCH] ARM: s3c: Replace __ASSEMBLY__ with __ASSEMBLER__ in header files
From: Krzysztof Kozlowski @ 2026-06-29 10:09 UTC (permalink / raw)
  To: Peter Griffin, linux-kernel, Thomas Huth
  Cc: linux-samsung-soc, linux-arm-kernel, Alim Akhtar, Russell King
In-Reply-To: <20260619125827.215977-1-thuth@redhat.com>


On Fri, 19 Jun 2026 14:58:27 +0200, Thomas Huth wrote:
> While the GCC and Clang compilers already define __ASSEMBLER__
> automatically when compiling assembly code, __ASSEMBLY__ is a
> macro that only gets defined by the Makefiles in the kernel.
> This can be very confusing when switching between userspace
> and kernelspace coding, or when dealing with uapi headers that
> rather should use __ASSEMBLER__ instead. So let's standardize now
> on the __ASSEMBLER__ macro that is provided by the compilers.
> 
> [...]

Applied, thanks!

[1/1] ARM: s3c: Replace __ASSEMBLY__ with __ASSEMBLER__ in header files
      https://git.kernel.org/krzk/linux/c/7b06ff772080919fdb194c95af6b1e3acb079b71

Best regards,
-- 
Krzysztof Kozlowski <krzk@kernel.org>



^ permalink raw reply

* [PATCH] arm64: Clarify ARM64_WORKAROUND_REPEAT_TLBI semantics
From: Mark Rutland @ 2026-06-29 10:09 UTC (permalink / raw)
  To: linux-arm-kernel; +Cc: catalin.marinas, mark.rutland, will

Will notes that the ARM64_WORKAROUND_REPEAT_TLBI name is potentially
misleading, and that it would be nice to rename that and add some
documentation. See:

  https://lore.kernel.org/linux-arm-kernel/ajKn_Pt50CmOUrsP@willie-the-truck/

To that end, I've renamed the Kconfig symbol and hwcap from:

  [CONFIG_]ARM64_WORKAROUND_REPEAT_TLBI

... to:

  [CONFIG_]ARM64_WORKAROUND_REPEAT_TLBI_SYNC

... and I've added some rationale alongside the Kconfig. As the Kconfig
symbol isn't user selectable, the usual 'help' section won't appear in
menuconfig, so I've added this as a comment.

The rename was scripted with:

  git grep -l REPEAT_TLBI | while read F; do
    sed -i '{ s/WORKAROUND_REPEAT_TLBI\>/WORKAROUND_REPEAT_TLBI_SYNC/g }' $F;
  done

Bikeshedding-wise, I considered a few names, including:

* ARM64_WORKAROUND_REPEAT_TLBI_SYNC
* ARM64_WORKAROUND_TLBI_REPEAT_SYNC
* ARM64_WORKAROUND_BROADCAST_TLBI_REPEAT_SYNC

... and I settled on ARM64_WORKAROUND_REPEAT_TLBI_SYNC to try to keep
things simple, and to avoid unnecessary churn caused by moving
definitions to retain alphabetical order. I'm happy to defer to Will and
Catalin's preference.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Will Deacon <will@kernel.org>
---
 arch/arm64/Kconfig                | 34 +++++++++++++++++++++++++------
 arch/arm64/include/asm/cpucaps.h  |  4 ++--
 arch/arm64/include/asm/tlbflush.h |  2 +-
 arch/arm64/kernel/cpu_errata.c    |  6 +++---
 arch/arm64/tools/cpucaps          |  2 +-
 5 files changed, 35 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index b3afe0688919b..7571104215435 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -701,12 +701,34 @@ config ARM64_ERRATUM_1530923
 
 	  If unsure, say Y.
 
-config ARM64_WORKAROUND_REPEAT_TLBI
+config ARM64_WORKAROUND_REPEAT_TLBI_SYNC
 	bool
+	# This workaround is (only) suitable for TLB invalidation errata where
+	# all of the following conditions are true:
+	#
+	# - The effects of the errata are only a loss of ordering/completion
+	#   for explicit memory accesses when the TLBI is completed with a DSB.
+	#   The removal of TLB entries is not affected.
+	#
+	#   Note that architecturally, S2-only invalidation does not remove
+	#   combined S1+S2 entries, and does not complete accesses translated
+	#   via those S1+S2 entries. Consequently, where this condition holds,
+	#   the errata do not affect S2-only invalidation.
+	#
+	# - The errata only affect broadcast TLB invalidation operations (e.g.
+	#   TLBI VMALLE1IS), and do not affect local TLB invalidation
+	#   operations (e.g. TLBI VMALLE1).
+	#
+	# - After any number of affected TLBI operations are completed with a
+	#   DSB, the errata can be mitigated by executing a single arbitrary
+	#   broadcast TLBI (which targets an arbitrary translation regime),
+	#   followed by a DSB.
+	#
+	# For more rationale, see commit a8f78680ee6bf795.
 
 config ARM64_ERRATUM_2441007
 	bool "Cortex-A55: Completion of affected memory accesses might not be guaranteed by completion of a TLBI (rare)"
-	select ARM64_WORKAROUND_REPEAT_TLBI
+	select ARM64_WORKAROUND_REPEAT_TLBI_SYNC
 	help
 	  This option adds a workaround for ARM Cortex-A55 erratum #2441007.
 
@@ -722,7 +744,7 @@ config ARM64_ERRATUM_2441007
 
 config ARM64_ERRATUM_1286807
 	bool "Cortex-A76: Modification of the translation table for a virtual address might lead to read-after-read ordering violation (rare)"
-	select ARM64_WORKAROUND_REPEAT_TLBI
+	select ARM64_WORKAROUND_REPEAT_TLBI_SYNC
 	help
 	  This option adds a workaround for ARM Cortex-A76 erratum 1286807.
 
@@ -944,7 +966,7 @@ config ARM64_ERRATUM_2224489
 
 config ARM64_ERRATUM_2441009
 	bool "Cortex-A510: Completion of affected memory accesses might not be guaranteed by completion of a TLBI (rare)"
-	select ARM64_WORKAROUND_REPEAT_TLBI
+	select ARM64_WORKAROUND_REPEAT_TLBI_SYNC
 	help
 	  This option adds a workaround for ARM Cortex-A510 erratum #2441009.
 
@@ -1156,7 +1178,7 @@ config ARM64_ERRATUM_4193714
 config ARM64_ERRATUM_4118414
 	bool "Various: Completion of affected memory accesses might not be guaranteed by completion of a TLBI"
 	default y
-	select ARM64_WORKAROUND_REPEAT_TLBI
+	select ARM64_WORKAROUND_REPEAT_TLBI_SYNC
 	help
 	  This option adds a workaround for the following errata:
 
@@ -1340,7 +1362,7 @@ config QCOM_FALKOR_ERRATUM_1003
 config QCOM_FALKOR_ERRATUM_1009
 	bool "Falkor E1009: Prematurely complete a DSB after a TLBI"
 	default y
-	select ARM64_WORKAROUND_REPEAT_TLBI
+	select ARM64_WORKAROUND_REPEAT_TLBI_SYNC
 	help
 	  On Falkor v1, the CPU may prematurely complete a DSB following a
 	  TLBI xxIS invalidate maintenance operation. Repeat the TLBI operation
diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h
index 25c61cda901c5..76350b38f0d7a 100644
--- a/arch/arm64/include/asm/cpucaps.h
+++ b/arch/arm64/include/asm/cpucaps.h
@@ -60,8 +60,8 @@ cpucap_is_possible(const unsigned int cap)
 		return IS_ENABLED(CONFIG_CAVIUM_ERRATUM_23154);
 	case ARM64_WORKAROUND_DISABLE_CNP:
 		return IS_ENABLED(CONFIG_ARM64_WORKAROUND_DISABLE_CNP);
-	case ARM64_WORKAROUND_REPEAT_TLBI:
-		return IS_ENABLED(CONFIG_ARM64_WORKAROUND_REPEAT_TLBI);
+	case ARM64_WORKAROUND_REPEAT_TLBI_SYNC:
+		return IS_ENABLED(CONFIG_ARM64_WORKAROUND_REPEAT_TLBI_SYNC);
 	case ARM64_WORKAROUND_SPECULATIVE_SSBS:
 		return IS_ENABLED(CONFIG_ARM64_ERRATUM_3194386);
 	case ARM64_WORKAROUND_4193714:
diff --git a/arch/arm64/include/asm/tlbflush.h b/arch/arm64/include/asm/tlbflush.h
index d52ac8c17190d..bd68ca6df62ba 100644
--- a/arch/arm64/include/asm/tlbflush.h
+++ b/arch/arm64/include/asm/tlbflush.h
@@ -268,7 +268,7 @@ static inline void __tlbi_level(tlbi_op op, u64 addr, u32 level)
 
 #define __repeat_tlbi_sync(op, arg...)						\
 do {										\
-	if (!alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI))	\
+	if (!alternative_has_cap_unlikely(ARM64_WORKAROUND_REPEAT_TLBI_SYNC))	\
 		break;								\
 	__tlbi(op, ##arg);							\
 	dsb(ish);								\
diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c
index 1995e1198648e..685077d44ad17 100644
--- a/arch/arm64/kernel/cpu_errata.c
+++ b/arch/arm64/kernel/cpu_errata.c
@@ -309,7 +309,7 @@ static void cpu_enable_impdef_pmuv3_traps(const struct arm64_cpu_capabilities *_
 	sysreg_clear_set_s(SYS_HACR_EL2, 0, BIT(56));
 }
 
-#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
+#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI_SYNC
 static const struct arm64_cpu_capabilities arm64_repeat_tlbi_list[] = {
 #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009
 	{
@@ -733,10 +733,10 @@ const struct arm64_cpu_capabilities arm64_errata[] = {
 		.match_list = qcom_erratum_1003_list,
 	},
 #endif
-#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI
+#ifdef CONFIG_ARM64_WORKAROUND_REPEAT_TLBI_SYNC
 	{
 		.desc = "Broken broadcast TLBI completion",
-		.capability = ARM64_WORKAROUND_REPEAT_TLBI,
+		.capability = ARM64_WORKAROUND_REPEAT_TLBI_SYNC,
 		.type = ARM64_CPUCAP_LOCAL_CPU_ERRATUM,
 		.matches = cpucap_multi_entry_cap_matches,
 		.match_list = arm64_repeat_tlbi_list,
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index 9b85a84f6fd49..f8368e5d81a8e 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -124,7 +124,7 @@ WORKAROUND_DISABLE_CNP
 WORKAROUND_PMUV3_IMPDEF_TRAPS
 WORKAROUND_QCOM_FALKOR_E1003
 WORKAROUND_QCOM_ORYON_CNTVOFF
-WORKAROUND_REPEAT_TLBI
+WORKAROUND_REPEAT_TLBI_SYNC
 WORKAROUND_SPECULATIVE_AT
 WORKAROUND_SPECULATIVE_SSBS
 WORKAROUND_SPECULATIVE_UNPRIV_LOAD
-- 
2.30.2



^ permalink raw reply related

* Re: [PATCH v14 0/7] Provide support for Trigger Generation Unit
From: Songwei.Chai @ 2026-06-29 10:17 UTC (permalink / raw)
  To: Greg KH
  Cc: andersson, alexander.shishkin, mike.leach, konrad.dybcio,
	suzuki.poulose, james.clark, krzk+dt, conor+dt, linux-kernel,
	linux-arm-kernel, linux-arm-msm, coresight, devicetree
In-Reply-To: <2026062959-distaste-launder-e253@gregkh>

On 6/29/2026 12:22 PM, Greg KH wrote:
> On Mon, Jun 29, 2026 at 11:03:33AM +0800, Songwei.Chai wrote:
>> Hi Greg & Alexander,
>>
>> Apologies for interrupting again.
>>
>> As the TGU hardware plays an important role in Qualcomm tracing design, I
>> would greatly appreciate it if you could kindly take some time to review
>> this at your earliest convenience.
> The merge window _just_ closed, please give us a chance to catch up.
>
> Also, why us?  Surely you have other reviewers for this code, right?

Hi Greg,

Understood, thanks for letting us know.

Regarding your question: since this introduces a new 
drivers/hwtracing/qcom directory, there is no existing maintainer for it.
Given your scope (and Alexander's), we believe you are the most relevant 
reviewers.

The reason for creating the qcom directory is as follows:

We previously tried to upstream this driver under
drivers/hwtracing/coresight, but it was not accepted as it is considered
Qualcomm-specific and not tightly coupled with the CoreSight subsystem.
Based on this feedback, we are exploring a dedicated
drivers/hwtracing/qcom directory, similar to intel_th, to better support
this and future Qualcomm hwtracing drivers.

More details can be found in “[PATCH v14 0/7] -- Why we are proposing this”.

Thanks,
Songwei

>
> thanks,
>
> greg k-h

^ permalink raw reply

* Re: [PATCH v2] ARM: dts: exynos: Add bluetooth support to manta
From: Krzysztof Kozlowski @ 2026-06-29 10:22 UTC (permalink / raw)
  To: Rob Herring, Krzysztof Kozlowski, Conor Dooley, Alim Akhtar,
	Lukas Timmermann
  Cc: devicetree, linux-arm-kernel, linux-samsung-soc, linux-kernel,
	Alexandre Marquet
In-Reply-To: <178272778358.113362.3049339184584034398.b4-ty@b4>

On 29/06/2026 12:09, Krzysztof Kozlowski wrote:
> 
> On Sun, 14 Jun 2026 22:16:35 +0200, Lukas Timmermann wrote:
>> Enable the bcm4330-bt device for manta boards on serial0.
>> Also adds the necessary pin definitions and interrupt handling for
>> wakeup.
> 
> Applied, thanks!
> 
> [1/1] ARM: dts: exynos: Add bluetooth support to manta
>       https://git.kernel.org/krzk/linux/c/718b15471c2b13a4830e80efbb489c2a849060d1
> 

And still incorrect DCO. Checkpatch tells you that, so please run it.

I fixed it up, although already after pushing so all builds will now
complain. That's super annoying. I will reject future patches which
ignore checkpatch.

Best regards,
Krzysztof


^ permalink raw reply

* [PATCH 0/2] arm64: dts: rockchip: fix Li-Po overcharge on Powkiddy RGB10 Max 3 / X55
From: Juan Manuel @ 2026-06-29 10:32 UTC (permalink / raw)
  To: macromorgan, heiko
  Cc: linux-rockchip, devicetree, linux-arm-kernel, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 2017 bytes --]

Hi Chris, Heiko,

While bringing up a couple of Powkiddy RK3566 handhelds I ran into a
battery problem that turns out to be in the device trees, and it has
already cost me two packs, so I'd like to get it fixed for everyone.

Both battery nodes charge the cell above its own declared full voltage:

rk3566-powkiddy-rk2023.dtsi (inherited by the RGB10 Max 3):
constant-charge-voltage-max-microvolt = 4250000 (4.25 V), but
voltage-max-design-microvolt and the ocv-capacity-table-0 100% point
are both 4172000 (4.172 V).

rk3566-powkiddy-x55.dts: constant-charge-voltage-max-microvolt =
4300000 (4.30 V), but voltage-max-design-microvolt and the
ocv-capacity-table-0 100% point are both 4138000 (4.138 V).

So the charger drives each cell ~80–160 mV past its own OCV-100% point
on every cycle. On a standard 4.2 V Li-Po that is an overcharge: it
raises the cell's internal resistance and kills the pack early. The
symptom is textbook — the pack reads a normal voltage/SoC while on the
charger but collapses under load and shuts the device off the moment
it's unplugged. I lost two packs to this before tracing it to the DT;
capping the charge voltage at 4.2 V (verified at the rk817 CHRG_OUT
register) stopped the damage, and a third, already-degraded pack
stabilised.

Patch 1 also corrects the RGB10 Max 3 design capacity: it ships a 4000
mAh cell but inherits the 3151 mAh value from rk2023.dtsi. I did this
as a per-board override so I don't touch the shared profile, which may
well be correct for the RGB30 and other rk2023 users.

One thing worth a look on your side: the shared
rk3566-powkiddy-rk2023.dtsi default itself (4.25 V against a 4.172 V
OCV-100% point) looks like it would overcharge any device using it,
not just the RGB10 Max 3 — but I only have the two units above to test
on, so I've kept the fix scoped to what I can verify.

Thanks a lot for all the handheld DT work; none of these devices would
run mainline without it.

Juan Manuel Lopez Carrillo

[-- Attachment #2: 0001-arm64-dts-rockchip-powkiddy-rgb10max3-fix-battery-pr.patch --]
[-- Type: text/x-patch, Size: 2109 bytes --]

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Juan Manuel Lopez Carrillo <juanmanuellopezcarrillo@gmail.com>
Date: Sun, 29 Jun 2026 12:00:00 +0200
Subject: [PATCH 1/2] arm64: dts: rockchip: powkiddy-rgb10max3: fix battery
 profile

The Powkiddy RGB10 Max 3 ships with a 4000 mAh pack, but it inherits its
battery node from rk3566-powkiddy-rk2023.dtsi, which describes a 3151 mAh
cell and, more importantly, sets constant-charge-voltage-max-microvolt to
4250000 (4.25 V).

That charge voltage is above this pack's declared full voltage: the
inherited voltage-max-design-microvolt and the ocv-capacity-table-0 100%
point are both 4172000 (4.172 V). The charger therefore drives the cell
~78 mV past its own declared "full" on every cycle.

For a standard 4.2 V Li-Po this is an overcharge. It raises the cell's
internal resistance and kills the pack prematurely. The failure mode seen
in the field is characteristic: the pack reads a plausible voltage/SoC
while on the charger but collapses under load (and shuts the device off)
as soon as it is unplugged. Two packs were lost this way before the cause
was traced to the DT.

Override the node for this board with the correct 4000 mAh design capacity
and a safe 4.2 V charge ceiling, at/below the cell design max and the
OCV-100% point. The charge current limit (2 A = 0.5C) and the OCV curve
are left unchanged.

Signed-off-by: Juan Manuel Lopez Carrillo <juanmanuellopezcarrillo@gmail.com>
---
 arch/arm64/boot/dts/rockchip/rk3566-powkiddy-rgb10max3.dts | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/arch/arm64/boot/dts/rockchip/rk3566-powkiddy-rgb10max3.dts b/arch/arm64/boot/dts/rockchip/rk3566-powkiddy-rgb10max3.dts
--- a/arch/arm64/boot/dts/rockchip/rk3566-powkiddy-rgb10max3.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3566-powkiddy-rgb10max3.dts
@@ -12,6 +12,11 @@
 	compatible = "powkiddy,rgb10max3", "rockchip,rk3566";
 };

+&battery {
+	charge-full-design-microamp-hours = <4000000>;
+	constant-charge-voltage-max-microvolt = <4200000>;
+};
+
 &bluetooth {
 	compatible = "realtek,rtl8723ds-bt";
 };
--
2.43.0

[-- Attachment #3: 0002-arm64-dts-rockchip-powkiddy-x55-cap-battery-charge-4.patch --]
[-- Type: text/x-patch, Size: 1723 bytes --]

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Juan Manuel Lopez Carrillo <juanmanuellopezcarrillo@gmail.com>
Date: Sun, 29 Jun 2026 12:05:00 +0200
Subject: [PATCH 2/2] arm64: dts: rockchip: powkiddy-x55: cap battery charge
 voltage at 4.2V

The x55 battery node sets constant-charge-voltage-max-microvolt to
4300000 (4.30 V), but the same node declares voltage-max-design-microvolt
and an ocv-capacity-table-0 100% point of 4138000 (4.138 V). The charger
therefore drives the pack ~162 mV above its own declared full voltage on
every cycle.

This overcharges the standard 4.2 V Li-Po, raising its internal resistance
and killing it early - it reads fine on the charger but collapses under
load once unplugged. Cap the charge voltage at the standard, safe 4.2 V.
Design capacity (4000 mAh) and charge current (2 A) are already correct.

Signed-off-by: Juan Manuel Lopez Carrillo <juanmanuellopezcarrillo@gmail.com>
---
 arch/arm64/boot/dts/rockchip/rk3566-powkiddy-x55.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/rockchip/rk3566-powkiddy-x55.dts b/arch/arm64/boot/dts/rockchip/rk3566-powkiddy-x55.dts
--- a/arch/arm64/boot/dts/rockchip/rk3566-powkiddy-x55.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3566-powkiddy-x55.dts
@@ -77,7 +77,7 @@
 		charge-full-design-microamp-hours = <4000000>;
 		charge-term-current-microamp = <300000>;
 		constant-charge-current-max-microamp = <2000000>;
-		constant-charge-voltage-max-microvolt = <4300000>;
+		constant-charge-voltage-max-microvolt = <4200000>;
 		factory-internal-resistance-micro-ohms = <91000>;
 		voltage-max-design-microvolt = <4138000>;
 		voltage-min-design-microvolt = <3400000>;
--
2.43.0

^ permalink raw reply

* Re: [PATCH v3 1/3] dt-bindings: clock: exynos990: Add CLK_GOUT_PERIS_TMU_SUB_PCLK
From: Peter Griffin @ 2026-06-29 10:37 UTC (permalink / raw)
  To: Denzeel Oliva
  Cc: Krzysztof Kozlowski, Sylwester Nawrocki, Chanwoo Choi,
	Alim Akhtar, Michael Turquette, Stephen Boyd, Brian Masney,
	Rob Herring, Conor Dooley, linux-samsung-soc, linux-clk,
	devicetree, linux-arm-kernel, linux-kernel
In-Reply-To: <20260613-exynos990-peris-fix-v3-v3-1-2b230db78ae4@gmail.com>

On Sat, 13 Jun 2026 at 13:36, Denzeel Oliva <wachiturroxd150@gmail.com> wrote:
>
> Add the missing TMU_SUB_PCLK clock ID for the Exynos990 PERIS CMU.
>
> Signed-off-by: Denzeel Oliva <wachiturroxd150@gmail.com>
> ---

Reviewed-by: Peter Griffin <peter.griffin@linaro.org>

>  include/dt-bindings/clock/samsung,exynos990.h | 1 +
>  1 file changed, 1 insertion(+)
>
> diff --git a/include/dt-bindings/clock/samsung,exynos990.h b/include/dt-bindings/clock/samsung,exynos990.h
> index 47540307cb52..c06f591d9d90 100644
> --- a/include/dt-bindings/clock/samsung,exynos990.h
> +++ b/include/dt-bindings/clock/samsung,exynos990.h
> @@ -434,5 +434,6 @@
>  #define CLK_GOUT_PERIS_TMU_TOP_PCLK            17
>  #define CLK_GOUT_PERIS_OTP_CON_BIRA_OSCCLK     18
>  #define CLK_GOUT_PERIS_OTP_CON_TOP_OSCCLK      19
> +#define CLK_GOUT_PERIS_TMU_SUB_PCLK            20
>
>  #endif
>
> --
> 2.54.0
>


^ permalink raw reply

* Re: [PATCH v3 3/5] KVM: arm64: nv: Avoid full shadow s2 unmap
From: Wei-Lin Chang @ 2026-06-29 10:38 UTC (permalink / raw)
  To: Marc Zyngier
  Cc: linux-arm-kernel, kvmarm, linux-kernel, Oliver Upton, Joey Gouly,
	Suzuki K Poulose, Zenghui Yu, Catalin Marinas, Will Deacon
In-Reply-To: <86cxyfvgf4.wl-maz@kernel.org>

Hi Marc,

Sorry for the late reply.

On Thu, May 28, 2026 at 01:59:11PM +0100, Marc Zyngier wrote:
> On Sun, 10 May 2026 15:53:36 +0100,
> Wei-Lin Chang <weilin.chang@arm.com> wrote:
> > 
> > Currently we are forced to fully unmap all shadow stage-2 for a VM when
> > unmapping a page from the canonical stage-2, for example during an MMU
> > notifier call. This is because we are not tracking what canonical IPA
> > are mapped in the shadow stage-2 page tables hence there is no way to
> > know what to unmap.
> > 
> > Create a per kvm_s2_mmu maple tree to track canonical IPA range ->
> > nested IPA range, so that it is possible to partially unmap shadow
> > stage-2 when a canonical IPA range is unmapped. The algorithm is simple
> > and conservative:
> > 
> > At each shadow stage-2 map, insert the nested IPA range into the maple
> > tree, with the canonical IPA range as the key. If the canonical IPA
> > range doesn't overlap with existing ranges in the tree, insert as is,
> > and a reverse mapping for this range is established. But if the
> > canonical IPA range overlaps with any existing ranges in the tree,
> > create a new range that spans all the overlapping ranges including the
> > input range and replace those existing ranges. In the mean time, mark
> > this new spanning canonical IPA range with an "UNKNOWN_IPA" bit,
> > indicating we give up tracking the nested IPA ranges that map to this
> > canonical IPA range.
> > 
> > The maple tree's 64 bit entry is enough to store the nested IPA and
> > the UNKNOWN_IPA status, therefore besides maple tree's internal
> > operation, memory allocation is avoided.
> > 
> > Example:
> > |||| means existing range, ---- means empty range
> > 
> > input:            $$$$$$$$$$$$$$$$$$$$$$$$$$
> > tree:  --||||-----|||||||---------||||||||||-----------
> > 
> > insert spanning range and replace overlapping ones:
> >        --||||-----||||||||||||||||||||||||||-----------
> >                   ^^^^marked UNKNOWN_IPA^^^^
> > 
> > With the reverse map created, when a canonical IPA range gets unmapped,
> > look into each s2 mmu's maple tree and look for canonical IPA ranges
> > affected, and based on their UNKNOWN_IPA status:
> > 
> > UNKNOWN_IPA     -> fall back and fully unmap the current shadow
> >                    stage-2, also clear the tree
> > 
> > not UNKNOWN_IPA -> unmap the nested IPA range, and remove the reverse
> >                    map entry
> > 
> > Suggested-by: Marc Zyngier <maz@kernel.org>
> > Signed-off-by: Wei-Lin Chang <weilin.chang@arm.com>
> > ---
> >  arch/arm64/include/asm/kvm_host.h   |   4 +
> >  arch/arm64/include/asm/kvm_nested.h |   4 +
> >  arch/arm64/kvm/mmu.c                |  27 ++++--
> >  arch/arm64/kvm/nested.c             | 140 +++++++++++++++++++++++++++-
> >  4 files changed, 167 insertions(+), 8 deletions(-)
> > 
> > diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> > index 1a56d137df10..dc4c0bce1bbb 100644
> > --- a/arch/arm64/include/asm/kvm_host.h
> > +++ b/arch/arm64/include/asm/kvm_host.h
> > @@ -223,6 +223,10 @@ struct kvm_s2_mmu {
> >  	 */
> >  	bool	pending_unmap;
> >  
> > +	bool	nested_revmap_broken;
> > +	/* canonical IPA to nested IPA range lookup */
> > +	struct maple_tree nested_revmap_mt;
> > +
> >  #ifdef CONFIG_PTDUMP_STAGE2_DEBUGFS
> >  	struct dentry *shadow_pt_debugfs_dentry;
> >  #endif
> > diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
> > index 091544e6af44..5cbf78dfc685 100644
> > --- a/arch/arm64/include/asm/kvm_nested.h
> > +++ b/arch/arm64/include/asm/kvm_nested.h
> > @@ -76,6 +76,8 @@ extern void kvm_s2_mmu_iterate_by_vmid(struct kvm *kvm, u16 vmid,
> >  				       const union tlbi_info *info,
> >  				       void (*)(struct kvm_s2_mmu *,
> >  						const union tlbi_info *));
> > +extern void kvm_record_nested_revmap(gpa_t gpa, struct kvm_s2_mmu *mmu,
> > +				     gpa_t fault_ipa, size_t map_size);
> >  extern void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu);
> >  extern void kvm_vcpu_put_hw_mmu(struct kvm_vcpu *vcpu);
> >  
> > @@ -164,6 +166,8 @@ extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
> >  				    struct kvm_s2_trans *trans);
> >  extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
> >  extern void kvm_nested_s2_wp(struct kvm *kvm);
> > +extern void kvm_unmap_gfn_range_nested(struct kvm *kvm, gpa_t gpa, size_t size,
> > +				       bool may_block);
> >  extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block);
> >  extern void kvm_nested_s2_flush(struct kvm *kvm);
> >  
> > diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> > index e4becd5cdf36..ce0bd88cd3c1 100644
> > --- a/arch/arm64/kvm/mmu.c
> > +++ b/arch/arm64/kvm/mmu.c
> > @@ -5,6 +5,7 @@
> >   */
> >  
> >  #include <linux/acpi.h>
> > +#include <linux/maple_tree.h>
> >  #include <linux/mman.h>
> >  #include <linux/kvm_host.h>
> >  #include <linux/io.h>
> > @@ -1099,6 +1100,7 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
> >  {
> >  	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
> >  	struct kvm_pgtable *pgt = NULL;
> > +	struct maple_tree *revmap_mt = &mmu->nested_revmap_mt;
> >  
> >  	write_lock(&kvm->mmu_lock);
> >  	pgt = mmu->pgt;
> > @@ -1108,8 +1110,11 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
> >  		free_percpu(mmu->last_vcpu_ran);
> >  	}
> >  
> > -	if (kvm_is_nested_s2_mmu(kvm, mmu))
> > +	if (kvm_is_nested_s2_mmu(kvm, mmu)) {
> > +		if (!mtree_empty(revmap_mt))
> > +			mtree_destroy(revmap_mt);
> >  		kvm_init_nested_s2_mmu(mmu);
> > +	}
> >  
> >  	write_unlock(&kvm->mmu_lock);
> >  
> > @@ -1631,6 +1636,10 @@ static int gmem_abort(const struct kvm_s2_fault_desc *s2fd)
> >  		goto out_unlock;
> >  	}
> >  
> > +	if (s2fd->nested)
> > +		kvm_record_nested_revmap(gfn << PAGE_SHIFT, pgt->mmu,
> > +					 s2fd->fault_ipa, PAGE_SIZE);
> > +
> >  	ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, s2fd->fault_ipa, PAGE_SIZE,
> >  						 __pfn_to_phys(pfn), prot,
> >  						 memcache, flags);
> > @@ -2034,6 +2043,10 @@ static int kvm_s2_fault_map(const struct kvm_s2_fault_desc *s2fd,
> >  		ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, gfn_to_gpa(gfn),
> >  								 prot, flags);
> >  	} else {
> > +		if (s2fd->nested)
> > +			kvm_record_nested_revmap(canonical_gpa, pgt->mmu,
> > +						 gfn_to_gpa(gfn), mapping_size);
> > +
> >  		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, gfn_to_gpa(gfn), mapping_size,
> >  							 __pfn_to_phys(pfn), prot,
> >  							 memcache, flags);
> > @@ -2389,14 +2402,16 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> >  
> >  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
> >  {
> > +	gpa_t gpa = range->start << PAGE_SHIFT;
> > +	size_t size = (range->end - range->start) << PAGE_SHIFT;
> > +	bool may_block = range->may_block;
> > +
> >  	if (!kvm->arch.mmu.pgt || kvm_vm_is_protected(kvm))
> >  		return false;
> >  
> > -	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
> > -			     (range->end - range->start) << PAGE_SHIFT,
> > -			     range->may_block);
> > +	__unmap_stage2_range(&kvm->arch.mmu, gpa, size, may_block);
> 
> This sort of cleanups could be in a separate patch.

Ack.

> 
> > +	kvm_unmap_gfn_range_nested(kvm, gpa, size, may_block);
> >  
> > -	kvm_nested_s2_unmap(kvm, range->may_block);
> >  	return false;
> >  }
> >  
> > @@ -2674,7 +2689,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
> >  
> >  	write_lock(&kvm->mmu_lock);
> >  	kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
> > -	kvm_nested_s2_unmap(kvm, true);
> > +	kvm_unmap_gfn_range_nested(kvm, gpa, size, true);
> >  	write_unlock(&kvm->mmu_lock);
> >  }
> >  
> > diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> > index 883b6c1008fb..35b5d5f21a23 100644
> > --- a/arch/arm64/kvm/nested.c
> > +++ b/arch/arm64/kvm/nested.c
> > @@ -7,6 +7,7 @@
> >  #include <linux/bitfield.h>
> >  #include <linux/kvm.h>
> >  #include <linux/kvm_host.h>
> > +#include <linux/maple_tree.h>
> >  
> >  #include <asm/fixmap.h>
> >  #include <asm/kvm_arm.h>
> > @@ -43,6 +44,20 @@ struct vncr_tlb {
> >   */
> >  #define S2_MMU_PER_VCPU		2
> >  
> > +/*
> > + * Per shadow S2 reverse map (IPA -> nested IPA range) maple tree payload
> > + * layout:
> > + *
> > + * bit  62:     valid, prevents the case where the nested IPA is 0 and turning
> > + *              the whole value to 0
> > + * bits 55-12:  nested IPA bits 55-12
> > + * bit  0:      UNKNOWN_IPA bit, 1 indicates we give up on tracking what nested
> > + *              IPA maps to this canonical IPA in the shadow stage-2
> > + */
> > +#define VALID_ENTRY		BIT(62)
> > +#define ADDR_MASK		GENMASK_ULL(55, 12)
> > +#define UNKNOWN_IPA		BIT(0)
> > +
> >  void kvm_init_nested(struct kvm *kvm)
> >  {
> >  	kvm->arch.nested_mmus = NULL;
> > @@ -769,12 +784,57 @@ static struct kvm_s2_mmu *get_s2_mmu_nested(struct kvm_vcpu *vcpu)
> >  	return s2_mmu;
> >  }
> >  
> > +void kvm_record_nested_revmap(gpa_t ipa, struct kvm_s2_mmu *mmu,
> > +			      gpa_t fault_ipa, size_t map_size)
> 
> The name fault_ipa doesn't really make sense here. This is the IPA as
> seen from L1 (the input to the L1 S2 tables). We indeed obtain it from
> a fault, but that should not influence the naming here.
> 
> Similarly, 'ipa' should be qualified a bit better to reflect that this
> is the L0 IPA.

Yeah these names aren't very clear. I'll change to use 'canonical IPA',
and 'nested IPA'.

> 
> > +{
> > +	struct maple_tree *revmap_mt = &mmu->nested_revmap_mt;
> > +	gpa_t ipa_end = ipa + map_size - 1;
> 
> Are you always guaranteed that ipa is aligned on map_size?

From what I see in kvm_s2_fault_map() and gmem_abort(), yes, the ipa is
aligned to map_size.

What about adding

	if (WARN_ON(!IS_ALIGNED(canonical_ipa, map_size))) {
		canonical_ipa = ALIGN_DOWN(canonical_ipa, map_size);
		canonical_ipa_end = canonical_ipa + map_size - 1;
	}

to be safe?

> 
> > +	u64 entry, new_entry = 0;
> > +	MA_STATE(mas_rev, revmap_mt, ipa, ipa_end);
> > +
> > +	if (mmu->nested_revmap_broken)
> > +		return;
> > +
> > +	mtree_lock(revmap_mt);
> > +	entry = xa_to_value(mas_find_range(&mas_rev, ipa_end));
> > +
> > +	if (entry) {
> > +		/* maybe just a perm update... */
> > +		if (!(entry & UNKNOWN_IPA) && mas_rev.index == ipa &&
> 
> Shouldn't you check that VALID_ENTRY is set? Is the index guaranteed
> to match the L0 IPA?

Right now either VALID_ENTRY or UNKNOWN_IPA is set for an existing entry
in the tree; having both set is not expected. I think we can turn
VALID_ENTRY and UNKNOWN_IPA tests into helpers and warn if they are both
set in the helpers?

> 
> > +		    mas_rev.last == ipa_end &&
> > +		    fault_ipa == (entry & ADDR_MASK))
> 
> Again, I think there is a potential alignment issue here.
> 
> > +			goto unlock;
> > +		/*
> > +		 * Create a "UNKNOWN_IPA" range that spans all the overlapping
> > +		 * ranges and store it.
> > +		 */
> > +		while (entry && mas_rev.index <= ipa_end) {
> > +			ipa = min(mas_rev.index, ipa);
> > +			ipa_end = max(mas_rev.last, ipa_end);
> > +			entry = xa_to_value(mas_find_range(&mas_rev, ipa_end));
> > +		}
> > +		new_entry |= UNKNOWN_IPA;
> > +	} else {
> > +		new_entry |= fault_ipa;
> > +		new_entry |= VALID_ENTRY;
> > +	}
> > +
> > +	mas_set_range(&mas_rev, ipa, ipa_end);
> > +	if (mas_store_gfp(&mas_rev, xa_mk_value(new_entry),
> > +			  GFP_NOWAIT | __GFP_ACCOUNT))
> > +		mmu->nested_revmap_broken = true;
> 
> I really think we ought to track this event happening. Maybe a trace
> point.

Will do.

> 
> > +unlock:
> > +	mtree_unlock(revmap_mt);
> > +}
> > +
> >  void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
> >  {
> >  	/* CnP being set denotes an invalid entry */
> >  	mmu->tlb_vttbr = VTTBR_CNP_BIT;
> >  	mmu->nested_stage2_enabled = false;
> >  	atomic_set(&mmu->refcnt, 0);
> > +	mt_init(&mmu->nested_revmap_mt);
> > +	mmu->nested_revmap_broken = false;
> >  }
> >  
> >  void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
> > @@ -1150,6 +1210,82 @@ void kvm_nested_s2_wp(struct kvm *kvm)
> >  	kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
> >  }
> >  
> > +static void reset_revmap_and_unmap(struct kvm_s2_mmu *mmu, bool may_block)
> > +{
> > +	mtree_destroy(&mmu->nested_revmap_mt);
> > +	mmu->nested_revmap_broken = false;
> > +	kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block);
> > +}
> > +
> > +static void unmap_mmu_ipa_range(struct kvm_s2_mmu *mmu, gpa_t gpa,
> > +				  size_t unmap_size, bool may_block)
> 
> Same comment as above about the nature of 'gpa'. I *think* this is the
> L0 IPA, but please clarify.

Will use clearer names from now on.

> 
> > +{
> > +	struct maple_tree *revmap_mt = &mmu->nested_revmap_mt;
> > +	gpa_t ipa = gpa;
> > +	gpa_t ipa_end = gpa + unmap_size - 1;
> 
> Similar concerns about alignments.

For this function, I don't think there are any guarantees on alignment
for sizes > PAGE_SIZE, but I think if we make sure the range we unmap
completely covers the input range then things should be fine.

> 
> > +	u64 entry;
> > +	size_t entry_size;
> > +	MA_STATE(mas_rev, revmap_mt, gpa, ipa_end);
> > +
> > +	if (mmu->nested_revmap_broken) {
> > +		reset_revmap_and_unmap(mmu, may_block);
> > +		return;
> > +	}
> > +
> > +	mtree_lock(revmap_mt);
> > +	entry = xa_to_value(mas_find_range(&mas_rev, ipa_end));
> > +
> > +	while (entry && mas_rev.index <= ipa_end) {
> 
> I'm again concerned that the VALID bit is never checked.
> 
> > +		ipa = mas_rev.last + 1;
> > +		entry_size = mas_rev.last - mas_rev.index + 1;
> > +		/*
> > +		 * Give up and invalidate this s2 mmu if the unmap range
> > +		 * touches any UNKNOWN_IPA range.
> > +		 */
> > +		if (entry & UNKNOWN_IPA) {
> > +			mtree_unlock(revmap_mt);
> > +			reset_revmap_and_unmap(mmu, may_block);
> > +			return;
> > +		}
> > +
> > +		/*
> > +		 * Ignore result, it is okay if a reverse mapping erase
> > +		 * fails.
> > +		 */
> > +		mas_store_gfp(&mas_rev, NULL, GFP_NOWAIT | __GFP_ACCOUNT);
> > +
> > +		mtree_unlock(revmap_mt);
> > +		kvm_stage2_unmap_range(mmu, entry & ADDR_MASK, entry_size,
> > +				       may_block);
> > +		mtree_lock(revmap_mt);
> > +		/*
> > +		 * Other maple tree operations during preemption could render
> > +		 * this ma_state invalid, so reset it.
> > +		 */
> > +		mas_set_range(&mas_rev, ipa, ipa_end);
> > +		entry = xa_to_value(mas_find_range(&mas_rev, ipa_end));
> > +	}
> > +	mtree_unlock(revmap_mt);
> > +}
> > +
> > +void kvm_unmap_gfn_range_nested(struct kvm *kvm, gpa_t gpa, size_t size,
> > +				bool may_block)
> > +{
> > +	int i;
> > +
> > +	if (!kvm->arch.nested_mmus_size)
> > +		return;
> > +
> > +	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
> > +		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
> > +
> > +		if (kvm_s2_mmu_valid(mmu))
> > +			unmap_mmu_ipa_range(mmu, gpa, size, may_block);
> > +	}
> > +
> > +	kvm_invalidate_vncr_ipa(kvm, gpa, gpa + size);
> 
> I'm not overly fond of propagating the VNCR invalidation in all the S2
> manipulations. I understand why you are doing it here, but I think we
> need to have a better solution.

I agree. I think moving kvm_invalidate_vncr_ipa() up one level, to the
places where canonical IPAs are being invalidated, is slightly more
sensible.

> 
> Fundamentally, VNCR invalidation has nothing to do with S2. This
> really is a EL2 S1 thing. And given that you have a reverse map per
> s2_mmu, it would be easy enough to track VNCR TLBs through that.
> 
> It doesn't have to be part of this patch, but that would be a good
> thing to disentangle as a subsequent patch.

I'll have a think, thanks!

Thanks,
Wei-Lin Chang

> 
> Thanks,
> 
> 	M.
> 
> -- 
> Without deviation from the norm, progress is not possible.


^ permalink raw reply

* RE: [PATCH v6 3/4] reset: cix: add sky1 audss auxiliary reset driver
From: Joakim  Zhang @ 2026-06-29  9:11 UTC (permalink / raw)
  To: Philipp Zabel, mturquette@baylibre.com, sboyd@kernel.org,
	bmasney@redhat.com, robh@kernel.org, krzk+dt@kernel.org,
	conor+dt@kernel.org, Gary Yang
  Cc: cix-kernel-upstream, linux-clk@vger.kernel.org,
	devicetree@vger.kernel.org, linux-kernel@vger.kernel.org,
	linux-arm-kernel@lists.infradead.org
In-Reply-To: <0193c47ff4ca98b1a6cb56ed8f4d8876b54756d8.camel@pengutronix.de>


Hello, Philipp

> -----Original Message-----
> From: Philipp Zabel <p.zabel@pengutronix.de>
> Sent: Wednesday, June 24, 2026 4:30 PM
> To: Joakim Zhang <joakim.zhang@cixtech.com>; mturquette@baylibre.com;
> sboyd@kernel.org; bmasney@redhat.com; robh@kernel.org;
> krzk+dt@kernel.org; conor+dt@kernel.org; Gary Yang
> <gary.yang@cixtech.com>
> Cc: cix-kernel-upstream <cix-kernel-upstream@cixtech.com>; linux-
> clk@vger.kernel.org; devicetree@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-arm-kernel@lists.infradead.org
> Subject: Re: [PATCH v6 3/4] reset: cix: add sky1 audss auxiliary reset driver
> 
> EXTERNAL EMAIL
> 
> CAUTION: Suspicious Email from unusual domain.
> 
> On Di, 2026-06-23 at 15:08 +0800, joakim.zhang@cixtech.com wrote:
> > From: Joakim Zhang <joakim.zhang@cixtech.com>
> >
> > Add an auxiliary reset controller driver for the AUDSS CRU. Sixteen
> > software reset lines for audio subsystem peripherals are controlled
> > through one register in the CRU register map.
> >
> > The driver is created by the AUDSS clock platform driver and registers
> > the reset controller on the CRU device node.
> >
> > Signed-off-by: Joakim Zhang <joakim.zhang@cixtech.com>
> > ---
> >  drivers/reset/Kconfig            |  14 +++
> >  drivers/reset/Makefile           |   1 +
> >  drivers/reset/reset-sky1-audss.c | 192
> > +++++++++++++++++++++++++++++++
> >  3 files changed, 207 insertions(+)
> >  create mode 100644 drivers/reset/reset-sky1-audss.c
> >
> > diff --git a/drivers/reset/Kconfig b/drivers/reset/Kconfig index
> > d009eb0849a3..f74859b292ae 100644
> > --- a/drivers/reset/Kconfig
> > +++ b/drivers/reset/Kconfig
> > @@ -300,6 +300,20 @@ config RESET_SKY1
> >       help
> >         This enables the reset controller for Cix Sky1.
> >
> > +config RESET_SKY1_AUDSS
> > +     tristate "Cix Sky1 Audio Subsystem reset controller"
> > +     depends on ARCH_CIX || COMPILE_TEST
> > +     select AUXILIARY_BUS
> > +     select REGMAP_MMIO
> > +     default CLK_SKY1_AUDSS
> > +     help
> > +       Support for block-level software reset lines in the Cix Sky1
> > +       Audio Subsystem (AUDSS) Clock and Reset Unit. Sixteen reset
> > +       outputs for audio peripherals are controlled through the CRU
> > +       register map. The driver binds as an auxiliary device from
> > +       the AUDSS clock driver. Say M or Y here if you want to build
> > +       this driver.
> > +
> >  config RESET_SOCFPGA
> >       bool "SoCFPGA Reset Driver" if COMPILE_TEST && (!ARM
> || !ARCH_INTEL_SOCFPGA)
> >       default ARM && ARCH_INTEL_SOCFPGA diff --git
> > a/drivers/reset/Makefile b/drivers/reset/Makefile index
> > 3e52569bd276..e81407ea3e29 100644
> > --- a/drivers/reset/Makefile
> > +++ b/drivers/reset/Makefile
> > @@ -39,6 +39,7 @@ obj-$(CONFIG_RESET_RZV2H_USB2PHY) +=
> > reset-rzv2h-usb2phy.o
> >  obj-$(CONFIG_RESET_SCMI) += reset-scmi.o
> >  obj-$(CONFIG_RESET_SIMPLE) += reset-simple.o
> >  obj-$(CONFIG_RESET_SKY1) += reset-sky1.o
> > +obj-$(CONFIG_RESET_SKY1_AUDSS) += reset-sky1-audss.o
> >  obj-$(CONFIG_RESET_SOCFPGA) += reset-socfpga.o
> >  obj-$(CONFIG_RESET_SUNPLUS) += reset-sunplus.o
> >  obj-$(CONFIG_RESET_SUNXI) += reset-sunxi.o diff --git
> > a/drivers/reset/reset-sky1-audss.c b/drivers/reset/reset-sky1-audss.c
> > new file mode 100644
> > index 000000000000..20870f37d7d7
> > --- /dev/null
> > +++ b/drivers/reset/reset-sky1-audss.c
> > @@ -0,0 +1,192 @@
> > +// SPDX-License-Identifier: GPL-2.0-only
> > +/*
> > + * Cix Sky1 Audio Subsystem reset controller driver
> > + *
> > + * Copyright 2026 Cix Technology Group Co., Ltd.
> > + */
> > +
> > +#include <dt-bindings/reset/cix,sky1-audss-cru.h>
> > +
> > +#include <linux/auxiliary_bus.h>
> > +#include <linux/delay.h>
> > +#include <linux/device.h>
> > +#include <linux/io.h>
> > +#include <linux/module.h>
> > +#include <linux/of.h>
> > +#include <linux/of_address.h>
> > +#include <linux/regmap.h>
> > +#include <linux/reset-controller.h>
> > +
> > +#define SKY1_RESET_SLEEP_MIN_US              50
> > +#define SKY1_RESET_SLEEP_MAX_US              100
> > +
> > +#define AUDSS_SW_RST                 0x78
> > +
> > +struct sky1_audss_reset_map {
> > +     unsigned int offset;
> > +     unsigned int mask;
> > +};
> > +
> > +struct sky1_audss_reset {
> > +     struct reset_controller_dev rcdev;
> > +     struct regmap *regmap;
> > +     const struct sky1_audss_reset_map *map; };
> > +
> > +static const struct sky1_audss_reset_map sky1_audss_reset_map[] = {
> > +     [AUDSS_I2S0_SW_RST]   = { AUDSS_SW_RST, BIT(0) },
> > +     [AUDSS_I2S1_SW_RST]   = { AUDSS_SW_RST, BIT(1) },
> > +     [AUDSS_I2S2_SW_RST]   = { AUDSS_SW_RST, BIT(2) },
> > +     [AUDSS_I2S3_SW_RST]   = { AUDSS_SW_RST, BIT(3) },
> > +     [AUDSS_I2S4_SW_RST]   = { AUDSS_SW_RST, BIT(4) },
> > +     [AUDSS_I2S5_SW_RST]   = { AUDSS_SW_RST, BIT(5) },
> > +     [AUDSS_I2S6_SW_RST]   = { AUDSS_SW_RST, BIT(6) },
> > +     [AUDSS_I2S7_SW_RST]   = { AUDSS_SW_RST, BIT(7) },
> > +     [AUDSS_I2S8_SW_RST]   = { AUDSS_SW_RST, BIT(8) },
> > +     [AUDSS_I2S9_SW_RST]   = { AUDSS_SW_RST, BIT(9) },
> > +     [AUDSS_WDT_SW_RST]    = { AUDSS_SW_RST, BIT(10) },
> > +     [AUDSS_TIMER_SW_RST]  = { AUDSS_SW_RST, BIT(11) },
> > +     [AUDSS_MB0_SW_RST]    = { AUDSS_SW_RST, BIT(12) },
> > +     [AUDSS_MB1_SW_RST]    = { AUDSS_SW_RST, BIT(13) },
> > +     [AUDSS_HDA_SW_RST]    = { AUDSS_SW_RST, BIT(14) },
> > +     [AUDSS_DMAC_SW_RST]   = { AUDSS_SW_RST, BIT(15) },
> > +};
> > +
> > +static struct sky1_audss_reset *to_sky1_audss_reset(struct
> > +reset_controller_dev *rcdev) {
> > +     return container_of(rcdev, struct sky1_audss_reset, rcdev); }
> > +
> > +static int sky1_audss_reset_set(struct reset_controller_dev *rcdev,
> > +                             unsigned long id, bool assert) {
> > +     struct sky1_audss_reset *priv = to_sky1_audss_reset(rcdev);
> > +     const struct sky1_audss_reset_map *signal = &priv->map[id];
> > +     unsigned int value = assert ? 0 : signal->mask;
> > +
> > +     return regmap_update_bits(priv->regmap, signal->offset,
> > + signal->mask, value);
> 
> Why does this propagate the return value ...
I'll propagate the return value in the ops callbacks.


> > +}
> > +
> > +static int sky1_audss_reset_assert(struct reset_controller_dev *rcdev,
> > +                                unsigned long id) {
> > +     sky1_audss_reset_set(rcdev, id, true);
> 
> ... only to be ignored? It'd be better to pass it on.
Yes, will add.


> > +     usleep_range(SKY1_RESET_SLEEP_MIN_US,
> SKY1_RESET_SLEEP_MAX_US);
> > +     return 0;
> > +}
> > +
> > +static int sky1_audss_reset_deassert(struct reset_controller_dev *rcdev,
> > +                                  unsigned long id) {
> > +     sky1_audss_reset_set(rcdev, id, false);
> > +     usleep_range(SKY1_RESET_SLEEP_MIN_US,
> SKY1_RESET_SLEEP_MAX_US);
> > +     return 0;
> > +}
> > +
> > +static int sky1_audss_reset(struct reset_controller_dev *rcdev,
> > +                         unsigned long id) {
> > +     sky1_audss_reset_assert(rcdev, id);
> > +     sky1_audss_reset_deassert(rcdev, id);
> > +     return 0;
> > +}
> 
> Will any AUDSS reset consumer use the reset_control_reset() API?
> If not, no need to implement this.
Will remove both .reset and .status.


> > +
> > +static int sky1_audss_reset_status(struct reset_controller_dev *rcdev,
> > +                                unsigned long id) {
> > +     struct sky1_audss_reset *priv = to_sky1_audss_reset(rcdev);
> > +     const struct sky1_audss_reset_map *signal = &priv->map[id];
> > +     unsigned int value;
> > +
> > +     regmap_read(priv->regmap, signal->offset, &value);
> > +     return !!(value & signal->mask); }
> > +
> > +static const struct reset_control_ops sky1_audss_reset_ops = {
> > +     .reset    = sky1_audss_reset,
> > +     .assert   = sky1_audss_reset_assert,
> > +     .deassert = sky1_audss_reset_deassert,
> > +     .status   = sky1_audss_reset_status,
> > +};
> > +
> > +static const struct regmap_config sky1_audss_regmap_config = {
> > +     .reg_bits = 32,
> > +     .val_bits = 32,
> > +     .reg_stride = 4,
> > +};
> > +
> > +static void sky1_audss_reset_iounmap(void *data) {
> > +     iounmap(data);
> > +}
> > +
> > +static int sky1_audss_reset_get_regmap(struct sky1_audss_reset *priv)
> > +{
> > +     struct device *dev = priv->rcdev.dev;
> > +     void __iomem *base;
> > +     int ret;
> > +
> > +     priv->regmap = dev_get_regmap(dev->parent, NULL);
> > +     if (priv->regmap)
> > +             return 0;
> > +
> > +     base = of_iomap(dev->parent->of_node, 0);
> > +     if (!base)
> > +             return dev_err_probe(dev, -ENOMEM, "failed to iomap
> > + address space\n");
> > +
> > +     ret = devm_add_action_or_reset(dev, sky1_audss_reset_iounmap, base);
> > +     if (ret)
> > +             return dev_err_probe(dev, ret, "failed to register
> > + iounmap action\n");
> > +
> > +     priv->regmap = devm_regmap_init_mmio(dev, base,
> &sky1_audss_regmap_config);
> > +     if (IS_ERR(priv->regmap))
> > +             return dev_err_probe(dev, PTR_ERR(priv->regmap),
> > +                                  "failed to initialize regmap\n");
> 
> Why is there a fallback path? The clock driver creates the regmap before
> creating the reset aux device, so dev_get_regmap() can never fail.
Agreed. 


> > +
> > +     return 0;
> > +}
> > +
> > +static int sky1_audss_reset_probe(struct auxiliary_device *adev,
> > +                               const struct auxiliary_device_id *id)
> > +{
> > +     struct sky1_audss_reset *priv;
> > +     struct device *dev = &adev->dev;
> > +     int ret;
> > +
> > +     priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL);
> > +     if (!priv)
> > +             return -ENOMEM;
> > +
> > +     priv->map = sky1_audss_reset_map;
> > +     priv->rcdev.owner = THIS_MODULE;
> > +     priv->rcdev.nr_resets = ARRAY_SIZE(sky1_audss_reset_map);
> > +     priv->rcdev.ops = &sky1_audss_reset_ops;
> > +     priv->rcdev.of_node = dev->parent->of_node;
> 
> auxiliary_device_create() uses device_set_of_node_from_dev() to inherit the
> parent of_node, so you can use dev->of_node here.
Done. rcdev.of_node now uses dev->of_node.


> > +     priv->rcdev.dev = dev;
> > +     priv->rcdev.of_reset_n_cells = 1;
> 
> No need to set of_reset_n_cells.
> 
> > +
> > +     dev_set_drvdata(dev, priv);
> 
> This seems unnecessary as well.
> 
> > +
> > +     ret = sky1_audss_reset_get_regmap(priv);
> > +     if (ret)
> > +             return dev_err_probe(dev, ret, "failed to get
> > + regmap\n");
> > +
> > +     return devm_reset_controller_register(dev, &priv->rcdev); }
> > +
> > +static const struct auxiliary_device_id sky1_audss_reset_ids[] = {
> > +     { .name = "clk_sky1_audss.reset" },
> > +     { }
> > +};
> > +MODULE_DEVICE_TABLE(auxiliary, sky1_audss_reset_ids);
> > +
> > +static struct auxiliary_driver sky1_audss_reset_driver = {
> > +     .probe = sky1_audss_reset_probe,
> > +     .id_table = sky1_audss_reset_ids, };
> > +
> 
> Drop this empty line.
Removed dev_set_drvdata() and the extra blank line.

Thanks,
Joakim


^ permalink raw reply

* Re: [PATCH v2 2/4] dt-bindings: phy: nuvoton,ma35d1-usb2-phy: extend for dual-port OTG support
From: Joey Lu @ 2026-06-29 10:40 UTC (permalink / raw)
  To: Krzysztof Kozlowski
  Cc: Vinod Koul, Neil Armstrong, Rob Herring, Krzysztof Kozlowski,
	Conor Dooley, Arnd Bergmann, Catalin Marinas, Jacky Huang,
	Shan-Chun Hung, Hui-Ping Chen, Joey Lu, linux-phy, devicetree,
	linux-arm-kernel, linux-kernel
In-Reply-To: <20260625-sexy-black-tarantula-4031a6@quoll>


On 6/25/2026 3:58 PM, Krzysztof Kozlowski wrote:
> On Thu, Jun 25, 2026 at 10:39:56AM +0800, Joey Lu wrote:
>>   properties:
>>     compatible:
>>       enum:
>>         - nuvoton,ma35d1-usb2-phy
>>   
>> +  reg:
>> +    maxItems: 1
>> +
>>     "#phy-cells":
>> -    const: 0
>> +    const: 1
>> +    description:
>> +      The single cell selects the PHY port. 0 selects the OTG port (USB0,
>> +      shared with DWC2 gadget controller) and 1 selects the host-only port
>> +      (USB1).
>>   
>> -  clocks:
>> -    maxItems: 1
> This is odd, considering that parent does not have clocks. So explain me
> this:
> 1. USB PHY needed clocks.
> 2. You extend USB PHY to cover second part.
> 3. That extension for second part means that clocks are not needed.
> Really, how? How is it possible in hardware?
The hardware has two independent clock domains:

   - The PHY analog block takes the 24 MHz HXT as its reference, wired
     directly to the PHY's internal PLL, which derives the required
     operating frequencies internally. This reference path is entirely
     outside the SoC software clock tree; no software-gatable clock gate
     needs to be enabled for the PHY to power up and lock its PLL. The only
     software control the PHY driver exercises is toggling each PHY's
     Power-On Reset (POR) bit, which resides in the SYS register block. The
     driver accesses this via the parent regmap.

   - `HUSBH0_GATE` / `HUSBH1_GATE` / `USBD_GATE` are AHB/APB bus interface
     clocks for the host and gadget controllers (EHCI, OHCI, DWC2). They
     gate the register-access path between the CPU and each controller,
     not the PHY analog circuitry itself.

The original single-port driver enabled `HUSBH0_GATE` as if it belonged
to the PHY, but that gate is actually owned by EHCI0/OHCI0 and is already
managed by those controller drivers through their own `clocks` DTS
bindings. The PHY driver was redundantly enabling the same gate.

When extending the driver to cover PHY1, the same pattern held: EHCI1/OHCI1
manage `HUSBH1_GATE` themselves. There is no clock that belongs
exclusively to the PHY, so `clocks` will be dropped from the PHY binding
entirely.
>> +  nuvoton,rcalcode:
>> +    $ref: /schemas/types.yaml#/definitions/uint32-array
>> +    minItems: 1
>> +    maxItems: 2
> You should require two values. I understand that any PHY is optional,
> thus you skip the entry, so how would you provide value for PHY1 only?
`nuvoton,rcalcode` will be changed to require exactly two values
(`minItems: 2, maxItems: 2`), one for PHY0 and one for PHY1 respectively.
The property will remain optional overall; when absent, each port retains
its power-on default value loaded at hardware initialisation. When present,
both entries must be supplied.
>> +    items:
>> +      minimum: 0
>> +      maximum: 15
>> +    description:
>> +      Resistor calibration trim codes for PHY0 and PHY1 respectively.
>> +      Each 4-bit value is written to the RCALCODE field in USBPMISCR and
>> +      adjusts the PHY's internal termination resistance. Both entries are
>> +      optional; when absent the hardware reset default is used.
>>   
>> -  nuvoton,sys:
>> -    $ref: /schemas/types.yaml#/definitions/phandle
>> +  nuvoton,oc-active-high:
>> +    type: boolean
>>       description:
>> -      phandle to syscon for checking the PHY clock status.
>> +      When present, the over-current detect input from the VBUS power switch
>> +      is treated as active-high. The default (property absent) is active-low.
>> +      This setting is shared by both USB host ports.
>>   
>>   required:
>>     - compatible
>> +  - reg
> That's ABI break which was not explained in the commit msg - neither
> specifying impact nor actually providing reasons why you break ABI.
>
> And honestly, you have no resources here except the address, so now it
> is clear that this should be folded into parent. See DTS101 talk slides.
The commit message will be updated to explicitly acknowledge the ABI break:
existing DTS files that contain a standalone `usb-phy` node without a `reg`
property will fail dt-schema validation after this change. The impact is
limited to the MA35D1 SoC; no upstream DTS for this SoC existed before this
patch series, so no in-tree board files are broken. The break is intentional
and justified: the PHY register block is physically contained within the
syscon MMIO range, and modelling it as a child of the syscon with a standard
`reg` property correctly reflects the hardware topology and follows the
convention established by similar sub-blocks in other SoCs.
>>     - "#phy-cells"
>> -  - clocks
>> -  - nuvoton,sys
>>   
>>   additionalProperties: false
>>   
>>   examples:
>>     - |
>> -    #include <dt-bindings/clock/nuvoton,ma35d1-clk.h>
>> +    system-management@40460000 {
>> +        compatible = "nuvoton,ma35d1-reset", "syscon", "simple-mfd";
>> +        reg = <0x40460000 0x200>;
>> +        #reset-cells = <1>;
>> +        #address-cells = <1>;
>> +        #size-cells = <1>;
> Drop. Keep only child node and make parent binding example complete.
The example in `nuvoton,ma35d1-usb2-phy.yaml` will be changed to show only
the `usb-phy@60` child node, without wrapping it in the parent node.
The full parent + child example will be moved to
`nuvoton,ma35d1-reset.yaml`.

Thanks for the review.
>>   
>> -    usb_phy: usb-phy {
>> -        compatible = "nuvoton,ma35d1-usb2-phy";
>> -        clocks = <&clk USBD_GATE>;
>> -        nuvoton,sys = <&sys>;
>> -        #phy-cells = <0>;
>> +        usb-phy@60 {
>> +            compatible = "nuvoton,ma35d1-usb2-phy";
>> +            reg = <0x60 0x14>;
>> +            #phy-cells = <1>;
>> +        };
>>       };
>> -- 
>> 2.43.0
>>


^ permalink raw reply

* Re: [PATCH v3 2/3] clk: samsung: exynos990: Add PERIS TMU_SUB_PCLK gate
From: Peter Griffin @ 2026-06-29 10:44 UTC (permalink / raw)
  To: Denzeel Oliva
  Cc: Krzysztof Kozlowski, Sylwester Nawrocki, Chanwoo Choi,
	Alim Akhtar, Michael Turquette, Stephen Boyd, Brian Masney,
	Rob Herring, Conor Dooley, linux-samsung-soc, linux-clk,
	devicetree, linux-arm-kernel, linux-kernel
In-Reply-To: <20260613-exynos990-peris-fix-v3-v3-2-2b230db78ae4@gmail.com>

On Sat, 13 Jun 2026 at 13:36, Denzeel Oliva <wachiturroxd150@gmail.com> wrote:
>
> Add the missing CLK_GOUT_PERIS_TMU_SUB_PCLK gate clock for the Thermal
> Management Unit sub-block and update CLKS_NR_PERIS accordingly.
>
> Signed-off-by: Denzeel Oliva <wachiturroxd150@gmail.com>
> ---

Reviewed-by: Peter Griffin <peter.griffin@linaro.org>

>  drivers/clk/samsung/clk-exynos990.c | 6 +++++-
>  1 file changed, 5 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/clk/samsung/clk-exynos990.c b/drivers/clk/samsung/clk-exynos990.c
> index 4385c3b76dd6..ee3566b8e57c 100644
> --- a/drivers/clk/samsung/clk-exynos990.c
> +++ b/drivers/clk/samsung/clk-exynos990.c
> @@ -21,7 +21,7 @@
>  #define CLKS_NR_HSI0 (CLK_GOUT_HSI0_LHS_ACEL_D_HSI0_CLK + 1)
>  #define CLKS_NR_PERIC0 (CLK_GOUT_PERIC0_SYSREG_PCLK + 1)
>  #define CLKS_NR_PERIC1 (CLK_GOUT_PERIC1_XIU_P_ACLK + 1)
> -#define CLKS_NR_PERIS (CLK_GOUT_PERIS_OTP_CON_TOP_OSCCLK + 1)
> +#define CLKS_NR_PERIS (CLK_GOUT_PERIS_TMU_SUB_PCLK + 1)
>
>  /* ---- CMU_TOP ------------------------------------------------------------- */
>
> @@ -2619,6 +2619,10 @@ static const struct samsung_gate_clock peris_gate_clks[] __initconst = {
>              "gout_peris_d_tzpc_peris_pclk", "mout_peris_bus_user",
>              CLK_CON_GAT_GOUT_BLK_PERIS_UID_D_TZPC_PERIS_IPCLKPORT_PCLK,
>              21, 0, 0),
> +       GATE(CLK_GOUT_PERIS_TMU_SUB_PCLK,
> +            "gout_peris_tmu_sub_pclk", "mout_peris_bus_user",
> +            CLK_CON_GAT_GOUT_BLK_PERIS_UID_TMU_SUB_IPCLKPORT_PCLK,
> +            21, 0, 0),
>         GATE(CLK_GOUT_PERIS_TMU_TOP_PCLK,
>              "gout_peris_tmu_top_pclk", "mout_peris_clk_peris_gic",
>              CLK_CON_GAT_GOUT_BLK_PERIS_UID_TMU_TOP_IPCLKPORT_PCLK,
>
> --
> 2.54.0
>


^ permalink raw reply

* Re: [PATCH v14 0/7] Provide support for Trigger Generation Unit
From: Suzuki K Poulose @ 2026-06-29 10:44 UTC (permalink / raw)
  To: Songwei.Chai, Greg KH
  Cc: andersson, alexander.shishkin, mike.leach, konrad.dybcio,
	james.clark, krzk+dt, conor+dt, linux-kernel, linux-arm-kernel,
	linux-arm-msm, coresight, devicetree
In-Reply-To: <c09d70e1-edd6-41a8-8ab3-db353bb6f8eb@oss.qualcomm.com>

Hello,

On 29/06/2026 11:17, Songwei.Chai wrote:
> 
> On 6/29/2026 12:22 PM, Greg KH wrote:
>> On Mon, Jun 29, 2026 at 11:03:33AM +0800, Songwei.Chai wrote:
>>> Hi Greg & Alexander,
>>>
>>> Apologies for interrupting again.
>>>
>>> As the TGU hardware plays an important role in Qualcomm tracing 
>>> design, I
>>> would greatly appreciate it if you could kindly take some time to review
>>> this at your earliest convenience.
>> The merge window _just_ closed, please give us a chance to catch up.
>>
>> Also, why us?  Surely you have other reviewers for this code, right?
> 
> Hi Greg,
> 
> Understood, thanks for letting us know.
> 
> Regarding your question: since this introduces a new drivers/hwtracing/ 
> qcom directory, there is no existing maintainer for it.
> Given your scope (and Alexander's), we believe you are the most relevant 
> reviewers.
> 
> The reason for creating the qcom directory is as follows:
> 
> /We previously tried to upstream this driver under drivers/hwtracing/ 
> coresight,/
> /but it was not accepted as it is considered Qualcomm-specific and not 
> tightly/
> /coupled with the CoreSight subsystem. Based on this feedback, we are 

Some clarification here: this device is not a CoreSight device, so we
declined to keep it under drivers/hwtracing/coresight/ - not because it is
Qualcomm-specific. For example, we have TPDM, TPDA and TnoC devices under
the CoreSight subsystem, which are all Qualcomm-specific.

That said, there are other drivers in drivers/hwtracing/ which I usually
merge and push to Greg, after some reviews/acks from the respective
people (e.g., PTT HiSilicon PCIe Tune and Trace).

But your proposal was that there were other maintainers for your new
subtree and that you were going to push this via linux-arm-msm (?), to
which I didn't have any objections.

That said, I am fine with pushing this to Greg via the CoreSight pull
requests (similar to Hisilicon PTT driver), but would need someone to
Maintain/Review the driver (with entries in MAINTAINERS, similar to
PTT).

Thoughts ?

Kind regards
Suzuki

> exploring/
> /a dedicated drivers/hwtracing/qcom directory, similar to intel_th, to 
> better/
> /support this and future Qualcomm hwtracing drivers./
> 
> More details can be found in “[PATCH v14 0/7] -- Why we are proposing 
> this”.
> 
> Thanks,
> Songwei
> 
>>
>> thanks,
>>
>> greg k-h

^ permalink raw reply

* Re: [PATCH v4 0/2] arm64: errata: NVIDIA Olympus device store/load ordering
From: Vladimir Murzin @ 2026-06-29 10:45 UTC (permalink / raw)
  To: Shanker Donthineni, Catalin Marinas, Will Deacon
  Cc: Jason Gunthorpe, linux-arm-kernel, Mark Rutland, linux-kernel,
	linux-doc, Vikram Sethi, Jason Sequeira
In-Reply-To: <20260625182425.3194066-1-sdonthineni@nvidia.com>

Hi,

On 6/25/26 19:24, Shanker Donthineni wrote:
> This series works around the NVIDIA Olympus device store/load ordering
> erratum (T410-OLY-1027): a Device-nGnR* load can be observed by a
> peripheral before an older, non-overlapping Device-nGnR* store to the
> same peripheral, breaking the program order that drivers rely on for
> MMIO and potentially leaving a device in an incorrect state.
> 
> Patch 1 adds the workaround. It promotes the raw MMIO store helpers
> (__raw_writeb/w/l/q, and therefore writel()/writel_relaxed()) to
> store-release on affected CPUs, and promotes the trailing DGH of the
> write-combining __iowrite{32,64}_copy() helpers to dmb osh. Everything is
> gated on a new ARM64_WORKAROUND_DEVICE_STORE_RELEASE cpucap and patched
> in only on affected parts, so it is a no-op elsewhere.
> 
> Patch 2 provides arm64 memset_io()/memcpy_toio(). The generic versions
> are built on __raw_write*(), so patch 1 would promote every store in a
> block to a store-release; as each STLR drains the write-combining buffer,
> block MMIO becomes O(n) store-releases. The arm64 versions emit plain
> STR in the loop and order the whole block with a single trailing dmb osh,
> keeping block MMIO at one-barrier cost.
> 
> Performance: NVIDIA Olympus, write-combining MMIO to a device BAR, single
> PE pinned; per-call cost in ns. Consecutive writes ping-pong between two
> buffers so repeated stores are not coalesced. iowrite64/iowrite32 =
> __iowrite{64,32}_copy().
> 
> Table 1 - workaround off (CONFIG_NVIDIA_OLYMPUS_1027_ERRATUM=n)
> +-------+-----------+-----------+-----------+-------------+
> |  size | iowrite64 | iowrite32 | memset_io | memcpy_toio |
> +-------+-----------+-----------+-----------+-------------+
> |    8B |   67.9 ns |   67.8 ns |    3.6 ns |    3.6 ns   |
> |   16B |   67.9 ns |   67.8 ns |    4.0 ns |    4.0 ns   |
> |   32B |   67.9 ns |   67.9 ns |    4.6 ns |    4.6 ns   |
> |   64B |   69.1 ns |   69.1 ns |   69.1 ns |   69.0 ns   |
> |  128B |  138.3 ns |  138.3 ns |  138.4 ns |  138.3 ns   |
> |  256B |  276.6 ns |  276.6 ns |  276.6 ns |  276.7 ns   |
> |  512B |  276.6 ns |  276.5 ns |  276.6 ns |  276.6 ns   |
> |   1KB |  276.6 ns |  278.4 ns |  276.6 ns |  276.6 ns   |
> |   2KB |  278.4 ns |  278.4 ns |  275.9 ns |  276.6 ns   |
> |   4KB |  365.7 ns |  365.7 ns |  365.7 ns |  365.7 ns   |
> +-------+-----------+-----------+-----------+-------------+
> relaxed/no-flush: memset_io()/memcpy_toio() issue plain stores with no
> trailing dgh() or barrier, unlike __iowrite*_copy() which ends with dgh().
> 
> Table 2 - workaround on, arm64 memset_io/memcpy_toio (this series)
> +-------+-----------+-----------+-----------+-------------+
> |  size | iowrite64 | iowrite32 | memset_io | memcpy_toio |
> +-------+-----------+-----------+-----------+-------------+
> |    8B |  231.6 ns |  231.6 ns |  232.4 ns |  232.4 ns   |
> |   16B |  231.7 ns |  231.9 ns |  232.7 ns |  232.6 ns   |
> |   32B |  231.9 ns |  232.7 ns |  232.9 ns |  232.9 ns   |
> |   64B |  232.7 ns |  235.0 ns |  233.7 ns |  233.6 ns   |
> |  128B |  233.6 ns |  235.8 ns |  234.4 ns |  234.3 ns   |
> |  256B |  237.7 ns |  276.8 ns |  264.0 ns |  276.7 ns   |
> |  512B |  237.7 ns |  277.1 ns |  238.1 ns |  277.6 ns   |
> |   1KB |  253.7 ns |  279.3 ns |  276.1 ns |  294.1 ns   |
> |   2KB |  295.0 ns |  318.7 ns |  288.5 ns |  308.3 ns   |
> |   4KB |  365.9 ns |  381.4 ns |  365.7 ns |  381.3 ns   |
> +-------+-----------+-----------+-----------+-------------+
> all four helpers end with a single trailing barrier (dmb osh).
> 
> Table 3 - workaround on, generic per-store memset_io/memcpy_toio
> +-------+-----------+-----------+-------------+--------------+
> |  size | iowrite64 | iowrite32 |   memset_io |  memcpy_toio |
> +-------+-----------+-----------+-------------+--------------+
> |    8B |  231.6 ns |  231.6 ns |    229.0 ns |    229.0 ns  |
> |   16B |  231.7 ns |  231.9 ns |    458.4 ns |    458.5 ns  |
> |   32B |  231.9 ns |  232.7 ns |    917.4 ns |    917.5 ns  |
> |   64B |  232.7 ns |  234.8 ns |   1835.4 ns |   1835.5 ns  |
> |  128B |  233.6 ns |  235.8 ns |   3670.9 ns |   3670.8 ns  |
> |  256B |  237.7 ns |  276.7 ns |   7341.6 ns |   7341.6 ns  |
> |  512B |  237.7 ns |  279.4 ns |  14001.4 ns |  14001.3 ns  |
> |   1KB |  253.7 ns |  279.1 ns |  28631.5 ns |  28631.8 ns  |
> |   2KB |  279.4 ns |  317.9 ns |  57276.3 ns |  57275.2 ns  |
> |   4KB |  365.7 ns |  381.5 ns | 114564.4 ns | 114563.6 ns  |
> +-------+-----------+-----------+-------------+--------------+
> the generic memset_io()/memcpy_toio() build on __raw_write*(), which the
> workaround promotes to store-release, so every store is individually
> ordered - hence O(n) in the store count.
> 
> Tables 2 and 3 show why patch 2 is needed: the generic per-store block
> writers collapse to O(n) under the workaround (4KB ~314x slower, ~115 us
> vs ~366 ns), while the arm64 versions stay flat at one-barrier cost.

That's interesting. With the way the patch set is structured, it
now looks like:

1. Fix the erratum, but cause a performance regression.
2. Fix the performance regression and (re)apply the erratum
   workaround.

Would it make sense to avoid introducing the performance
regression in the first place by structuring the patch set
slightly differently?

1. (Re)introduce arm64 memset_io()/memcpy_toio().
2. Fix the erratum once and for all.

What do you reckon?

Cheers
Vladimir



^ permalink raw reply

page: next (older) | prev (newer) | latest
- recent:[subjects (threaded)|topics (new)|topics (active)]

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox