* [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section
@ 2026-05-14 8:26 peter.wang
2026-05-14 16:22 ` Bart Van Assche
0 siblings, 1 reply; 4+ messages in thread
From: peter.wang @ 2026-05-14 8:26 UTC (permalink / raw)
To: linux-scsi, martin.petersen, avri.altman, alim.akhtar, jejb
Cc: wsd_upstream, linux-mediatek, peter.wang, chun-hung.wu,
alice.chao, cc.chou, chaotian.jing, tun-yu.yu, eddie.huang,
naomi.chu, ed.tsai, bvanassche, quic_cang, quic_asutoshd,
light.hsieh
From: Peter Wang <peter.wang@mediatek.com>
Currently, ufshcd_mcq_process_cqe() is called while holding the CQ
spinlock, which can lead to unnecessary lock contention since CQE
processing may involve time-consuming operations like completing I/O
requests and invoking callbacks.
Refactor the CQE processing flow to separate the lock-protected queue
head/tail slot updates from the actual CQE processing:
1. Add a new 'cqe_last_addr' field to 'ufs_hw_queue' to cache the
address of the last CQE entry, precomputed during memory allocation
in ufshcd_mcq_memory_alloc(). This avoids repeated recalculation
during the hot path.
2. Introduce ufshcd_mcq_inc_cqe_addr() helper in ufshcd-priv.h to
increment a CQE pointer with wraparound, using 'cqe_last_addr' for
boundary checking.
3. Refactor ufshcd_mcq_process_cqe() to accept a 'struct cq_entry *'
directly instead of deriving it from the hardware queue, decoupling
it from queue state.
4. In both ufshcd_mcq_compl_all_cqes_lock() and
ufshcd_mcq_poll_cqe_lock(), snapshot the starting CQE pointer before
advancing the head slot under the spinlock, then process the collected
CQEs after releasing the lock using the new helper.
This reduces the time spent holding the CQ spinlock to only the
minimal queue slot management operations, improving concurrency and
reducing latency under heavy I/O workloads.
Signed-off-by: Peter Wang <peter.wang@mediatek.com>
---
drivers/ufs/core/ufs-mcq.c | 23 ++++++++++++++++++-----
drivers/ufs/core/ufshcd-priv.h | 20 ++++++++++++++++++++
include/ufs/ufshcd.h | 1 +
3 files changed, 39 insertions(+), 5 deletions(-)
diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c
index c1b1d67a1ddc..74a6595f9bda 100644
--- a/drivers/ufs/core/ufs-mcq.c
+++ b/drivers/ufs/core/ufs-mcq.c
@@ -248,6 +248,7 @@ int ufshcd_mcq_memory_alloc(struct ufs_hba *hba)
dev_err(hba->dev, "CQE allocation failed\n");
return -ENOMEM;
}
+ hwq->cqe_last_addr = hwq->cqe_base_addr + hwq->max_entries - 1;
}
return 0;
@@ -307,10 +308,8 @@ static int ufshcd_mcq_get_tag(struct ufs_hba *hba, struct cq_entry *cqe)
}
static void ufshcd_mcq_process_cqe(struct ufs_hba *hba,
- struct ufs_hw_queue *hwq)
+ struct cq_entry *cqe)
{
- struct cq_entry *cqe = ufshcd_mcq_cur_cqe(hwq);
-
if (cqe->command_desc_base_addr) {
int tag = ufshcd_mcq_get_tag(hba, cqe);
@@ -335,10 +334,12 @@ void ufshcd_mcq_compl_all_cqes_lock(struct ufs_hba *hba,
{
unsigned long flags;
u32 entries = hwq->max_entries;
+ struct cq_entry *cqe;
+ int i;
spin_lock_irqsave(&hwq->cq_lock, flags);
+ cqe = ufshcd_mcq_cur_cqe(hwq);
while (entries > 0) {
- ufshcd_mcq_process_cqe(hba, hwq);
ufshcd_mcq_inc_cq_head_slot(hwq);
entries--;
}
@@ -346,6 +347,11 @@ void ufshcd_mcq_compl_all_cqes_lock(struct ufs_hba *hba,
ufshcd_mcq_update_cq_tail_slot(hwq);
hwq->cq_head_slot = hwq->cq_tail_slot;
spin_unlock_irqrestore(&hwq->cq_lock, flags);
+
+ for (i = 0; i < hwq->max_entries; i++) {
+ ufshcd_mcq_process_cqe(hba, cqe);
+ cqe = ufshcd_mcq_inc_cqe_addr(hwq, cqe);
+ }
}
unsigned long ufshcd_mcq_poll_cqe_lock(struct ufs_hba *hba,
@@ -353,11 +359,13 @@ unsigned long ufshcd_mcq_poll_cqe_lock(struct ufs_hba *hba,
{
unsigned long completed_reqs = 0;
unsigned long flags;
+ struct cq_entry *cqe;
+ int i;
spin_lock_irqsave(&hwq->cq_lock, flags);
+ cqe = ufshcd_mcq_cur_cqe(hwq);
ufshcd_mcq_update_cq_tail_slot(hwq);
while (!ufshcd_mcq_is_cq_empty(hwq)) {
- ufshcd_mcq_process_cqe(hba, hwq);
ufshcd_mcq_inc_cq_head_slot(hwq);
completed_reqs++;
}
@@ -366,6 +374,11 @@ unsigned long ufshcd_mcq_poll_cqe_lock(struct ufs_hba *hba,
ufshcd_mcq_update_cq_head(hwq);
spin_unlock_irqrestore(&hwq->cq_lock, flags);
+ for (i = 0; i < completed_reqs; i++) {
+ ufshcd_mcq_process_cqe(hba, cqe);
+ cqe = ufshcd_mcq_inc_cqe_addr(hwq, cqe);
+ }
+
return completed_reqs;
}
EXPORT_SYMBOL_GPL(ufshcd_mcq_poll_cqe_lock);
diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h
index 0a72148cb053..6d4d3e726a9a 100644
--- a/drivers/ufs/core/ufshcd-priv.h
+++ b/drivers/ufs/core/ufshcd-priv.h
@@ -440,6 +440,26 @@ static inline struct scsi_cmnd *ufshcd_tag_to_cmd(struct ufs_hba *hba, u32 tag)
return blk_mq_rq_to_pdu(rq);
}
+/**
+ * ufshcd_mcq_inc_cqe_addr - increment CQE pointer with wraparound
+ * @q: pointer to the hardware queue
+ * @cqe: current CQE pointer to increment
+ *
+ * Increments the CQE pointer to the next entry. If the pointer
+ * exceeds the last entry, it wraps around to the base address.
+ *
+ * Returns: pointer to the next cq_entry
+ */
+static inline struct cq_entry *ufshcd_mcq_inc_cqe_addr(struct ufs_hw_queue *q,
+ struct cq_entry *cqe)
+{
+ cqe++;
+ if (cqe > q->cqe_last_addr)
+ cqe = q->cqe_base_addr;
+
+ return cqe;
+}
+
static inline void ufshcd_inc_sq_tail(struct ufs_hw_queue *q)
__must_hold(&q->sq_lock)
{
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index cfbc75d8df83..1becb38e215e 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1291,6 +1291,7 @@ struct ufs_hw_queue {
struct utp_transfer_req_desc *sqe_base_addr;
dma_addr_t sqe_dma_addr;
struct cq_entry *cqe_base_addr;
+ struct cq_entry *cqe_last_addr;
dma_addr_t cqe_dma_addr;
u32 max_entries;
u32 id;
--
2.45.2
^ permalink raw reply related [flat|nested] 4+ messages in thread
* Re: [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section
2026-05-14 8:26 [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section peter.wang
@ 2026-05-14 16:22 ` Bart Van Assche
2026-05-15 8:13 ` Peter Wang (王信友)
0 siblings, 1 reply; 4+ messages in thread
From: Bart Van Assche @ 2026-05-14 16:22 UTC (permalink / raw)
To: peter.wang, linux-scsi, martin.petersen, avri.altman, alim.akhtar,
jejb
Cc: wsd_upstream, linux-mediatek, chun-hung.wu, alice.chao, cc.chou,
chaotian.jing, tun-yu.yu, eddie.huang, naomi.chu, ed.tsai,
quic_cang, quic_asutoshd, light.hsieh
On 5/14/26 1:26 AM, peter.wang@mediatek.com wrote:
> 4. In both ufshcd_mcq_compl_all_cqes_lock() and
> ufshcd_mcq_poll_cqe_lock(), snapshot the starting CQE pointer before
> advancing the head slot under the spinlock, then process the collected
> CQEs after releasing the lock using the new helper.
This can't work reliably. ufshcd_mcq_poll_cqe_lock() may be called
concurrently from different CPU cores, e.g. from a UFS completion
interrupt and from ufshcd_poll(). Processing CQEs without holding
hwq->cq_lock may lead to overwriting of CQEs before these have been
processed.
Thanks,
Bart.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section
2026-05-14 16:22 ` Bart Van Assche
@ 2026-05-15 8:13 ` Peter Wang (王信友)
2026-05-15 16:43 ` Bart Van Assche
0 siblings, 1 reply; 4+ messages in thread
From: Peter Wang (王信友) @ 2026-05-15 8:13 UTC (permalink / raw)
To: linux-scsi@vger.kernel.org, jejb@linux.ibm.com,
avri.altman@sandisk.com, bvanassche@acm.org,
alim.akhtar@samsung.com, martin.petersen@oracle.com
Cc: quic_asutoshd@quicinc.com, Alice Chao (趙珮均),
Eddie Huang (黃智傑),
CC Chou (周志杰),
Ed Tsai (蔡宗軒), wsd_upstream,
Chaotian Jing (井朝天),
Chun-Hung Wu (巫駿宏),
Naomi Chu (朱詠田),
linux-mediatek@lists.infradead.org,
Tun-yu Yu (游敦聿), quic_cang@quicinc.com,
Light Hsieh (謝明燈)
On Thu, 2026-05-14 at 09:22 -0700, Bart Van Assche wrote:
> On 5/14/26 1:26 AM, peter.wang@mediatek.com wrote:
> > 4. In both ufshcd_mcq_compl_all_cqes_lock() and
> > ufshcd_mcq_poll_cqe_lock(), snapshot the starting CQE pointer
> > before
> > advancing the head slot under the spinlock, then process the
> > collected
> > CQEs after releasing the lock using the new helper.
>
> This can't work reliably. ufshcd_mcq_poll_cqe_lock() may be called
> concurrently from different CPU cores, e.g. from a UFS completion
> interrupt and from ufshcd_poll(). Processing CQEs without holding
> hwq->cq_lock may lead to overwriting of CQEs before these have been
> processed.
>
> Thanks,
>
> Bart.
Hi Bart,
This is not an issue because the CQ head is protected by cq_lock.
Only the CQEs from head to tail will be processed by ufshcd_poll
or the ISR. The main difference is that these CQEs will be
processed later, without holding the cq_lock.
Thanks
Peter
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section
2026-05-15 8:13 ` Peter Wang (王信友)
@ 2026-05-15 16:43 ` Bart Van Assche
0 siblings, 0 replies; 4+ messages in thread
From: Bart Van Assche @ 2026-05-15 16:43 UTC (permalink / raw)
To: Peter Wang (王信友), linux-scsi@vger.kernel.org,
martin.petersen@oracle.com
Cc: quic_asutoshd@quicinc.com
On 5/15/26 1:13 AM, Peter Wang (王信友) wrote:
> This is not an issue because the CQ head is protected by cq_lock.
> Only the CQEs from head to tail will be processed by ufshcd_poll
> or the ISR. The main difference is that these CQEs will be
> processed later, without holding the cq_lock.
Hi Peter,
Do you agree that the following can happen with this patch applied
(assuming there is space for 9 CQEs on completion queues)?
(1) Host allocates tags 0, 1, 2 and 3 and adds the corresponding SQEs to
a submission queue.
(2) ufshcd_mcq_poll_cqe_lock() is called from thread context because the
host is polling for completions. The CQ tail is updated but CQE
processing is delayed, e.g. because the process scheduler triggered
a context switch to another thread.
(3) The host allocates tags 4, 5, 6 and 7 and sends the corresponding
commands to the same submission queue.
(4) ufshcd_mcq_poll_cqe_lock() is called because a completion interrupt
has been generated and processes completions for tags 4, 5, 6 and 7.
The CQ tail is updated and the CQEs are processed.
(5) The host reallocates tags 4, 5, 6 and 7 and writes the corresponding
SQEs to the tail of the submission queue.
(6) The host controller completes the corresponding commands and stores
the CQEs in CQ slots 8, 0, 1 and 2. Hence, slots 0, 1 and 2 are
overwritten although the overwritten CQEs have not yet been
processed.
(7) The polling code from (2) continues and completes the CQEs in slots
0, 1, 2 and 3. This causes three of the four of the commands from
(6) to be reported as completed to the block layer although these
have not yet been completed. This will likely trigger data
corruption.
Thanks,
Bart.
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2026-05-15 16:43 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-14 8:26 [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section peter.wang
2026-05-14 16:22 ` Bart Van Assche
2026-05-15 8:13 ` Peter Wang (王信友)
2026-05-15 16:43 ` Bart Van Assche
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox