* [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section
@ 2026-05-14 8:26 peter.wang
2026-05-14 16:22 ` Bart Van Assche
0 siblings, 1 reply; 7+ messages in thread
From: peter.wang @ 2026-05-14 8:26 UTC (permalink / raw)
To: linux-scsi, martin.petersen, avri.altman, alim.akhtar, jejb
Cc: wsd_upstream, linux-mediatek, peter.wang, chun-hung.wu,
alice.chao, cc.chou, chaotian.jing, tun-yu.yu, eddie.huang,
naomi.chu, ed.tsai, bvanassche, quic_cang, quic_asutoshd,
light.hsieh
From: Peter Wang <peter.wang@mediatek.com>
Currently, ufshcd_mcq_process_cqe() is called while holding the CQ
spinlock, which can lead to unnecessary lock contention since CQE
processing may involve time-consuming operations like completing I/O
requests and invoking callbacks.
Refactor the CQE processing flow to separate the lock-protected queue
head/tail slot updates from the actual CQE processing:
1. Add a new 'cqe_last_addr' field to 'ufs_hw_queue' to cache the
address of the last CQE entry, precomputed during memory allocation
in ufshcd_mcq_memory_alloc(). This avoids repeated recalculation
during the hot path.
2. Introduce ufshcd_mcq_inc_cqe_addr() helper in ufshcd-priv.h to
increment a CQE pointer with wraparound, using 'cqe_last_addr' for
boundary checking.
3. Refactor ufshcd_mcq_process_cqe() to accept a 'struct cq_entry *'
directly instead of deriving it from the hardware queue, decoupling
it from queue state.
4. In both ufshcd_mcq_compl_all_cqes_lock() and
ufshcd_mcq_poll_cqe_lock(), snapshot the starting CQE pointer before
advancing the head slot under the spinlock, then process the collected
CQEs after releasing the lock using the new helper.
This reduces the time spent holding the CQ spinlock to only the
minimal queue slot management operations, improving concurrency and
reducing latency under heavy I/O workloads.
Signed-off-by: Peter Wang <peter.wang@mediatek.com>
---
drivers/ufs/core/ufs-mcq.c | 23 ++++++++++++++++++-----
drivers/ufs/core/ufshcd-priv.h | 20 ++++++++++++++++++++
include/ufs/ufshcd.h | 1 +
3 files changed, 39 insertions(+), 5 deletions(-)
diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c
index c1b1d67a1ddc..74a6595f9bda 100644
--- a/drivers/ufs/core/ufs-mcq.c
+++ b/drivers/ufs/core/ufs-mcq.c
@@ -248,6 +248,7 @@ int ufshcd_mcq_memory_alloc(struct ufs_hba *hba)
dev_err(hba->dev, "CQE allocation failed\n");
return -ENOMEM;
}
+ hwq->cqe_last_addr = hwq->cqe_base_addr + hwq->max_entries - 1;
}
return 0;
@@ -307,10 +308,8 @@ static int ufshcd_mcq_get_tag(struct ufs_hba *hba, struct cq_entry *cqe)
}
static void ufshcd_mcq_process_cqe(struct ufs_hba *hba,
- struct ufs_hw_queue *hwq)
+ struct cq_entry *cqe)
{
- struct cq_entry *cqe = ufshcd_mcq_cur_cqe(hwq);
-
if (cqe->command_desc_base_addr) {
int tag = ufshcd_mcq_get_tag(hba, cqe);
@@ -335,10 +334,12 @@ void ufshcd_mcq_compl_all_cqes_lock(struct ufs_hba *hba,
{
unsigned long flags;
u32 entries = hwq->max_entries;
+ struct cq_entry *cqe;
+ int i;
spin_lock_irqsave(&hwq->cq_lock, flags);
+ cqe = ufshcd_mcq_cur_cqe(hwq);
while (entries > 0) {
- ufshcd_mcq_process_cqe(hba, hwq);
ufshcd_mcq_inc_cq_head_slot(hwq);
entries--;
}
@@ -346,6 +347,11 @@ void ufshcd_mcq_compl_all_cqes_lock(struct ufs_hba *hba,
ufshcd_mcq_update_cq_tail_slot(hwq);
hwq->cq_head_slot = hwq->cq_tail_slot;
spin_unlock_irqrestore(&hwq->cq_lock, flags);
+
+ for (i = 0; i < hwq->max_entries; i++) {
+ ufshcd_mcq_process_cqe(hba, cqe);
+ cqe = ufshcd_mcq_inc_cqe_addr(hwq, cqe);
+ }
}
unsigned long ufshcd_mcq_poll_cqe_lock(struct ufs_hba *hba,
@@ -353,11 +359,13 @@ unsigned long ufshcd_mcq_poll_cqe_lock(struct ufs_hba *hba,
{
unsigned long completed_reqs = 0;
unsigned long flags;
+ struct cq_entry *cqe;
+ int i;
spin_lock_irqsave(&hwq->cq_lock, flags);
+ cqe = ufshcd_mcq_cur_cqe(hwq);
ufshcd_mcq_update_cq_tail_slot(hwq);
while (!ufshcd_mcq_is_cq_empty(hwq)) {
- ufshcd_mcq_process_cqe(hba, hwq);
ufshcd_mcq_inc_cq_head_slot(hwq);
completed_reqs++;
}
@@ -366,6 +374,11 @@ unsigned long ufshcd_mcq_poll_cqe_lock(struct ufs_hba *hba,
ufshcd_mcq_update_cq_head(hwq);
spin_unlock_irqrestore(&hwq->cq_lock, flags);
+ for (i = 0; i < completed_reqs; i++) {
+ ufshcd_mcq_process_cqe(hba, cqe);
+ cqe = ufshcd_mcq_inc_cqe_addr(hwq, cqe);
+ }
+
return completed_reqs;
}
EXPORT_SYMBOL_GPL(ufshcd_mcq_poll_cqe_lock);
diff --git a/drivers/ufs/core/ufshcd-priv.h b/drivers/ufs/core/ufshcd-priv.h
index 0a72148cb053..6d4d3e726a9a 100644
--- a/drivers/ufs/core/ufshcd-priv.h
+++ b/drivers/ufs/core/ufshcd-priv.h
@@ -440,6 +440,26 @@ static inline struct scsi_cmnd *ufshcd_tag_to_cmd(struct ufs_hba *hba, u32 tag)
return blk_mq_rq_to_pdu(rq);
}
+/**
+ * ufshcd_mcq_inc_cqe_addr - increment CQE pointer with wraparound
+ * @hwq: pointer to the hardware queue
+ * @cqe: current CQE pointer to increment
+ *
+ * Increments the CQE pointer to the next entry. If the pointer
+ * exceeds the last entry, it wraps around to the base address.
+ *
+ * Returns: pointer to the next cq_entry
+ */
+static inline struct cq_entry *ufshcd_mcq_inc_cqe_addr(struct ufs_hw_queue *q,
+ struct cq_entry *cqe)
+{
+ cqe++;
+ if (cqe > q->cqe_last_addr)
+ cqe = q->cqe_base_addr;
+
+ return cqe;
+}
+
static inline void ufshcd_inc_sq_tail(struct ufs_hw_queue *q)
__must_hold(&q->sq_lock)
{
diff --git a/include/ufs/ufshcd.h b/include/ufs/ufshcd.h
index cfbc75d8df83..1becb38e215e 100644
--- a/include/ufs/ufshcd.h
+++ b/include/ufs/ufshcd.h
@@ -1291,6 +1291,7 @@ struct ufs_hw_queue {
struct utp_transfer_req_desc *sqe_base_addr;
dma_addr_t sqe_dma_addr;
struct cq_entry *cqe_base_addr;
+ struct cq_entry *cqe_last_addr;
dma_addr_t cqe_dma_addr;
u32 max_entries;
u32 id;
--
2.45.2
^ permalink raw reply related [flat|nested] 7+ messages in thread* Re: [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section
2026-05-14 8:26 [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section peter.wang
@ 2026-05-14 16:22 ` Bart Van Assche
2026-05-15 8:13 ` Peter Wang (王信友)
0 siblings, 1 reply; 7+ messages in thread
From: Bart Van Assche @ 2026-05-14 16:22 UTC (permalink / raw)
To: peter.wang, linux-scsi, martin.petersen, avri.altman, alim.akhtar,
jejb
Cc: wsd_upstream, linux-mediatek, chun-hung.wu, alice.chao, cc.chou,
chaotian.jing, tun-yu.yu, eddie.huang, naomi.chu, ed.tsai,
quic_cang, quic_asutoshd, light.hsieh
On 5/14/26 1:26 AM, peter.wang@mediatek.com wrote:
> 4. In both ufshcd_mcq_compl_all_cqes_lock() and
> ufshcd_mcq_poll_cqe_lock(), snapshot the starting CQE pointer before
> advancing the head slot under the spinlock, then process the collected
> CQEs after releasing the lock using the new helper.
This can't work reliably. ufshcd_mcq_poll_cqe_lock() may be called
concurrently from different CPU cores, e.g. from a UFS completion
interrupt and from ufshcd_poll(). Processing CQEs without holding
hwq->cq_lock may lead to overwriting of CQEs before these have been
processed.
Thanks,
Bart.
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section
2026-05-14 16:22 ` Bart Van Assche
@ 2026-05-15 8:13 ` Peter Wang (王信友)
2026-05-15 16:43 ` Bart Van Assche
0 siblings, 1 reply; 7+ messages in thread
From: Peter Wang (王信友) @ 2026-05-15 8:13 UTC (permalink / raw)
To: linux-scsi@vger.kernel.org, jejb@linux.ibm.com,
avri.altman@sandisk.com, bvanassche@acm.org,
alim.akhtar@samsung.com, martin.petersen@oracle.com
Cc: quic_asutoshd@quicinc.com, Alice Chao (趙珮均),
Eddie Huang (黃智傑),
CC Chou (周志杰),
Ed Tsai (蔡宗軒), wsd_upstream,
Chaotian Jing (井朝天),
Chun-Hung Wu (巫駿宏),
Naomi Chu (朱詠田),
linux-mediatek@lists.infradead.org,
Tun-yu Yu (游敦聿), quic_cang@quicinc.com,
Light Hsieh (謝明燈)
On Thu, 2026-05-14 at 09:22 -0700, Bart Van Assche wrote:
> On 5/14/26 1:26 AM, peter.wang@mediatek.com wrote:
> > 4. In both ufshcd_mcq_compl_all_cqes_lock() and
> > ufshcd_mcq_poll_cqe_lock(), snapshot the starting CQE pointer
> > before
> > advancing the head slot under the spinlock, then process the
> > collected
> > CQEs after releasing the lock using the new helper.
>
> This can't work reliably. ufshcd_mcq_poll_cqe_lock() may be called
> concurrently from different CPU cores, e.g. from a UFS completion
> interrupt and from ufshcd_poll(). Processing CQEs without holding
> hwq->cq_lock may lead to overwriting of CQEs before these have been
> processed.
>
> Thanks,
>
> Bart.
Hi Bart,
This is not an issue because the CQ head is protected by cq_lock.
Only the CQEs from head to tail will be processed by ufshcd_poll
or the ISR. The main difference is that these CQEs will be
processed later, without holding the cq_lock.
Thanks
Peter
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section
2026-05-15 8:13 ` Peter Wang (王信友)
@ 2026-05-15 16:43 ` Bart Van Assche
2026-05-16 6:24 ` Peter Wang (王信友)
2026-05-16 11:52 ` Peter Wang (王信友)
0 siblings, 2 replies; 7+ messages in thread
From: Bart Van Assche @ 2026-05-15 16:43 UTC (permalink / raw)
To: Peter Wang (王信友), linux-scsi@vger.kernel.org,
martin.petersen@oracle.com
Cc: quic_asutoshd@quicinc.com
On 5/15/26 1:13 AM, Peter Wang (王信友) wrote:
> This is not an issue because the CQ head is protected by cq_lock.
> Only the CQEs from head to tail will be processed by ufshcd_poll
> or the ISR. The main difference is that these CQEs will be
> processed later, without holding the cq_lock.
Hi Peter,
Do you agree that the following can happen with this patch applied
(assuming there is space for 9 CQEs on completion queues)?
(1) Host allocates tags 0, 1, 2 and 3 and adds the corresponding SQEs to
a submission queue.
(2) ufshcd_mcq_poll_cqe_lock() is called from thread context because the
host is polling for completions. The CQ tail is updated but CQE
processing is delayed, e.g. because the process scheduler triggered
a context switch to another thread.
(3) The host allocates tags 4, 5, 6 and 7 and sends the corresponding
commands to the same submission queue.
(4) ufshcd_mcq_poll_cqe_lock() is called because a completion interrupt
has been generated and processes completions for tags 4, 5, 6 and 7.
The CQ tail is updated and the CQEs are processed.
(5) The host reallocates tags 4, 5, 6 and 7 and writes the corresponding
SQEs to the tail of the submission queue.
(6) The host controller completes the corresponding commands and stores
the CQEs in CQ slots 8, 0, 1 and 2. Hence, slots 0, 1 and 2 are
overwritten although the overwritten CQEs have not yet been
processed.
(7) The polling code from (2) continues and completes the CQEs in slots
0, 1, 2 and 3. This causes three of the four of the commands from
(6) to be reported as completed to the block layer although these
have not yet been completed. This will likely trigger data
corruption.
Thanks,
Bart.
^ permalink raw reply [flat|nested] 7+ messages in thread* Re: [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section
2026-05-15 16:43 ` Bart Van Assche
@ 2026-05-16 6:24 ` Peter Wang (王信友)
2026-05-16 14:01 ` Bart Van Assche
2026-05-16 11:52 ` Peter Wang (王信友)
1 sibling, 1 reply; 7+ messages in thread
From: Peter Wang (王信友) @ 2026-05-16 6:24 UTC (permalink / raw)
To: linux-scsi@vger.kernel.org, bvanassche@acm.org,
martin.petersen@oracle.com
Cc: quic_asutoshd@quicinc.com
Hi Bart,
This cannot happen. I think you've missed my point and
are confusing head with tail, and tag with slot.
On Fri, 2026-05-15 at 09:43 -0700, Bart Van Assche wrote:
> Hi Peter,
>
> Do you agree that the following can happen with this patch applied
> (assuming there is space for 9 CQEs on completion queues)?
>
> (1) Host allocates tags 0, 1, 2 and 3 and adds the corresponding SQEs
> to
> a submission queue.
>
Tags 0, 1, 2 and 3 in SQ slot 0, 1, 2 and 3
> (2) ufshcd_mcq_poll_cqe_lock() is called from thread context because
> the
> host is polling for completions. The CQ tail is updated but CQE
> processing is delayed, e.g. because the process scheduler
> triggered
> a context switch to another thread.
>
SW can only update CQ head, hence this should be
The CQ head is updated, but CQ slot 0, 1, 2 and 3 processing is delayed
CQ slot 0, 1, 2, 3 is tag 0, 1, 2, 3, CQ head = tail = 4
> (3) The host allocates tags 4, 5, 6 and 7 and sends the corresponding
> commands to the same submission queue.
>
Tags 4, 5, 6 and 7 is in SQ slot 4, 5, 6 and 7
> (4) ufshcd_mcq_poll_cqe_lock() is called because a completion
> interrupt
> has been generated and processes completions for tags 4, 5, 6
> and 7.
> The CQ tail is updated and the CQEs are processed.
Again, SW can only update CQ head, hence this should be
The CQ head is updated and the CQ slot 4, 5, 6, 7 are processed.
CQ slot 4, 5, 6, 7 is tag 4, 5, 6, 7, CQ head = tail = 8
> (5) The host reallocates tags 4, 5, 6 and 7 and writes the
> corresponding
> SQEs to the tail of the submission queue.
>
Here SQ slots 8, 9, 10 and 11 hold tags 4, 5, 6 and 7; the same applies
to the CQ once the HW finishes them.
> (6) The host controller completes the corresponding commands and
> stores
> the CQEs in CQ slots 8, 0, 1 and 2. Hence, slots 0, 1 and 2 are
> overwritten although the overwritten CQEs have not yet been
> processed.
>
CQ tail is 12 once the HW finishes, and CQ head is 8 after step 4.
Only CQ slots 8, 9, 10 and 11 are completed, which are tags 4, 5, 6 and 7.
> (7) The polling code from (2) continues and completes the CQEs in
> slots
> 0, 1, 2 and 3. This causes three of the four of the commands
> from
> (6) to be reported as completed to the block layer although
> these
> have not yet been completed. This will likely trigger data
> corruption.
The polling of tags 0, 1, 2 and 3 in CQ slots 0, 1, 2 and 3 completes
without error, and SQ tail = SQ head = CQ tail = CQ head = 12.
Thanks.
Peter
^ permalink raw reply [flat|nested] 7+ messages in thread
* Re: [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section
2026-05-16 6:24 ` Peter Wang (王信友)
@ 2026-05-16 14:01 ` Bart Van Assche
0 siblings, 0 replies; 7+ messages in thread
From: Bart Van Assche @ 2026-05-16 14:01 UTC (permalink / raw)
To: Peter Wang (王信友), linux-scsi@vger.kernel.org,
martin.petersen@oracle.com
Cc: quic_asutoshd@quicinc.com
On 5/15/26 11:24 PM, Peter Wang (王信友) wrote:
> This cannot happen. I think you've missed my point and
> are confusing head with tail, and tag with slot.
Yes, I swapped head and tail but that doesn't alter the conclusion that
your patch can cause data corruption and kernel crashes. Not only for
UFS but also for other storage controllers that use circular queues it
is essential that completions are processed in the order that these have
been pushed onto the completion queue by the storage controller.
Otherwise completion queue entries can get overwritten by the storage
controller before these have been processed.
Additionally, reducing how long hwq->cq_lock is held is not sufficient.
This patch does not alter how much time is spent inside the UFS
completion queue interrupt. According to my measurements more than 10 ms
can be spent inside that interrupt. That is way too much - this can
cause slowness of the user interface and audio glitches.
The patch below reduces the time spent in UFS completion interrupts from
10 ms to 100 microseconds (100x) on my test setup. This patch needs
further refinement but is sufficient to show the root cause and a
potential solution.
Bart.
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 1c47bec101f0..33ddb234eb3d 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -320,14 +320,23 @@ static void f2fs_read_end_io(struct bio *bio)
f2fs_verify_and_finish_bio(bio, intask);
}
-static void f2fs_write_end_io(struct bio *bio)
+struct f2fs_write_end_io_work {
+ struct work_struct work;
+ struct bio *bio;
+ bool needs_to_be_freed;
+};
+
+static void f2fs_write_end_io_work(struct work_struct *work)
{
- struct f2fs_sb_info *sbi;
+ struct f2fs_write_end_io_work *weiw =
+ container_of(work, typeof(*weiw), work);
+ struct bio *bio = weiw->bio;
+ struct f2fs_sb_info *sbi = bio->bi_private;
struct bio_vec *bvec;
struct bvec_iter_all iter_all;
- iostat_update_and_unbind_ctx(bio);
- sbi = bio->bi_private;
+ if (weiw->needs_to_be_freed)
+ kfree(weiw);
if (time_to_inject(sbi, FAULT_WRITE_IO))
bio->bi_status = BLK_STS_IOERR;
@@ -374,6 +383,25 @@ static void f2fs_write_end_io(struct bio *bio)
bio_put(bio);
}
+static void f2fs_write_end_io(struct bio *bio)
+{
+ struct f2fs_write_end_io_work *weiw;
+
+ iostat_update_and_unbind_ctx(bio);
+ weiw = kmalloc(sizeof(*weiw), GFP_ATOMIC | __GFP_NOWARN);
+ if (weiw) {
+ INIT_WORK(&weiw->work, f2fs_write_end_io_work);
+ weiw->bio = bio;
+ weiw->needs_to_be_freed = true;
+ queue_work(system_wq, &weiw->work);
+ } else {
+ struct f2fs_write_end_io_work weiw_on_stack = { .bio = bio };
+
+ WARN_ON_ONCE(true);
+ f2fs_write_end_io_work(&weiw_on_stack.work);
+ }
+}
+
#ifdef CONFIG_BLK_DEV_ZONED
static void f2fs_zone_write_end_io(struct bio *bio)
{
^ permalink raw reply related [flat|nested] 7+ messages in thread
* Re: [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section
2026-05-15 16:43 ` Bart Van Assche
2026-05-16 6:24 ` Peter Wang (王信友)
@ 2026-05-16 11:52 ` Peter Wang (王信友)
1 sibling, 0 replies; 7+ messages in thread
From: Peter Wang (王信友) @ 2026-05-16 11:52 UTC (permalink / raw)
To: linux-scsi@vger.kernel.org, bvanassche@acm.org,
martin.petersen@oracle.com
Cc: quic_asutoshd@quicinc.com
On Fri, 2026-05-15 at 09:43 -0700, Bart Van Assche wrote:
> (6) The host controller completes the corresponding commands and
> stores
> the CQEs in CQ slots 8, 0, 1 and 2. Hence, slots 0, 1 and 2 are
> overwritten although the overwritten CQEs have not yet been
> processed.
Sorry, I might have misunderstood your point.
Are you saying that the HWQ depth is only 9?
Thanks
Peter
^ permalink raw reply [flat|nested] 7+ messages in thread
end of thread, other threads:[~2026-05-16 14:01 UTC | newest]
Thread overview: 7+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2026-05-14 8:26 [PATCH v1] ufs: core: decouple CQE processing from spinlock critical section peter.wang
2026-05-14 16:22 ` Bart Van Assche
2026-05-15 8:13 ` Peter Wang (王信友)
2026-05-15 16:43 ` Bart Van Assche
2026-05-16 6:24 ` Peter Wang (王信友)
2026-05-16 14:01 ` Bart Van Assche
2026-05-16 11:52 ` Peter Wang (王信友)
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.