* [PATCH RFT 1/5] dmaengine: dw-edma: Add dw_edma_core_ll_cur_idx() to get completed link entry pos
2026-01-09 20:13 [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running Frank Li
@ 2026-01-09 20:13 ` Frank Li
2026-01-09 20:13 ` [PATCH RFT 2/5] dmaengine: dw-edma: Move dw_hdma_set_callback_result() up Frank Li
` (5 subsequent siblings)
6 siblings, 0 replies; 14+ messages in thread
From: Frank Li @ 2026-01-09 20:13 UTC (permalink / raw)
To: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Manivannan Sadhasivam,
Krzysztof Wilczyński, Kishon Vijay Abraham I, Bjorn Helgaas,
Christoph Hellwig, Niklas Cassel
Cc: dmaengine, linux-kernel, linux-hardening, linux-pci, linux-nvme,
Damien Le Moal, imx, Frank Li
Add dw_edma_core_ll_cur_idx() to get the position of the completed DMA link
entry, in preparation for supporting dynamically added DMA requests while
the DMA engine is running.
Signed-off-by: Frank Li <Frank.Li@nxp.com>
---
drivers/dma/dw-edma/dw-edma-core.h | 10 ++++++++++
drivers/dma/dw-edma/dw-edma-v0-core.c | 17 +++++++++++++++++
drivers/dma/dw-edma/dw-hdma-v0-core.c | 17 +++++++++++++++++
3 files changed, 44 insertions(+)
diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h
index 31039eb85079cbbd38a90d249091113ad646c6f9..d68c4592c6177e4fe2a2ae8a645bb065279ac45d 100644
--- a/drivers/dma/dw-edma/dw-edma-core.h
+++ b/drivers/dma/dw-edma/dw-edma-core.h
@@ -123,6 +123,7 @@ struct dw_edma_core_ops {
void (*ll_data)(struct dw_edma_chan *chan, struct dw_edma_burst *burst,
u32 idx, bool cb, bool irq);
void (*ll_link)(struct dw_edma_chan *chan, u32 idx, bool cb, u64 addr);
+ int (*ll_cur_idx)(struct dw_edma_chan *chan);
void (*ch_doorbell)(struct dw_edma_chan *chan);
void (*ch_enable)(struct dw_edma_chan *chan);
void (*ch_config)(struct dw_edma_chan *chan);
@@ -164,6 +165,15 @@ struct dw_edma_chan *dchan2dw_edma_chan(struct dma_chan *dchan)
return vc2dw_edma_chan(to_virt_chan(dchan));
}
+/*
+ * Get the current DMA running index.
+ * < 0: channel not initialized yet, or hardware was reset by PCI link loss
+ */
+static inline int dw_edma_core_ll_cur_idx(struct dw_edma_chan *chan)
+{
+ return chan->dw->core->ll_cur_idx(chan);
+}
+
static inline u64 dw_edma_core_get_ll_paddr(struct dw_edma_chan *chan)
{
if (chan->dir == EDMA_DIR_WRITE)
diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c
index 7b4591f984ad8b6f9909db16775368ff471db2b8..edc71a4dbc798386508e15f44e85c23e7e50f2ee 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-core.c
@@ -504,6 +504,22 @@ static void dw_edma_v0_core_ch_doorbell(struct dw_edma_chan *chan)
FIELD_PREP(EDMA_V0_DOORBELL_CH_MASK, chan->id));
}
+static int dw_edma_v0_core_ll_cur_idx(struct dw_edma_chan *chan)
+{
+ u64 paddr;
+ u32 val;
+
+ /* The LL region never crosses a 4G boundary, so check the low 32 bits only */
+ val = GET_CH_32(chan->dw, chan->dir, chan->id, llp.lsb);
+ paddr = dw_edma_core_get_ll_paddr(chan);
+
+ /* DMA is not set up yet, or the engine was reset by PCIe link loss */
+ if (!val)
+ return -EINVAL;
+
+ return (val - (paddr & 0xFFFFFFFF)) / EDMA_LL_SZ;
+}
+
/* eDMA debugfs callbacks */
static void dw_edma_v0_core_debugfs_on(struct dw_edma *dw)
{
@@ -517,6 +533,7 @@ static const struct dw_edma_core_ops dw_edma_v0_core = {
.handle_int = dw_edma_v0_core_handle_int,
.ll_data = dw_edma_v0_core_ll_data,
.ll_link = dw_edma_v0_core_ll_link,
+ .ll_cur_idx = dw_edma_v0_core_ll_cur_idx,
.ch_doorbell = dw_edma_v0_core_ch_doorbell,
.ch_enable = dw_edma_v0_core_ch_enable,
.ch_config = dw_edma_v0_core_ch_config,
diff --git a/drivers/dma/dw-edma/dw-hdma-v0-core.c b/drivers/dma/dw-edma/dw-hdma-v0-core.c
index 7f9fe3a6edd94583fd09c80a8d79527ed6383a8c..8e5bdcd208b5c2553ac45f744a4c678932ea5a03 100644
--- a/drivers/dma/dw-edma/dw-hdma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-hdma-v0-core.c
@@ -285,6 +285,22 @@ static void dw_hdma_v0_core_ch_doorbell(struct dw_edma_chan *chan)
SET_CH_32(dw, chan->dir, chan->id, doorbell, HDMA_V0_DOORBELL_START);
}
+static int dw_hdma_v0_core_ll_cur_idx(struct dw_edma_chan *chan)
+{
+ u64 paddr;
+ u32 val;
+
+ /* The LL region never crosses a 4G boundary, so check the low 32 bits only */
+ val = GET_CH_32(chan->dw, chan->dir, chan->id, llp.lsb);
+ paddr = dw_edma_core_get_ll_paddr(chan);
+
+ /* DMA is not set up yet, or the engine was reset by PCIe link loss */
+ if (!val)
+ return -EINVAL;
+
+ return (val - (paddr & 0xFFFFFFFF)) / EDMA_LL_SZ;
+}
+
/* HDMA debugfs callbacks */
static void dw_hdma_v0_core_debugfs_on(struct dw_edma *dw)
{
@@ -298,6 +314,7 @@ static const struct dw_edma_core_ops dw_hdma_v0_core = {
.handle_int = dw_hdma_v0_core_handle_int,
.ll_data = dw_hdma_v0_core_ll_data,
.ll_link = dw_hdma_v0_core_ll_link,
+ .ll_cur_idx = dw_hdma_v0_core_ll_cur_idx,
.ch_doorbell = dw_hdma_v0_core_ch_doorbell,
.ch_enable = dw_hdma_v0_core_ch_enable,
.ch_config = dw_hdma_v0_core_ch_config,
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread* [PATCH RFT 2/5] dmaengine: dw-edma: Move dw_hdma_set_callback_result() up
2026-01-09 20:13 [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running Frank Li
2026-01-09 20:13 ` [PATCH RFT 1/5] dmaengine: dw-edma: Add dw_edma_core_ll_cur_idx() to get completed link entry pos Frank Li
@ 2026-01-09 20:13 ` Frank Li
2026-01-09 20:13 ` [PATCH RFT 3/5] dmaengine: dw-edma: Make DMA link list work as a circular buffer Frank Li
` (4 subsequent siblings)
6 siblings, 0 replies; 14+ messages in thread
From: Frank Li @ 2026-01-09 20:13 UTC (permalink / raw)
To: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Manivannan Sadhasivam,
Krzysztof Wilczyński, Kishon Vijay Abraham I, Bjorn Helgaas,
Christoph Hellwig, Niklas Cassel
Cc: dmaengine, linux-kernel, linux-hardening, linux-pci, linux-nvme,
Damien Le Moal, imx, Frank Li
Move dw_hdma_set_callback_result() before dw_edma_device_tx_status() to
avoid a forward declaration.
No functional change.
Signed-off-by: Frank Li <Frank.Li@nxp.com>
---
drivers/dma/dw-edma/dw-edma-core.c | 50 +++++++++++++++++++-------------------
1 file changed, 25 insertions(+), 25 deletions(-)
diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index 1c8aef5e03b0e2c93aec9f1cb0588b4e8b1508d9..9fb7ae4001207b2ccb058d6efa9856dded379b8f 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -106,6 +106,31 @@ static int dw_edma_start_transfer(struct dw_edma_chan *chan)
return 1;
}
+static void dw_hdma_set_callback_result(struct virt_dma_desc *vd,
+ enum dmaengine_tx_result result)
+{
+ u32 residue = 0;
+ struct dw_edma_desc *desc;
+ struct dmaengine_result *res;
+
+ if (!vd->tx.callback_result)
+ return;
+
+ desc = vd2dw_edma_desc(vd);
+ if (desc) {
+ residue = desc->alloc_sz;
+
+ if (result == DMA_TRANS_NOERROR)
+ residue -= desc->burst[desc->start_burst - 1].xfer_sz;
+ else if (desc->done_burst)
+ residue -= desc->burst[desc->done_burst - 1].xfer_sz;
+ }
+
+ res = &vd->tx_result;
+ res->result = result;
+ res->residue = residue;
+}
+
static void dw_edma_device_caps(struct dma_chan *dchan,
struct dma_slave_caps *caps)
{
@@ -488,31 +513,6 @@ dw_edma_device_prep_interleaved_dma(struct dma_chan *dchan,
return dw_edma_device_transfer(&xfer, dw_edma_device_get_config(dchan, NULL));
}
-static void dw_hdma_set_callback_result(struct virt_dma_desc *vd,
- enum dmaengine_tx_result result)
-{
- u32 residue = 0;
- struct dw_edma_desc *desc;
- struct dmaengine_result *res;
-
- if (!vd->tx.callback_result)
- return;
-
- desc = vd2dw_edma_desc(vd);
- if (desc) {
- residue = desc->alloc_sz;
-
- if (result == DMA_TRANS_NOERROR)
- residue -= desc->burst[desc->start_burst - 1].xfer_sz;
- else if (desc->done_burst)
- residue -= desc->burst[desc->done_burst - 1].xfer_sz;
- }
-
- res = &vd->tx_result;
- res->result = result;
- res->residue = residue;
-}
-
static void dw_edma_done_interrupt(struct dw_edma_chan *chan)
{
struct dw_edma_desc *desc;
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread* [PATCH RFT 3/5] dmaengine: dw-edma: Make DMA link list work as a circular buffer
2026-01-09 20:13 [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running Frank Li
2026-01-09 20:13 ` [PATCH RFT 1/5] dmaengine: dw-edma: Add dw_edma_core_ll_cur_idx() to get completed link entry pos Frank Li
2026-01-09 20:13 ` [PATCH RFT 2/5] dmaengine: dw-edma: Move dw_hdma_set_callback_result() up Frank Li
@ 2026-01-09 20:13 ` Frank Li
2026-01-09 20:13 ` [PATCH RFT 4/5] dmaengine: dw-edma: Dynamitc append new request during dmaengine running Frank Li
` (3 subsequent siblings)
6 siblings, 0 replies; 14+ messages in thread
From: Frank Li @ 2026-01-09 20:13 UTC (permalink / raw)
To: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Manivannan Sadhasivam,
Krzysztof Wilczyński, Kishon Vijay Abraham I, Bjorn Helgaas,
Christoph Hellwig, Niklas Cassel
Cc: dmaengine, linux-kernel, linux-hardening, linux-pci, linux-nvme,
Damien Le Moal, imx, Frank Li
The existing code rebuilds the entire link list from the beginning and
resets the DMA link header for each transfer, which is unnecessary.
The DMA link list can be treated as a circular buffer, where new DMA
requests are appended at ll_head with the appropriate CB flags and the
doorbell is rung, without rebuilding the whole list.
Switch to this circular-buffer model to prepare for dynamically adding
new requests while the DMA engine is running.
Signed-off-by: Frank Li <Frank.Li@nxp.com>
---
drivers/dma/dw-edma/dw-edma-core.c | 57 +++++++++++++++++++++++++++++---------
drivers/dma/dw-edma/dw-edma-core.h | 28 ++++++++++++++++++-
2 files changed, 71 insertions(+), 14 deletions(-)
diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index 9fb7ae4001207b2ccb058d6efa9856dded379b8f..678bbc4e65f0e2fced6efec88a3af6935d833bc6 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -51,7 +51,6 @@ dw_edma_alloc_desc(struct dw_edma_chan *chan, u32 nburst)
desc->chan = chan;
desc->nburst = nburst;
- desc->cb = true;
return desc;
}
@@ -61,27 +60,56 @@ static void vchan_free_desc(struct virt_dma_desc *vdesc)
kfree(vd2dw_edma_desc(vdesc));
}
+static void dw_edma_core_reset_ll(struct dw_edma_chan *chan)
+{
+ chan->ll_head = 0;
+ chan->ll_end = 0;
+ chan->cb = true;
+
+ dw_edma_core_ll_link(chan, chan->ll_max - 1, chan->cb,
+ chan->ll_region.paddr);
+
+ dw_edma_core_ch_enable(chan);
+}
+
+static u32 dw_edma_core_get_free_num(struct dw_edma_chan *chan)
+{
+ /*
+ * Max entries is ll_max - 1 because last one used for link back to
+ * start of ll_region.
+ */
+ return (chan->ll_end + chan->ll_max - 2 - chan->ll_head) %
+ (chan->ll_max - 1);
+}
+
static void dw_edma_core_start(struct dw_edma_desc *desc, bool first)
{
struct dw_edma_chan *chan = desc->chan;
u32 i = 0;
+ u32 free;
+
+ for (i = desc->start_burst; i < desc->nburst; i++) {
+ free = dw_edma_core_get_free_num(chan);
- for (i = 0; i < desc->nburst; i++) {
- if (i == chan->ll_max - 1)
+ if (!free)
break;
- dw_edma_core_ll_data(chan, &desc->burst[i + desc->start_burst],
- i, desc->cb,
- i == desc->nburst - 1 || i == chan->ll_max - 2);
+ /* Enable irq for last free entry or last burst */
+ dw_edma_core_ll_data(chan, &desc->burst[i],
+ chan->ll_head, chan->cb,
+ i == desc->nburst - 1 || free == 1);
+
+ chan->ll_head++;
+
+ if (chan->ll_head == chan->ll_max - 1) {
+ chan->cb = !chan->cb;
+ chan->ll_head = 0;
+ }
}
desc->done_burst = desc->start_burst;
desc->start_burst += i;
-
- dw_edma_core_ll_link(chan, i, desc->cb, chan->ll_region.paddr);
-
- if (first)
- dw_edma_core_ch_enable(chan);
+ desc->ll_end = chan->ll_head;
dw_edma_core_ch_doorbell(chan);
}
@@ -90,6 +118,10 @@ static int dw_edma_start_transfer(struct dw_edma_chan *chan)
{
struct dw_edma_desc *desc;
struct virt_dma_desc *vd;
+ int index = dw_edma_core_ll_cur_idx(chan);
+
+ if (index < 0)
+ dw_edma_core_reset_ll(chan);
vd = vchan_next_desc(&chan->vc);
if (!vd)
@@ -101,8 +133,6 @@ static int dw_edma_start_transfer(struct dw_edma_chan *chan)
dw_edma_core_start(desc, !desc->start_burst);
- desc->cb = !desc->cb;
-
return 1;
}
@@ -530,6 +560,7 @@ static void dw_edma_done_interrupt(struct dw_edma_chan *chan)
DMA_TRANS_NOERROR);
list_del(&vd->node);
vchan_cookie_complete(vd);
+ chan->ll_end = desc->ll_end;
}
/* Continue transferring if there are remaining chunks or issued requests.
diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h
index d68c4592c6177e4fe2a2ae8a645bb065279ac45d..fd4b086a36441cc3209131e4274d6c47de4d616c 100644
--- a/drivers/dma/dw-edma/dw-edma-core.h
+++ b/drivers/dma/dw-edma/dw-edma-core.h
@@ -60,9 +60,10 @@ struct dw_edma_desc {
u32 alloc_sz;
u32 xfer_sz;
+ u32 ll_end;
+
u32 done_burst;
u32 start_burst;
- u8 cb;
u32 nburst;
struct dw_edma_burst burst[] __counted_by(nburst);
};
@@ -73,9 +74,34 @@ struct dw_edma_chan {
int id;
enum dw_edma_dir dir;
+ /*
+ * New entries are added at ll_head.
+ *
+ * ll_end ll_head
+ * │ │
+ * ▼ ▼
+ * ┌─────────────────────────────────────────┌─┐
+ * │SSSSSSSDDDDDDDDDDDDDDDDDDDDDSSSSSSSSSSSSS│ │
+ * └─────────────────────────────────────────└┬┘
+ * ▲ │
+ * └─────────────────────────────────────────┘
+ * DMA Link To Region Start
+ * D: eDMA owned LL entry
+ * S: Software owned LL entry.
+ *
+ * ll_head == ll_end means all entries are owned by software and all
+ * previous DMA transfers are already done.
+ *
+ * Software always owns at least one entry, so all D is impossible.
+ */
+ u32 ll_head;
+ u32 ll_end;
+
u32 ll_max;
struct dw_edma_region ll_region; /* Linked list */
+ bool cb;
+
struct msi_msg msi;
enum dw_edma_request request;
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread* [PATCH RFT 4/5] dmaengine: dw-edma: Dynamitc append new request during dmaengine running
2026-01-09 20:13 [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running Frank Li
` (2 preceding siblings ...)
2026-01-09 20:13 ` [PATCH RFT 3/5] dmaengine: dw-edma: Make DMA link list work as a circular buffer Frank Li
@ 2026-01-09 20:13 ` Frank Li
2026-01-23 10:41 ` Niklas Cassel
2026-01-09 20:13 ` [PATCH RFT 5/5] dmaengine: dw-edma: Add trace support Frank Li
` (2 subsequent siblings)
6 siblings, 1 reply; 14+ messages in thread
From: Frank Li @ 2026-01-09 20:13 UTC (permalink / raw)
To: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Manivannan Sadhasivam,
Krzysztof Wilczyński, Kishon Vijay Abraham I, Bjorn Helgaas,
Christoph Hellwig, Niklas Cassel
Cc: dmaengine, linux-kernel, linux-hardening, linux-pci, linux-nvme,
Damien Le Moal, imx, Frank Li
This uses the PCS-CCS-CB-TCB Producer-Consumer Synchronization module, which
supports appending new DMA requests while the dmaengine is running.
Append new requests while the dmaengine is running.
However, the hardware appears to have a bug: a doorbell can be missed while
the engine is running. Add a workaround that rings the doorbell again when
the engine is found to have stopped.
This yields a performance gain of more than 10%.
Before:
Rnd read, 4KB, QD=32, 4 jobs: IOPS=33.4k, BW=130MiB/s (137MB/s)
After:
Rnd read, 4KB, QD=32, 4 jobs: IOPS=38.8k, BW=151MiB/s (159MB/s)
Signed-off-by: Frank Li <Frank.Li@nxp.com>
---
drivers/dma/dw-edma/dw-edma-core.c | 104 ++++++++++++++++++++++++++--------
drivers/dma/dw-edma/dw-edma-core.h | 2 +
drivers/dma/dw-edma/dw-edma-v0-core.c | 22 ++++++-
3 files changed, 102 insertions(+), 26 deletions(-)
diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index 678bbc4e65f0e2fced6efec88a3af6935d833bc6..5aacd04bd2da4a65aabec48f6631f6f8882eecfd 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -65,6 +65,7 @@ static void dw_edma_core_reset_ll(struct dw_edma_chan *chan)
chan->ll_head = 0;
chan->ll_end = 0;
chan->cb = true;
+ chan->cur_idx = -1;
dw_edma_core_ll_link(chan, chan->ll_max - 1, chan->cb,
chan->ll_region.paddr);
@@ -82,6 +83,12 @@ static u32 dw_edma_core_get_free_num(struct dw_edma_chan *chan)
(chan->ll_max - 1);
}
+static u32 dw_edma_core_get_done_num(struct dw_edma_chan *chan, u32 index)
+{
+ return (index - chan->ll_end + chan->ll_max - 1) % (chan->ll_max - 1);
+}
+
+/* Need hold vc.lock */
static void dw_edma_core_start(struct dw_edma_desc *desc, bool first)
{
struct dw_edma_chan *chan = desc->chan;
@@ -94,6 +101,11 @@ static void dw_edma_core_start(struct dw_edma_desc *desc, bool first)
if (!free)
break;
+ /* Update the link entry's CB before writing the last data entry */
+ if (chan->ll_head == chan->ll_max - 2)
+ dw_edma_core_ll_link(chan, chan->ll_max - 1, chan->cb,
+ chan->ll_region.paddr);
+
/* Enable irq for last free entry or last burst */
dw_edma_core_ll_data(chan, &desc->burst[i],
chan->ll_head, chan->cb,
@@ -108,32 +120,36 @@ static void dw_edma_core_start(struct dw_edma_desc *desc, bool first)
}
desc->done_burst = desc->start_burst;
- desc->start_burst += i;
+ desc->start_burst = i;
desc->ll_end = chan->ll_head;
-
- dw_edma_core_ch_doorbell(chan);
}
+/* Need hold vc.lock */
static int dw_edma_start_transfer(struct dw_edma_chan *chan)
{
struct dw_edma_desc *desc;
struct virt_dma_desc *vd;
int index = dw_edma_core_ll_cur_idx(chan);
+ int ret = 0;
if (index < 0)
dw_edma_core_reset_ll(chan);
- vd = vchan_next_desc(&chan->vc);
- if (!vd)
- return 0;
+ list_for_each_entry(vd, &chan->vc.desc_issued, node) {
+ if (!dw_edma_core_get_free_num(chan))
+ return ret;
- desc = vd2dw_edma_desc(vd);
- if (!desc)
- return 0;
+ desc = vd2dw_edma_desc(vd);
- dw_edma_core_start(desc, !desc->start_burst);
+ if (desc->start_burst != desc->nburst) {
+ dw_edma_core_start(desc, !desc->start_burst);
+ ret = 1;
+ } else {
+ break;
+ }
+ }
- return 1;
+ return ret;
}
static void dw_hdma_set_callback_result(struct virt_dma_desc *vd,
@@ -161,6 +177,31 @@ static void dw_hdma_set_callback_result(struct virt_dma_desc *vd,
res->residue = residue;
}
+/* Need hold vc.lock */
+static void dw_edma_ll_clean_pending(struct dw_edma_chan *chan, int idx)
+{
+ struct virt_dma_desc *vd, *_vd;
+
+ list_for_each_entry_safe(vd, _vd, &chan->vc.desc_issued, node) {
+ struct dw_edma_desc *desc = vd2dw_edma_desc(vd);
+
+ if (desc->start_burst == desc->nburst) {
+ if (dw_edma_core_get_done_num(chan, idx) >=
+ dw_edma_core_get_done_num(chan, desc->ll_end)) {
+
+ dw_hdma_set_callback_result(vd,
+ DMA_TRANS_NOERROR);
+ list_del(&vd->node);
+ vchan_cookie_complete(vd);
+ chan->ll_end = desc->ll_end;
+ }
+ } else {
+ break;
+ }
+ }
+
+}
+
static void dw_edma_device_caps(struct dma_chan *dchan,
struct dma_slave_caps *caps)
{
@@ -272,12 +313,13 @@ static void dw_edma_device_issue_pending(struct dma_chan *dchan)
return;
spin_lock_irqsave(&chan->vc.lock, flags);
- if (vchan_issue_pending(&chan->vc) && chan->request == EDMA_REQ_NONE &&
- chan->status == EDMA_ST_IDLE) {
+ if (vchan_issue_pending(&chan->vc)) {
chan->status = EDMA_ST_BUSY;
dw_edma_start_transfer(chan);
}
spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+ dw_edma_core_ch_doorbell(chan);
}
static enum dma_status
@@ -290,7 +332,23 @@ dw_edma_device_tx_status(struct dma_chan *dchan, dma_cookie_t cookie,
unsigned long flags;
enum dma_status ret;
u32 residue = 0;
+ int idx;
+ ret = dma_cookie_status(dchan, cookie, txstate);
+ if (ret == DMA_COMPLETE)
+ return ret;
+
+ spin_lock_irqsave(&chan->vc.lock, flags);
+ idx = dw_edma_core_ll_cur_idx(chan);
+ if (idx != chan->cur_idx) {
+ chan->cur_idx = idx;
+
+ dw_edma_ll_clean_pending(chan, idx);
+ dw_edma_start_transfer(chan);
+ }
+ spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+ /* check gain because dw_edma_ll_clean_pending() may update cookie */
ret = dma_cookie_status(dchan, cookie, txstate);
if (ret == DMA_COMPLETE)
return ret;
@@ -545,26 +603,20 @@ dw_edma_device_prep_interleaved_dma(struct dma_chan *dchan,
static void dw_edma_done_interrupt(struct dw_edma_chan *chan)
{
- struct dw_edma_desc *desc;
struct virt_dma_desc *vd;
unsigned long flags;
+ int idx;
spin_lock_irqsave(&chan->vc.lock, flags);
+ idx = dw_edma_core_ll_cur_idx(chan);
+ if (idx != chan->cur_idx) {
+ chan->cur_idx = idx;
+ dw_edma_ll_clean_pending(chan, idx);
+ }
vd = vchan_next_desc(&chan->vc);
if (vd) {
switch (chan->request) {
case EDMA_REQ_NONE:
- desc = vd2dw_edma_desc(vd);
- if (desc->start_burst >= desc->nburst) {
- dw_hdma_set_callback_result(vd,
- DMA_TRANS_NOERROR);
- list_del(&vd->node);
- vchan_cookie_complete(vd);
- chan->ll_end = desc->ll_end;
- }
-
- /* Continue transferring if there are remaining chunks or issued requests.
- */
chan->status = dw_edma_start_transfer(chan) ? EDMA_ST_BUSY : EDMA_ST_IDLE;
break;
@@ -585,6 +637,8 @@ static void dw_edma_done_interrupt(struct dw_edma_chan *chan)
}
}
spin_unlock_irqrestore(&chan->vc.lock, flags);
+
+ dw_edma_core_ch_doorbell(chan);
}
static void dw_edma_abort_interrupt(struct dw_edma_chan *chan)
diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h
index fd4b086a36441cc3209131e4274d6c47de4d616c..94d49f8359b99a9b0f8ca708edf81ca854dff4c2 100644
--- a/drivers/dma/dw-edma/dw-edma-core.h
+++ b/drivers/dma/dw-edma/dw-edma-core.h
@@ -108,6 +108,8 @@ struct dw_edma_chan {
enum dw_edma_status status;
u8 configured;
+ int cur_idx;
+
struct dma_slave_config config;
};
diff --git a/drivers/dma/dw-edma/dw-edma-v0-core.c b/drivers/dma/dw-edma/dw-edma-v0-core.c
index edc71a4dbc798386508e15f44e85c23e7e50f2ee..bb9a1682f943dafef28bcf52ab83f3485068f8ed 100644
--- a/drivers/dma/dw-edma/dw-edma-v0-core.c
+++ b/drivers/dma/dw-edma/dw-edma-v0-core.c
@@ -499,7 +499,6 @@ static void dw_edma_v0_core_ch_doorbell(struct dw_edma_chan *chan)
dw_edma_v0_sync_ll_data(chan);
- /* Doorbell */
SET_RW_32(dw, chan->dir, doorbell,
FIELD_PREP(EDMA_V0_DOORBELL_CH_MASK, chan->id));
}
@@ -517,6 +516,27 @@ static int dw_edma_v0_core_ll_cur_idx(struct dw_edma_chan *chan)
if (!val)
return -EINVAL;
+ /*
+ * The DMA engine appears to have a hardware bug: a doorbell rung while
+ * the engine is running can be missed, so the most recently updated
+ * descriptor is never fetched and the engine stops.
+ *
+ * The issue most likely happens as follows:
+ *
+ * DMA Engine | SW
+ * ======================================
+ * 1 send Read req for LL
+ * 2 update LL
+ * 3 doorbell
+ * 4 *Missed doorbell*
+ * 5 Get old LL data
+ * 6 DMA stop
+ *
+ * Workaround: ring the doorbell again when the DMA engine is found stopped.
+ */
+ if (dw_edma_v0_core_ch_status(chan) != DMA_IN_PROGRESS)
+ dw_edma_v0_core_ch_doorbell(chan);
+
return (val - (paddr & 0xFFFFFFFF)) / EDMA_LL_SZ;
}
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread* Re: [PATCH RFT 4/5] dmaengine: dw-edma: Dynamitc append new request during dmaengine running
2026-01-09 20:13 ` [PATCH RFT 4/5] dmaengine: dw-edma: Dynamitc append new request during dmaengine running Frank Li
@ 2026-01-23 10:41 ` Niklas Cassel
2026-01-23 14:15 ` Frank Li
0 siblings, 1 reply; 14+ messages in thread
From: Niklas Cassel @ 2026-01-23 10:41 UTC (permalink / raw)
To: Frank Li
Cc: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Krzysztof Wilczyński,
Kishon Vijay Abraham I, Bjorn Helgaas, Christoph Hellwig,
dmaengine, linux-kernel, linux-hardening, linux-pci, linux-nvme,
Damien Le Moal, imx
On Fri, Jan 09, 2026 at 03:13:28PM -0500, Frank Li wrote:
> This use PCS-CCS-CB-TCB Producer-Consumer Synchronization module, which
> support append new DMA request during dmaengine runnings.
>
> Append new request during dmaengine runnings.
>
> But look like hardware have bug, which missed doorbell when engine is
> running. So add workaround to push doorbelll again when found engine stop.
>
> Get more than 10% performance gain.
>
> The before
> Rnd read, 4KB, QD=32, 4 jobs: IOPS=33.4k, BW=130MiB/s (137MB/s)
>
> After
> Rnd read, 4KB, QD=32, 4 jobs: IOPS=38.8k, BW=151MiB/s (159MB/s)
>
> Signed-off-by: Frank Li <Frank.Li@nxp.com>
> ---
Hello Frank,
First of all, I hope that your:
[PATCH v3 0/9] dmaengine: Add new API to combine configuration and descriptor preparation
series will make it to the upcoming 6.20/7.0 merge window.
This RFT series however breaks pci-epf-test:
Before:
# RUN pci_ep_data_transfer.dma.READ_TEST ...
# OK pci_ep_data_transfer.dma.READ_TEST
ok 14 pci_ep_data_transfer.dma.READ_TEST
# RUN pci_ep_data_transfer.dma.WRITE_TEST ...
# OK pci_ep_data_transfer.dma.WRITE_TEST
ok 15 pci_ep_data_transfer.dma.WRITE_TEST
After:
# RUN pci_ep_data_transfer.dma.READ_TEST ...
# READ_TEST: Test terminated by timeout
# FAIL pci_ep_data_transfer.dma.READ_TEST
not ok 14 pci_ep_data_transfer.dma.READ_TEST
# RUN pci_ep_data_transfer.dma.WRITE_TEST ...
# WRITE_TEST: Test terminated by timeout
# FAIL pci_ep_data_transfer.dma.WRITE_TEST
not ok 15 pci_ep_data_transfer.dma.WRITE_TEST
After a bisect, first bad commit:
commit 352fd8d5ed468ea616eb4974b5ac19203528b207
Author: Frank Li <Frank.Li@nxp.com>
Date: Fri Jan 9 15:13:28 2026 -0500
dmaengine: dw-edma: Dynamitc append new request during dmaengine running
Kind regards,
Niklas
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH RFT 4/5] dmaengine: dw-edma: Dynamitc append new request during dmaengine running
2026-01-23 10:41 ` Niklas Cassel
@ 2026-01-23 14:15 ` Frank Li
0 siblings, 0 replies; 14+ messages in thread
From: Frank Li @ 2026-01-23 14:15 UTC (permalink / raw)
To: Niklas Cassel
Cc: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Krzysztof Wilczyński,
Kishon Vijay Abraham I, Bjorn Helgaas, Christoph Hellwig,
dmaengine, linux-kernel, linux-hardening, linux-pci, linux-nvme,
Damien Le Moal, imx
On Fri, Jan 23, 2026 at 11:41:54AM +0100, Niklas Cassel wrote:
> On Fri, Jan 09, 2026 at 03:13:28PM -0500, Frank Li wrote:
> > This use PCS-CCS-CB-TCB Producer-Consumer Synchronization module, which
> > support append new DMA request during dmaengine runnings.
> >
> > Append new request during dmaengine runnings.
> >
> > But look like hardware have bug, which missed doorbell when engine is
> > running. So add workaround to push doorbelll again when found engine stop.
> >
> > Get more than 10% performance gain.
> >
> > The before
> > Rnd read, 4KB, QD=32, 4 jobs: IOPS=33.4k, BW=130MiB/s (137MB/s)
> >
> > After
> > Rnd read, 4KB, QD=32, 4 jobs: IOPS=38.8k, BW=151MiB/s (159MB/s)
> >
> > Signed-off-by: Frank Li <Frank.Li@nxp.com>
> > ---
>
> Hello Frank,
>
> First of all, I hope that your:
> [PATCH v3 0/9] dmaengine: Add new API to combine configuration and descriptor preparation
> series will make it to the upcoming 6.20/7.0 merge window.
>
>
> This RFT series however breaks pci-epf-test:
>
> Before:
> # RUN pci_ep_data_transfer.dma.READ_TEST ...
> # OK pci_ep_data_transfer.dma.READ_TEST
> ok 14 pci_ep_data_transfer.dma.READ_TEST
> # RUN pci_ep_data_transfer.dma.WRITE_TEST ...
> # OK pci_ep_data_transfer.dma.WRITE_TEST
> ok 15 pci_ep_data_transfer.dma.WRITE_TEST
>
> After:
> # RUN pci_ep_data_transfer.dma.READ_TEST ...
> # READ_TEST: Test terminated by timeout
> # FAIL pci_ep_data_transfer.dma.READ_TEST
> not ok 14 pci_ep_data_transfer.dma.READ_TEST
> # RUN pci_ep_data_transfer.dma.WRITE_TEST ...
> # WRITE_TEST: Test terminated by timeout
> # FAIL pci_ep_data_transfer.dma.WRITE_TEST
> not ok 15 pci_ep_data_transfer.dma.WRITE_TEST
>
>
> After a bisect, first bad commit:
> commit 352fd8d5ed468ea616eb4974b5ac19203528b207
> Author: Frank Li <Frank.Li@nxp.com>
> Date: Fri Jan 9 15:13:28 2026 -0500
>
> dmaengine: dw-edma: Dynamitc append new request during dmaengine running
>
Thanks, let me try to fix it.
Frank
>
>
> Kind regards,
> Niklas
^ permalink raw reply [flat|nested] 14+ messages in thread
* [PATCH RFT 5/5] dmaengine: dw-edma: Add trace support
2026-01-09 20:13 [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running Frank Li
` (3 preceding siblings ...)
2026-01-09 20:13 ` [PATCH RFT 4/5] dmaengine: dw-edma: Dynamitc append new request during dmaengine running Frank Li
@ 2026-01-09 20:13 ` Frank Li
2026-01-12 13:35 ` [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running Niklas Cassel
2026-06-04 7:08 ` Koichiro Den
6 siblings, 0 replies; 14+ messages in thread
From: Frank Li @ 2026-01-09 20:13 UTC (permalink / raw)
To: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Manivannan Sadhasivam,
Krzysztof Wilczyński, Kishon Vijay Abraham I, Bjorn Helgaas,
Christoph Hellwig, Niklas Cassel
Cc: dmaengine, linux-kernel, linux-hardening, linux-pci, linux-nvme,
Damien Le Moal, imx, Frank Li
Add trace support to help debug eDMA problems.
Signed-off-by: Frank Li <Frank.Li@nxp.com>
---
drivers/dma/dw-edma/Makefile | 3 +
drivers/dma/dw-edma/dw-edma-core.c | 12 +++
drivers/dma/dw-edma/dw-edma-core.h | 2 +
drivers/dma/dw-edma/dw-edma-trace.c | 4 +
drivers/dma/dw-edma/dw-edma-trace.h | 150 ++++++++++++++++++++++++++++++++++++
5 files changed, 171 insertions(+)
diff --git a/drivers/dma/dw-edma/Makefile b/drivers/dma/dw-edma/Makefile
index 83ab58f87760831883bcfad788306e1722634a83..3e31e7d92f3ecb577136bbb0e430801b6f8ff2b3 100644
--- a/drivers/dma/dw-edma/Makefile
+++ b/drivers/dma/dw-edma/Makefile
@@ -1,9 +1,12 @@
# SPDX-License-Identifier: GPL-2.0
+dw-edma-trace-$(CONFIG_TRACING) := dw-edma-trace.o
+CFLAGS_dw-edma-trace.o := -I$(src)
obj-$(CONFIG_DW_EDMA) += dw-edma.o
dw-edma-$(CONFIG_DEBUG_FS) := dw-edma-v0-debugfs.o \
dw-hdma-v0-debugfs.o
dw-edma-objs := dw-edma-core.o \
dw-edma-v0-core.o \
+ ${dw-edma-trace-y} \
dw-hdma-v0-core.o $(dw-edma-y)
obj-$(CONFIG_DW_EDMA_PCIE) += dw-edma-pcie.o
diff --git a/drivers/dma/dw-edma/dw-edma-core.c b/drivers/dma/dw-edma/dw-edma-core.c
index 5aacd04bd2da4a65aabec48f6631f6f8882eecfd..339e372eb8cf60c3baa0de3e3576865e27d91716 100644
--- a/drivers/dma/dw-edma/dw-edma-core.c
+++ b/drivers/dma/dw-edma/dw-edma-core.c
@@ -111,6 +111,12 @@ static void dw_edma_core_start(struct dw_edma_desc *desc, bool first)
chan->ll_head, chan->cb,
i == desc->nburst - 1 || free == 1);
+ trace_edma_fill_ll(chan, chan->ll_head,
+ desc->vd.tx.cookie,
+ desc->burst[i].sar,
+ desc->burst[i].dar, desc->burst[i].sz,
+ chan->cb);
+
chan->ll_head++;
if (chan->ll_head == chan->ll_max - 1) {
@@ -141,6 +147,8 @@ static int dw_edma_start_transfer(struct dw_edma_chan *chan)
desc = vd2dw_edma_desc(vd);
+ trace_edma_start_desc(desc);
+
if (desc->start_burst != desc->nburst) {
dw_edma_core_start(desc, !desc->start_burst);
ret = 1;
@@ -193,6 +201,7 @@ static void dw_edma_ll_clean_pending(struct dw_edma_chan *chan, int idx)
DMA_TRANS_NOERROR);
list_del(&vd->node);
vchan_cookie_complete(vd);
+ trace_edma_complete_desc(desc);
chan->ll_end = desc->ll_end;
}
} else {
@@ -348,6 +357,8 @@ dw_edma_device_tx_status(struct dma_chan *dchan, dma_cookie_t cookie,
}
spin_unlock_irqrestore(&chan->vc.lock, flags);
+ trace_edma_tx_status_info(chan, idx);
+
/* check gain because dw_edma_ll_clean_pending() may update cookie */
ret = dma_cookie_status(dchan, cookie, txstate);
if (ret == DMA_COMPLETE)
@@ -609,6 +620,7 @@ static void dw_edma_done_interrupt(struct dw_edma_chan *chan)
spin_lock_irqsave(&chan->vc.lock, flags);
idx = dw_edma_core_ll_cur_idx(chan);
+ trace_edma_irq(chan, idx);
if (idx != chan->cur_idx) {
chan->cur_idx = idx;
dw_edma_ll_clean_pending(chan, idx);
diff --git a/drivers/dma/dw-edma/dw-edma-core.h b/drivers/dma/dw-edma/dw-edma-core.h
index 94d49f8359b99a9b0f8ca708edf81ca854dff4c2..ecc08dc0d34f4a86cc739dd12a1ce46ace58045c 100644
--- a/drivers/dma/dw-edma/dw-edma-core.h
+++ b/drivers/dma/dw-edma/dw-edma-core.h
@@ -140,6 +140,8 @@ struct dw_edma {
const struct dw_edma_core_ops *core;
};
+#include "dw-edma-trace.h"
+
typedef void (*dw_edma_handler_t)(struct dw_edma_chan *);
struct dw_edma_core_ops {
diff --git a/drivers/dma/dw-edma/dw-edma-trace.c b/drivers/dma/dw-edma/dw-edma-trace.c
new file mode 100644
index 0000000000000000000000000000000000000000..2620ad61a9436a8d21b2408f3613c585fba0d9bb
--- /dev/null
+++ b/drivers/dma/dw-edma/dw-edma-trace.c
@@ -0,0 +1,4 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#define CREATE_TRACE_POINTS
+#include "dw-edma-core.h"
diff --git a/drivers/dma/dw-edma/dw-edma-trace.h b/drivers/dma/dw-edma/dw-edma-trace.h
new file mode 100644
index 0000000000000000000000000000000000000000..3be77b42b04947407536523d1535d1eb7d9bdf71
--- /dev/null
+++ b/drivers/dma/dw-edma/dw-edma-trace.h
@@ -0,0 +1,150 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Copyright 2023 NXP.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM dw_edma
+
+#if !defined(__LINUX_DW_EDMA_TRACE) || defined(TRACE_HEADER_MULTI_READ)
+#define __LINUX_DW_EDMA_TRACE
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(edma_desc_info, /* event class: snapshot of one descriptor's burst counters, cookie and owning channel */
+ TP_PROTO(struct dw_edma_desc *desc),
+ TP_ARGS(desc),
+ TP_STRUCT__entry(
+ __field(u32, nburst)
+ __field(u32, start_burst)
+ __field(u32, ll_end)
+ __field(u32, cookie)
+ __field(u32, id)
+ __field(u8, dir)
+ ),
+ TP_fast_assign( /* one ';'-terminated statement per recorded field */
+ __entry->nburst = desc->nburst;
+ __entry->start_burst = desc->start_burst;
+ __entry->ll_end = desc->ll_end;
+ __entry->id = desc->chan->id;
+ __entry->dir = desc->chan->dir;
+ __entry->cookie = desc->vd.tx.cookie;
+ ),
+ TP_printk("chan %d%c desc %d, nburst %d, start_burst %d, ll_end %d", /* no trailing '\n': the trace output code ends each event line itself */
+ __entry->id,
+ __entry->dir ? 'R' : 'W', /* 'R' when dir is non-zero, 'W' otherwise */
+ __entry->cookie,
+ __entry->nburst,
+ __entry->start_burst,
+ __entry->ll_end)
+);
+
+DEFINE_EVENT(edma_desc_info, edma_start_desc, /* event of class edma_desc_info; hook is trace_edma_start_desc(desc) */
+ TP_PROTO(struct dw_edma_desc *desc),
+ TP_ARGS(desc)
+);
+
+DEFINE_EVENT(edma_desc_info, edma_complete_desc, /* event of class edma_desc_info; hook is trace_edma_complete_desc(desc) */
+ TP_PROTO(struct dw_edma_desc *desc),
+ TP_ARGS(desc)
+);
+
+DECLARE_EVENT_CLASS(edma_ll_info, /* event class: channel link-list bookkeeping and cookies, plus the idx supplied by the caller */
+ TP_PROTO(struct dw_edma_chan *chan, int idx),
+ TP_ARGS(chan, idx),
+ TP_STRUCT__entry(
+ __field(u32, head)
+ __field(u32, end)
+ __field(u32, total) /* taken from chan->ll_max; recorded but not part of the TP_printk() text */
+ __field(u32, index)
+ __field(u32, completed_cookie)
+ __field(u32, cookie)
+ __field(u32, id)
+ __field(u8, dir)
+ ),
+ TP_fast_assign( /* one ';'-terminated statement per recorded field */
+ __entry->head = chan->ll_head;
+ __entry->end = chan->ll_end;
+ __entry->total = chan->ll_max;
+ __entry->index = idx;
+ __entry->completed_cookie = chan->vc.chan.completed_cookie;
+ __entry->cookie = chan->vc.chan.cookie;
+ __entry->id = chan->id;
+ __entry->dir = chan->dir;
+ ),
+ TP_printk("chan %d%c head: %d end: %d: dma cur index: %d, complete cookie: %d, cookie: %d", /* no trailing '\n': the trace output code ends each event line itself */
+ __entry->id,
+ __entry->dir ? 'R' : 'W', /* 'R' when dir is non-zero, 'W' otherwise */
+ __entry->head,
+ __entry->end,
+ __entry->index,
+ __entry->completed_cookie,
+ __entry->cookie)
+);
+
+DEFINE_EVENT(edma_ll_info, edma_tx_status_info, /* event of class edma_ll_info; emitted via trace_edma_tx_status_info(chan, idx) */
+ TP_PROTO(struct dw_edma_chan *chan, int idx),
+ TP_ARGS(chan, idx)
+);
+
+DEFINE_EVENT(edma_ll_info, edma_irq, /* event of class edma_ll_info; emitted via trace_edma_irq(chan, idx) */
+ TP_PROTO(struct dw_edma_chan *chan, int idx),
+ TP_ARGS(chan, idx)
+);
+
+DEFINE_EVENT(edma_ll_info, emda_terminate_all, /* NOTE(review): "emda" is presumably a typo for "edma"; renaming also renames the trace_emda_terminate_all() hook, so callers must change with it */
+ TP_PROTO(struct dw_edma_chan *chan, int idx),
+ TP_ARGS(chan, idx)
+);
+
+DECLARE_EVENT_CLASS(edma_log_ll, /* event class: one link-list entry (index, cookie, src/dest address, size, flag) on a channel */
+ TP_PROTO(struct dw_edma_chan *chan, u32 idx, u32 cookie, u64 src, u64 dest, u32 sz, bool flag),
+ TP_ARGS(chan, idx, cookie, src, dest, sz, flag),
+ TP_STRUCT__entry(
+ __field(u32, idx)
+ __field(u64, src)
+ __field(u64, dest)
+ __field(u32, sz)
+ __field(u32, id)
+ __field(u32, cookie)
+ __field(bool, flag)
+ __field(u8, dir)
+ ),
+ TP_fast_assign( /* one ';'-terminated statement per recorded field */
+ __entry->idx = idx;
+ __entry->src = src;
+ __entry->dest = dest;
+ __entry->sz = sz;
+ __entry->id = chan->id;
+ __entry->dir = chan->dir;
+ __entry->cookie = cookie;
+ __entry->flag = flag;
+ ),
+ TP_printk("chan %d%c %d [%d] %c src: %08llx dest: %08llx sz: %04x", /* no trailing '\n': the trace output code ends each event line itself */
+ __entry->id,
+ __entry->dir ? 'R' : 'W', /* 'R' when dir is non-zero, 'W' otherwise */
+ __entry->cookie,
+ __entry->idx,
+ __entry->flag ? 'C' : 'c', /* upper-case 'C' when flag is set, lower-case 'c' when clear */
+ __entry->src,
+ __entry->dest,
+ __entry->sz)
+);
+
+DEFINE_EVENT(edma_log_ll, edma_fill_ll, /* event of class edma_log_ll; hook is trace_edma_fill_ll(chan, idx, cookie, src, dest, sz, flag) */
+ TP_PROTO(struct dw_edma_chan *chan, u32 idx, u32 cookie, u64 src, u64 dest, u32 sz, bool flag),
+ TP_ARGS(chan, idx, cookie, src, dest, sz, flag)
+);
+
+#endif
+
+/* this part must be outside header guard */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE dw-edma-trace
+
+#include <trace/define_trace.h>
--
2.34.1
^ permalink raw reply related [flat|nested] 14+ messages in thread* Re: [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running
2026-01-09 20:13 [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running Frank Li
` (4 preceding siblings ...)
2026-01-09 20:13 ` [PATCH RFT 5/5] dmaengine: dw-edma: Add trace support Frank Li
@ 2026-01-12 13:35 ` Niklas Cassel
2026-01-12 14:54 ` Frank Li
2026-06-04 7:08 ` Koichiro Den
6 siblings, 1 reply; 14+ messages in thread
From: Niklas Cassel @ 2026-01-12 13:35 UTC (permalink / raw)
To: Frank Li
Cc: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Krzysztof Wilczyński,
Kishon Vijay Abraham I, Bjorn Helgaas, Christoph Hellwig,
dmaengine, linux-kernel, linux-hardening, linux-pci, linux-nvme,
Damien Le Moal, imx
Hello Frank,
On Fri, Jan 09, 2026 at 03:13:24PM -0500, Frank Li wrote:
Subject: dmaengine: dw-edma: support dynamtic add link entry during dma engine running
s/dynamtic/dynamic/
Also in patch 1/5:
s/dymatic/dynamic/
> Patch depend on
> https://lore.kernel.org/imx/20260109-edma_ll-v2-0-5c0b27b2c664@nxp.com/T/#t
To make it easier for the reader, please include the full list of
dependencies, i.e. also include:
https://lore.kernel.org/dmaengine/20260105-dma_prep_config-v3-0-a8480362fd42@nxp.com/
here.
>
> Only test eDMA, have not tested HDMA.
> Corn case have not tested, such as pause/resume transfer.
s/Corn case/Corner cases/
>
> Before
>
> Rnd read, 4KB, QD=1, 1 job : IOPS=6780, BW=26.5MiB/s (27.8MB/s)
> Rnd read, 4KB, QD=32, 1 job : IOPS=28.6k, BW=112MiB/s (117MB/s)
> Rnd read, 4KB, QD=32, 4 jobs: IOPS=33.4k, BW=130MiB/s (137MB/s)
> Rnd read, 128KB, QD=1, 1 job : IOPS=1188, BW=149MiB/s (156MB/s)
> Rnd read, 128KB, QD=32, 1 job : IOPS=1440, BW=180MiB/s (189MB/s)
> Rnd read, 128KB, QD=32, 4 jobs: IOPS=1282, BW=160MiB/s (168MB/s)
> Rnd read, 512KB, QD=1, 1 job : IOPS=254, BW=127MiB/s (134MB/s)
> Rnd read, 512KB, QD=32, 1 job : IOPS=354, BW=177MiB/s (186MB/s)
> Rnd read, 512KB, QD=32, 4 jobs: IOPS=388, BW=194MiB/s (204MB/s)
> Rnd write, 4KB, QD=1, 1 job : IOPS=6282, BW=24.5MiB/s (25.7MB/s)
> Rnd write, 4KB, QD=32, 1 job : IOPS=24.9k, BW=97.5MiB/s (102MB/s)
> Rnd write, 4KB, QD=32, 4 jobs: IOPS=27.4k, BW=107MiB/s (112MB/s)
> Rnd write, 128KB, QD=1, 1 job : IOPS=1098, BW=137MiB/s (144MB/s)
> Rnd write, 128KB, QD=32, 1 job : IOPS=1195, BW=149MiB/s (157MB/s)
> Rnd write, 128KB, QD=32, 4 jobs: IOPS=1120, BW=140MiB/s (147MB/s)
> Seq read, 128KB, QD=1, 1 job : IOPS=936, BW=117MiB/s (123MB/s)
> Seq read, 128KB, QD=32, 1 job : IOPS=1218, BW=152MiB/s (160MB/s)
> Seq read, 512KB, QD=1, 1 job : IOPS=301, BW=151MiB/s (158MB/s)
> Seq read, 512KB, QD=32, 1 job : IOPS=360, BW=180MiB/s (189MB/s)
> Seq read, 1MB, QD=32, 1 job : IOPS=193, BW=194MiB/s (203MB/s)
> Seq write, 128KB, QD=1, 1 job : IOPS=796, BW=99.5MiB/s (104MB/s)
> Seq write, 128KB, QD=32, 1 job : IOPS=1019, BW=127MiB/s (134MB/s)
> Seq write, 512KB, QD=1, 1 job : IOPS=213, BW=107MiB/s (112MB/s)
> Seq write, 512KB, QD=32, 1 job : IOPS=273, BW=137MiB/s (143MB/s)
> Seq write, 1MB, QD=32, 1 job : IOPS=168, BW=168MiB/s (177MB/s)
> Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=255, BW=128MiB/s (134MB/s)
> IOPS=266, BW=135MiB/s (141MB/s)
>
> After
>
> Rnd read, 4KB, QD=1, 1 job : IOPS=6148, BW=24.0MiB/s (25.2MB/s)
> Rnd read, 4KB, QD=32, 1 job : IOPS=29.4k, BW=115MiB/s (121MB/s)
> Rnd read, 4KB, QD=32, 4 jobs: IOPS=38.8k, BW=151MiB/s (159MB/s)
> Rnd read, 128KB, QD=1, 1 job : IOPS=859, BW=107MiB/s (113MB/s)
> Rnd read, 128KB, QD=32, 1 job : IOPS=1504, BW=188MiB/s (197MB/s)
> Rnd read, 128KB, QD=32, 4 jobs: IOPS=1531, BW=191MiB/s (201MB/s)
> Rnd read, 512KB, QD=1, 1 job : IOPS=238, BW=119MiB/s (125MB/s)
> Rnd read, 512KB, QD=32, 1 job : IOPS=390, BW=195MiB/s (205MB/s)
> Rnd read, 512KB, QD=32, 4 jobs: IOPS=404, BW=202MiB/s (212MB/s)
> Rnd write, 4KB, QD=1, 1 job : IOPS=5801, BW=22.7MiB/s (23.8MB/s)
> Rnd write, 4KB, QD=32, 1 job : IOPS=24.7k, BW=96.6MiB/s (101MB/s)
> Rnd write, 4KB, QD=32, 4 jobs: IOPS=32.7k, BW=128MiB/s (134MB/s)
> Rnd write, 128KB, QD=1, 1 job : IOPS=744, BW=93.1MiB/s (97.6MB/s)
> Rnd write, 128KB, QD=32, 1 job : IOPS=1278, BW=160MiB/s (168MB/s)
> Rnd write, 128KB, QD=32, 4 jobs: IOPS=1278, BW=160MiB/s (168MB/s)
> Seq read, 128KB, QD=1, 1 job : IOPS=853, BW=107MiB/s (112MB/s)
> Seq read, 128KB, QD=32, 1 job : IOPS=1511, BW=189MiB/s (198MB/s)
> Seq read, 512KB, QD=1, 1 job : IOPS=240, BW=120MiB/s (126MB/s)
> Seq read, 512KB, QD=32, 1 job : IOPS=386, BW=193MiB/s (203MB/s)
> Seq read, 1MB, QD=32, 1 job : IOPS=200, BW=201MiB/s (211MB/s)
> Seq write, 128KB, QD=1, 1 job : IOPS=749, BW=93.7MiB/s (98.3MB/s)
> Seq write, 128KB, QD=32, 1 job : IOPS=1266, BW=158MiB/s (166MB/s)
> Seq write, 512KB, QD=1, 1 job : IOPS=198, BW=99.0MiB/s (104MB/s)
> Seq write, 512KB, QD=32, 1 job : IOPS=352, BW=176MiB/s (185MB/s)
> Seq write, 1MB, QD=32, 1 job : IOPS=184, BW=184MiB/s (193MB/s)
> Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=287, BW=145MiB/s (152MB/s)
> IOPS=299, BW=149MiB/s (156MB/s)
We can clearly see the improvement, but overall, your numbers are quite low.
What is the PCIe Gen + number of lanes you are using?
Are you running nvmet-pci-epf backed by a real drive or backed by null-blk?
(Having nvmet-pci-epf backed by null-blk is much better for benchmarking.)
I'm using nvmet-pci-epf backed by null-blk, with a PCIe Gen3 link with 4 lanes.
Applying only your dependencies, I get:
Rnd read, 4KB, QD=1, 1 job : IOPS=12.1k, BW=47.2MiB/s (49.5MB/s)
Rnd read, 4KB, QD=32, 1 job : IOPS=51.1k, BW=200MiB/s (209MB/s)
Rnd read, 4KB, QD=32, 4 jobs: IOPS=72.2k, BW=282MiB/s (296MB/s)
Rnd read, 128KB, QD=1, 1 job : IOPS=2922, BW=365MiB/s (383MB/s)
Rnd read, 128KB, QD=32, 1 job : IOPS=18.9k, BW=2368MiB/s (2483MB/s)
Rnd read, 128KB, QD=32, 4 jobs: IOPS=18.7k, BW=2334MiB/s (2447MB/s)
Rnd read, 512KB, QD=1, 1 job : IOPS=1867, BW=934MiB/s (979MB/s)
Rnd read, 512KB, QD=32, 1 job : IOPS=4738, BW=2369MiB/s (2484MB/s)
Rnd read, 512KB, QD=32, 4 jobs: IOPS=4675, BW=2338MiB/s (2451MB/s)
Rnd write, 4KB, QD=1, 1 job : IOPS=10.6k, BW=41.4MiB/s (43.5MB/s)
Rnd write, 4KB, QD=32, 1 job : IOPS=34.4k, BW=135MiB/s (141MB/s)
Rnd write, 4KB, QD=32, 4 jobs: IOPS=34.4k, BW=135MiB/s (141MB/s)
Rnd write, 128KB, QD=1, 1 job : IOPS=2624, BW=328MiB/s (344MB/s)
Rnd write, 128KB, QD=32, 1 job : IOPS=10.2k, BW=1277MiB/s (1339MB/s)
Rnd write, 128KB, QD=32, 4 jobs: IOPS=10.3k, BW=1282MiB/s (1344MB/s)
Seq read, 128KB, QD=1, 1 job : IOPS=3195, BW=399MiB/s (419MB/s)
Seq read, 128KB, QD=32, 1 job : IOPS=18.6k, BW=2321MiB/s (2434MB/s)
Seq read, 512KB, QD=1, 1 job : IOPS=2162, BW=1081MiB/s (1134MB/s)
Seq read, 512KB, QD=32, 1 job : IOPS=4727, BW=2364MiB/s (2479MB/s)
Seq read, 1MB, QD=32, 1 job : IOPS=2360, BW=2361MiB/s (2476MB/s)
Seq write, 128KB, QD=1, 1 job : IOPS=2997, BW=375MiB/s (393MB/s)
Seq write, 128KB, QD=32, 1 job : IOPS=10.2k, BW=1278MiB/s (1341MB/s)
Seq write, 512KB, QD=1, 1 job : IOPS=1434, BW=717MiB/s (752MB/s)
Seq write, 512KB, QD=32, 1 job : IOPS=2557, BW=1279MiB/s (1341MB/s)
Seq write, 1MB, QD=32, 1 job : IOPS=1276, BW=1276MiB/s (1338MB/s)
Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=2110, BW=1058MiB/s (1109MB/s)
IOPS=2127, BW=1068MiB/s (1120MB/s)
Applying your dependencies + this series, I get:
Rnd read, 4KB, QD=1, 1 job : IOPS=12.5k, BW=48.7MiB/s (51.0MB/s)
Rnd read, 4KB, QD=32, 1 job : IOPS=55.3k, BW=216MiB/s (226MB/s)
Rnd read, 4KB, QD=32, 4 jobs: IOPS=175k, BW=682MiB/s (715MB/s)
Rnd read, 128KB, QD=1, 1 job : IOPS=3018, BW=377MiB/s (396MB/s)
Rnd read, 128KB, QD=32, 1 job : IOPS=20.1k, BW=2519MiB/s (2641MB/s)
Rnd read, 128KB, QD=32, 4 jobs: IOPS=24.4k, BW=3051MiB/s (3199MB/s)
Rnd read, 512KB, QD=1, 1 job : IOPS=1850, BW=925MiB/s (970MB/s)
Rnd read, 512KB, QD=32, 1 job : IOPS=5846, BW=2923MiB/s (3065MB/s)
Rnd read, 512KB, QD=32, 4 jobs: IOPS=6141, BW=3071MiB/s (3220MB/s)
Rnd write, 4KB, QD=1, 1 job : IOPS=11.6k, BW=45.4MiB/s (47.6MB/s)
Rnd write, 4KB, QD=32, 1 job : IOPS=49.6k, BW=194MiB/s (203MB/s)
Rnd write, 4KB, QD=32, 4 jobs: IOPS=82.0k, BW=320MiB/s (336MB/s)
Rnd write, 128KB, QD=1, 1 job : IOPS=3051, BW=381MiB/s (400MB/s)
Rnd write, 128KB, QD=32, 1 job : IOPS=13.0k, BW=1619MiB/s (1698MB/s)
Rnd write, 128KB, QD=32, 4 jobs: IOPS=12.5k, BW=1559MiB/s (1635MB/s)
Seq read, 128KB, QD=1, 1 job : IOPS=3445, BW=431MiB/s (452MB/s)
Seq read, 128KB, QD=32, 1 job : IOPS=18.3k, BW=2283MiB/s (2394MB/s)
Seq read, 512KB, QD=1, 1 job : IOPS=2048, BW=1024MiB/s (1074MB/s)
Seq read, 512KB, QD=32, 1 job : IOPS=5766, BW=2883MiB/s (3023MB/s)
Seq read, 1MB, QD=32, 1 job : IOPS=3038, BW=3038MiB/s (3186MB/s)
Seq write, 128KB, QD=1, 1 job : IOPS=2961, BW=370MiB/s (388MB/s)
Seq write, 128KB, QD=32, 1 job : IOPS=12.3k, BW=1535MiB/s (1609MB/s)
Seq write, 512KB, QD=1, 1 job : IOPS=1482, BW=741MiB/s (777MB/s)
Seq write, 512KB, QD=32, 1 job : IOPS=3144, BW=1572MiB/s (1648MB/s)
Seq write, 1MB, QD=32, 1 job : IOPS=1549, BW=1550MiB/s (1625MB/s)
Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=2596, BW=1303MiB/s (1366MB/s)
IOPS=2617, BW=1313MiB/s (1377MB/s)
So I can clearly see an improvement with this patch series.
Great work so far!
Kind regards,
Niklas
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running
2026-01-12 13:35 ` [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running Niklas Cassel
@ 2026-01-12 14:54 ` Frank Li
0 siblings, 0 replies; 14+ messages in thread
From: Frank Li @ 2026-01-12 14:54 UTC (permalink / raw)
To: Niklas Cassel
Cc: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Krzysztof Wilczyński,
Kishon Vijay Abraham I, Bjorn Helgaas, Christoph Hellwig,
dmaengine, linux-kernel, linux-hardening, linux-pci, linux-nvme,
Damien Le Moal, imx
On Mon, Jan 12, 2026 at 02:35:35PM +0100, Niklas Cassel wrote:
> Hello Frank,
>
> On Fri, Jan 09, 2026 at 03:13:24PM -0500, Frank Li wrote:
>
> Subject: dmaengine: dw-edma: support dynamtic add link entry during dma engine running
>
> s/dynamtic/dynamic/
>
> Also in patch 1/5:
> s/dymatic/dynamic/
>
>
> > Patch depend on
> > https://lore.kernel.org/imx/20260109-edma_ll-v2-0-5c0b27b2c664@nxp.com/T/#t
>
> To make it easier for the reader, please include the full list of
> dependencies, i.e. also include:
> https://lore.kernel.org/dmaengine/20260105-dma_prep_config-v3-0-a8480362fd42@nxp.com/
> here.
>
>
> >
> > Only test eDMA, have not tested HDMA.
> > Corn case have not tested, such as pause/resume transfer.
>
> s/Corn case/Corner cases/
>
Sorry for the typos. I need to check my tools to find out why they did not detect the typos in the cover
letter.
Frank
>
> >
> > Before
> >
> > Rnd read, 4KB, QD=1, 1 job : IOPS=6780, BW=26.5MiB/s (27.8MB/s)
> > Rnd read, 4KB, QD=32, 1 job : IOPS=28.6k, BW=112MiB/s (117MB/s)
> > Rnd read, 4KB, QD=32, 4 jobs: IOPS=33.4k, BW=130MiB/s (137MB/s)
> > Rnd read, 128KB, QD=1, 1 job : IOPS=1188, BW=149MiB/s (156MB/s)
> > Rnd read, 128KB, QD=32, 1 job : IOPS=1440, BW=180MiB/s (189MB/s)
> > Rnd read, 128KB, QD=32, 4 jobs: IOPS=1282, BW=160MiB/s (168MB/s)
> > Rnd read, 512KB, QD=1, 1 job : IOPS=254, BW=127MiB/s (134MB/s)
> > Rnd read, 512KB, QD=32, 1 job : IOPS=354, BW=177MiB/s (186MB/s)
> > Rnd read, 512KB, QD=32, 4 jobs: IOPS=388, BW=194MiB/s (204MB/s)
> > Rnd write, 4KB, QD=1, 1 job : IOPS=6282, BW=24.5MiB/s (25.7MB/s)
> > Rnd write, 4KB, QD=32, 1 job : IOPS=24.9k, BW=97.5MiB/s (102MB/s)
> > Rnd write, 4KB, QD=32, 4 jobs: IOPS=27.4k, BW=107MiB/s (112MB/s)
> > Rnd write, 128KB, QD=1, 1 job : IOPS=1098, BW=137MiB/s (144MB/s)
> > Rnd write, 128KB, QD=32, 1 job : IOPS=1195, BW=149MiB/s (157MB/s)
> > Rnd write, 128KB, QD=32, 4 jobs: IOPS=1120, BW=140MiB/s (147MB/s)
> > Seq read, 128KB, QD=1, 1 job : IOPS=936, BW=117MiB/s (123MB/s)
> > Seq read, 128KB, QD=32, 1 job : IOPS=1218, BW=152MiB/s (160MB/s)
> > Seq read, 512KB, QD=1, 1 job : IOPS=301, BW=151MiB/s (158MB/s)
> > Seq read, 512KB, QD=32, 1 job : IOPS=360, BW=180MiB/s (189MB/s)
> > Seq read, 1MB, QD=32, 1 job : IOPS=193, BW=194MiB/s (203MB/s)
> > Seq write, 128KB, QD=1, 1 job : IOPS=796, BW=99.5MiB/s (104MB/s)
> > Seq write, 128KB, QD=32, 1 job : IOPS=1019, BW=127MiB/s (134MB/s)
> > Seq write, 512KB, QD=1, 1 job : IOPS=213, BW=107MiB/s (112MB/s)
> > Seq write, 512KB, QD=32, 1 job : IOPS=273, BW=137MiB/s (143MB/s)
> > Seq write, 1MB, QD=32, 1 job : IOPS=168, BW=168MiB/s (177MB/s)
> > Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=255, BW=128MiB/s (134MB/s)
> > IOPS=266, BW=135MiB/s (141MB/s)
> >
> > After
> >
> > Rnd read, 4KB, QD=1, 1 job : IOPS=6148, BW=24.0MiB/s (25.2MB/s)
> > Rnd read, 4KB, QD=32, 1 job : IOPS=29.4k, BW=115MiB/s (121MB/s)
> > Rnd read, 4KB, QD=32, 4 jobs: IOPS=38.8k, BW=151MiB/s (159MB/s)
> > Rnd read, 128KB, QD=1, 1 job : IOPS=859, BW=107MiB/s (113MB/s)
> > Rnd read, 128KB, QD=32, 1 job : IOPS=1504, BW=188MiB/s (197MB/s)
> > Rnd read, 128KB, QD=32, 4 jobs: IOPS=1531, BW=191MiB/s (201MB/s)
> > Rnd read, 512KB, QD=1, 1 job : IOPS=238, BW=119MiB/s (125MB/s)
> > Rnd read, 512KB, QD=32, 1 job : IOPS=390, BW=195MiB/s (205MB/s)
> > Rnd read, 512KB, QD=32, 4 jobs: IOPS=404, BW=202MiB/s (212MB/s)
> > Rnd write, 4KB, QD=1, 1 job : IOPS=5801, BW=22.7MiB/s (23.8MB/s)
> > Rnd write, 4KB, QD=32, 1 job : IOPS=24.7k, BW=96.6MiB/s (101MB/s)
> > Rnd write, 4KB, QD=32, 4 jobs: IOPS=32.7k, BW=128MiB/s (134MB/s)
> > Rnd write, 128KB, QD=1, 1 job : IOPS=744, BW=93.1MiB/s (97.6MB/s)
> > Rnd write, 128KB, QD=32, 1 job : IOPS=1278, BW=160MiB/s (168MB/s)
> > Rnd write, 128KB, QD=32, 4 jobs: IOPS=1278, BW=160MiB/s (168MB/s)
> > Seq read, 128KB, QD=1, 1 job : IOPS=853, BW=107MiB/s (112MB/s)
> > Seq read, 128KB, QD=32, 1 job : IOPS=1511, BW=189MiB/s (198MB/s)
> > Seq read, 512KB, QD=1, 1 job : IOPS=240, BW=120MiB/s (126MB/s)
> > Seq read, 512KB, QD=32, 1 job : IOPS=386, BW=193MiB/s (203MB/s)
> > Seq read, 1MB, QD=32, 1 job : IOPS=200, BW=201MiB/s (211MB/s)
> > Seq write, 128KB, QD=1, 1 job : IOPS=749, BW=93.7MiB/s (98.3MB/s)
> > Seq write, 128KB, QD=32, 1 job : IOPS=1266, BW=158MiB/s (166MB/s)
> > Seq write, 512KB, QD=1, 1 job : IOPS=198, BW=99.0MiB/s (104MB/s)
> > Seq write, 512KB, QD=32, 1 job : IOPS=352, BW=176MiB/s (185MB/s)
> > Seq write, 1MB, QD=32, 1 job : IOPS=184, BW=184MiB/s (193MB/s)
> > Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=287, BW=145MiB/s (152MB/s)
> > IOPS=299, BW=149MiB/s (156MB/s)
>
> We can clearly see the improvement, but overall, your numbers are quite low.
> What is the PCIe Gen + number of lanes you are using?
> Are you running nvmet-pci-epf backed by a real drive or backed by null-blk?
> (Having nvmet-pci-epf backed by null-blk is much better for benchmarking.)
>
> I'm using nvmet-pci-epf backed by null-blk, with a PCIe Gen3 link with 4 lanes.
>
>
> Applying only your dependencies, I get:
>
> Rnd read, 4KB, QD=1, 1 job : IOPS=12.1k, BW=47.2MiB/s (49.5MB/s)
> Rnd read, 4KB, QD=32, 1 job : IOPS=51.1k, BW=200MiB/s (209MB/s)
> Rnd read, 4KB, QD=32, 4 jobs: IOPS=72.2k, BW=282MiB/s (296MB/s)
> Rnd read, 128KB, QD=1, 1 job : IOPS=2922, BW=365MiB/s (383MB/s)
> Rnd read, 128KB, QD=32, 1 job : IOPS=18.9k, BW=2368MiB/s (2483MB/s)
> Rnd read, 128KB, QD=32, 4 jobs: IOPS=18.7k, BW=2334MiB/s (2447MB/s)
> Rnd read, 512KB, QD=1, 1 job : IOPS=1867, BW=934MiB/s (979MB/s)
> Rnd read, 512KB, QD=32, 1 job : IOPS=4738, BW=2369MiB/s (2484MB/s)
> Rnd read, 512KB, QD=32, 4 jobs: IOPS=4675, BW=2338MiB/s (2451MB/s)
> Rnd write, 4KB, QD=1, 1 job : IOPS=10.6k, BW=41.4MiB/s (43.5MB/s)
> Rnd write, 4KB, QD=32, 1 job : IOPS=34.4k, BW=135MiB/s (141MB/s)
> Rnd write, 4KB, QD=32, 4 jobs: IOPS=34.4k, BW=135MiB/s (141MB/s)
> Rnd write, 128KB, QD=1, 1 job : IOPS=2624, BW=328MiB/s (344MB/s)
> Rnd write, 128KB, QD=32, 1 job : IOPS=10.2k, BW=1277MiB/s (1339MB/s)
> Rnd write, 128KB, QD=32, 4 jobs: IOPS=10.3k, BW=1282MiB/s (1344MB/s)
> Seq read, 128KB, QD=1, 1 job : IOPS=3195, BW=399MiB/s (419MB/s)
> Seq read, 128KB, QD=32, 1 job : IOPS=18.6k, BW=2321MiB/s (2434MB/s)
> Seq read, 512KB, QD=1, 1 job : IOPS=2162, BW=1081MiB/s (1134MB/s)
> Seq read, 512KB, QD=32, 1 job : IOPS=4727, BW=2364MiB/s (2479MB/s)
> Seq read, 1MB, QD=32, 1 job : IOPS=2360, BW=2361MiB/s (2476MB/s)
> Seq write, 128KB, QD=1, 1 job : IOPS=2997, BW=375MiB/s (393MB/s)
> Seq write, 128KB, QD=32, 1 job : IOPS=10.2k, BW=1278MiB/s (1341MB/s)
> Seq write, 512KB, QD=1, 1 job : IOPS=1434, BW=717MiB/s (752MB/s)
> Seq write, 512KB, QD=32, 1 job : IOPS=2557, BW=1279MiB/s (1341MB/s)
> Seq write, 1MB, QD=32, 1 job : IOPS=1276, BW=1276MiB/s (1338MB/s)
> Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=2110, BW=1058MiB/s (1109MB/s)
> IOPS=2127, BW=1068MiB/s (1120MB/s)
>
>
> Applying your dependencies + this series, I get:
>
> Rnd read, 4KB, QD=1, 1 job : IOPS=12.5k, BW=48.7MiB/s (51.0MB/s)
> Rnd read, 4KB, QD=32, 1 job : IOPS=55.3k, BW=216MiB/s (226MB/s)
> Rnd read, 4KB, QD=32, 4 jobs: IOPS=175k, BW=682MiB/s (715MB/s)
> Rnd read, 128KB, QD=1, 1 job : IOPS=3018, BW=377MiB/s (396MB/s)
> Rnd read, 128KB, QD=32, 1 job : IOPS=20.1k, BW=2519MiB/s (2641MB/s)
> Rnd read, 128KB, QD=32, 4 jobs: IOPS=24.4k, BW=3051MiB/s (3199MB/s)
> Rnd read, 512KB, QD=1, 1 job : IOPS=1850, BW=925MiB/s (970MB/s)
> Rnd read, 512KB, QD=32, 1 job : IOPS=5846, BW=2923MiB/s (3065MB/s)
> Rnd read, 512KB, QD=32, 4 jobs: IOPS=6141, BW=3071MiB/s (3220MB/s)
> Rnd write, 4KB, QD=1, 1 job : IOPS=11.6k, BW=45.4MiB/s (47.6MB/s)
> Rnd write, 4KB, QD=32, 1 job : IOPS=49.6k, BW=194MiB/s (203MB/s)
> Rnd write, 4KB, QD=32, 4 jobs: IOPS=82.0k, BW=320MiB/s (336MB/s)
> Rnd write, 128KB, QD=1, 1 job : IOPS=3051, BW=381MiB/s (400MB/s)
> Rnd write, 128KB, QD=32, 1 job : IOPS=13.0k, BW=1619MiB/s (1698MB/s)
> Rnd write, 128KB, QD=32, 4 jobs: IOPS=12.5k, BW=1559MiB/s (1635MB/s)
> Seq read, 128KB, QD=1, 1 job : IOPS=3445, BW=431MiB/s (452MB/s)
> Seq read, 128KB, QD=32, 1 job : IOPS=18.3k, BW=2283MiB/s (2394MB/s)
> Seq read, 512KB, QD=1, 1 job : IOPS=2048, BW=1024MiB/s (1074MB/s)
> Seq read, 512KB, QD=32, 1 job : IOPS=5766, BW=2883MiB/s (3023MB/s)
> Seq read, 1MB, QD=32, 1 job : IOPS=3038, BW=3038MiB/s (3186MB/s)
> Seq write, 128KB, QD=1, 1 job : IOPS=2961, BW=370MiB/s (388MB/s)
> Seq write, 128KB, QD=32, 1 job : IOPS=12.3k, BW=1535MiB/s (1609MB/s)
> Seq write, 512KB, QD=1, 1 job : IOPS=1482, BW=741MiB/s (777MB/s)
> Seq write, 512KB, QD=32, 1 job : IOPS=3144, BW=1572MiB/s (1648MB/s)
> Seq write, 1MB, QD=32, 1 job : IOPS=1549, BW=1550MiB/s (1625MB/s)
> Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=2596, BW=1303MiB/s (1366MB/s)
> IOPS=2617, BW=1313MiB/s (1377MB/s)
>
>
> So I can clearly see an improvement with this patch series.
> Great work so far!
>
>
> Kind regards,
> Niklas
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running
2026-01-09 20:13 [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running Frank Li
` (5 preceding siblings ...)
2026-01-12 13:35 ` [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running Niklas Cassel
@ 2026-06-04 7:08 ` Koichiro Den
2026-06-05 18:34 ` Frank Li
6 siblings, 1 reply; 14+ messages in thread
From: Koichiro Den @ 2026-06-04 7:08 UTC (permalink / raw)
To: Frank Li
Cc: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Krzysztof Wilczyński,
Kishon Vijay Abraham I, Bjorn Helgaas, Christoph Hellwig,
Niklas Cassel, dmaengine, linux-kernel, linux-hardening,
linux-pci, linux-nvme, Damien Le Moal, imx
On Fri, Jan 09, 2026 at 03:13:24PM -0500, Frank Li wrote:
> Patch depend on
> https://lore.kernel.org/imx/20260109-edma_ll-v2-0-5c0b27b2c664@nxp.com/T/#t
>
> Only test eDMA, have not tested HDMA.
Hi Frank,
I expect this series may be revisited in the near future, since the first
dependency series reached v7 and looks close to landing.
With the latest versions of the two dependencies:
- [PATCH v7 0/9] dmaengine: Add new API to combine configuration and descriptor preparation
https://lore.kernel.org/dmaengine/20260521-dma_prep_config-v7-0-1f73f4899883@nxp.com/
- [PATCH v2 00/11] dmaengine: dw-edma: flatten desc structions and simple code
https://lore.kernel.org/dmaengine/20260109-edma_ll-v2-0-5c0b27b2c664@nxp.com/
I tested this RFT series with the HDMA engine on a SpacemiT K3.
The test results are below, using the same format as your results:
Baseline, before applying the three series (v7 + v2 + this RFT)
Rnd read , 4KB, QD=1 , 1 job : IOPS=8567, BW=33.5MiB/s (35.1MB/s)
Rnd read , 4KB, QD=32, 1 job : IOPS=55.5k, BW=217MiB/s (227MB/s)
Rnd read , 4KB, QD=32, 4 jobs: IOPS=83.0k, BW=324MiB/s (340MB/s)
Rnd read , 128KB, QD=1 , 1 job : IOPS=3817, BW=477MiB/s (500MB/s)
Rnd read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1346MiB/s (1411MB/s)
Rnd read , 128KB, QD=32, 4 jobs: IOPS=11.2k, BW=1403MiB/s (1471MB/s)
Rnd read , 512KB, QD=1 , 1 job : IOPS=1515, BW=758MiB/s (794MB/s)
Rnd read , 512KB, QD=32, 1 job : IOPS=2795, BW=1399MiB/s (1467MB/s)
Rnd read , 512KB, QD=32, 4 jobs: IOPS=2795, BW=1404MiB/s (1472MB/s)
Rnd write, 4KB, QD=1 , 1 job : IOPS=9035, BW=35.3MiB/s (37.0MB/s)
Rnd write, 4KB, QD=32, 1 job : IOPS=38.3k, BW=149MiB/s (157MB/s)
Rnd write, 4KB, QD=32, 4 jobs: IOPS=41.8k, BW=163MiB/s (171MB/s)
Rnd write, 128KB, QD=1 , 1 job : IOPS=3969, BW=496MiB/s (520MB/s)
Rnd write, 128KB, QD=32, 1 job : IOPS=8260, BW=1033MiB/s (1083MB/s)
Rnd write, 128KB, QD=32, 4 jobs: IOPS=8295, BW=1038MiB/s (1089MB/s)
Seq read , 128KB, QD=1 , 1 job : IOPS=4609, BW=576MiB/s (604MB/s)
Seq read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1345MiB/s (1410MB/s)
Seq read , 512KB, QD=1 , 1 job : IOPS=1524, BW=762MiB/s (799MB/s)
Seq read , 512KB, QD=32, 1 job : IOPS=2799, BW=1401MiB/s (1469MB/s)
Seq read , 1MB, QD=32, 1 job : IOPS=1401, BW=1404MiB/s (1472MB/s)
Seq write, 128KB, QD=1 , 1 job : IOPS=3722, BW=465MiB/s (488MB/s)
Seq write, 128KB, QD=32, 1 job : IOPS=8246, BW=1031MiB/s (1081MB/s)
Seq write, 512KB, QD=1 , 1 job : IOPS=1283, BW=642MiB/s (673MB/s)
Seq write, 512KB, QD=32, 1 job : IOPS=2072, BW=1038MiB/s (1088MB/s)
Seq write, 1MB, QD=32, 1 job : IOPS=1037, BW=1040MiB/s (1091MB/s)
Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1540, BW=768MiB/s (805MB/s)
IOPS=1549, BW=768MiB/s (805MB/s)
After your three series (v7 + v2 + this)
Rnd read , 4KB, QD=1 , 1 job : IOPS=7216, BW=28.2MiB/s (29.6MB/s)
Rnd read , 4KB, QD=32, 1 job : IOPS=61.1k, BW=239MiB/s (250MB/s)
Rnd read , 4KB, QD=32, 4 jobs: IOPS=75.3k, BW=294MiB/s (309MB/s)
Rnd read , 128KB, QD=1 , 1 job : IOPS=4711, BW=589MiB/s (618MB/s)
Rnd read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1354MiB/s (1420MB/s)
Rnd read , 128KB, QD=32, 4 jobs: IOPS=11.2k, BW=1403MiB/s (1471MB/s)
Rnd read , 512KB, QD=1 , 1 job : IOPS=1497, BW=749MiB/s (785MB/s)
Rnd read , 512KB, QD=32, 1 job : IOPS=2802, BW=1403MiB/s (1471MB/s)
Rnd read , 512KB, QD=32, 4 jobs: IOPS=2798, BW=1405MiB/s (1474MB/s)
Rnd write, 4KB, QD=1 , 1 job : IOPS=7411, BW=29.0MiB/s (30.4MB/s)
Rnd write, 4KB, QD=32, 1 job : IOPS=39.3k, BW=153MiB/s (161MB/s)
Rnd write, 4KB, QD=32, 4 jobs: IOPS=42.9k, BW=167MiB/s (176MB/s)
Rnd write, 128KB, QD=1 , 1 job : IOPS=3736, BW=467MiB/s (490MB/s)
Rnd write, 128KB, QD=32, 1 job : IOPS=8302, BW=1038MiB/s (1089MB/s)
Rnd write, 128KB, QD=32, 4 jobs: IOPS=8314, BW=1041MiB/s (1091MB/s)
Seq read , 128KB, QD=1 , 1 job : IOPS=4092, BW=512MiB/s (536MB/s)
Seq read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1354MiB/s (1420MB/s)
Seq read , 512KB, QD=1 , 1 job : IOPS=1474, BW=737MiB/s (773MB/s)
Seq read , 512KB, QD=32, 1 job : IOPS=2794, BW=1399MiB/s (1467MB/s)
Seq read , 1MB, QD=32, 1 job : IOPS=1401, BW=1404MiB/s (1472MB/s)
Seq write, 128KB, QD=1 , 1 job : IOPS=4135, BW=517MiB/s (542MB/s)
Seq write, 128KB, QD=32, 1 job : IOPS=8307, BW=1039MiB/s (1089MB/s)
Seq write, 512KB, QD=1 , 1 job : IOPS=1259, BW=630MiB/s (660MB/s)
Seq write, 512KB, QD=32, 1 job : IOPS=2073, BW=1038MiB/s (1089MB/s)
Seq write, 1MB, QD=32, 1 job : IOPS=1034, BW=1038MiB/s (1088MB/s)
Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1531, BW=763MiB/s (801MB/s)
IOPS=1540, BW=765MiB/s (802MB/s)
On this HDMA setup, I did not observe a clear performance difference from
applying the three series alone. Still, I like the overall direction.
P.S.
Separately, as a follow-up experiment, I also prototyped an extra series on top
of your three series that allows us to make use of HDMA watermark interrupts.
With that series, in particular for the high queue-depth cases, the results
improved noticeably on this platform. I haven't posted that series yet though.
After your three series (v7 + v2 + this) + use of HDMA watermark interrupts
Rnd read , 4KB, QD=1 , 1 job : IOPS=8016, BW=31.3MiB/s (32.8MB/s)
Rnd read , 4KB, QD=32, 1 job : IOPS=63.4k, BW=248MiB/s (260MB/s)
Rnd read , 4KB, QD=32, 4 jobs: IOPS=92.7k, BW=362MiB/s (380MB/s)
Rnd read , 128KB, QD=1 , 1 job : IOPS=3530, BW=441MiB/s (463MB/s)
Rnd read , 128KB, QD=32, 1 job : IOPS=12.0k, BW=1500MiB/s (1573MB/s)
Rnd read , 128KB, QD=32, 4 jobs: IOPS=12.4k, BW=1555MiB/s (1631MB/s)
Rnd read , 512KB, QD=1 , 1 job : IOPS=1541, BW=771MiB/s (808MB/s)
Rnd read , 512KB, QD=32, 1 job : IOPS=3116, BW=1560MiB/s (1636MB/s)
Rnd read , 512KB, QD=32, 4 jobs: IOPS=3099, BW=1556MiB/s (1632MB/s)
Rnd write, 4KB, QD=1 , 1 job : IOPS=8748, BW=34.2MiB/s (35.8MB/s)
Rnd write, 4KB, QD=32, 1 job : IOPS=57.6k, BW=225MiB/s (236MB/s)
Rnd write, 4KB, QD=32, 4 jobs: IOPS=80.3k, BW=314MiB/s (329MB/s)
Rnd write, 128KB, QD=1 , 1 job : IOPS=3878, BW=485MiB/s (508MB/s)
Rnd write, 128KB, QD=32, 1 job : IOPS=9798, BW=1225MiB/s (1285MB/s)
Rnd write, 128KB, QD=32, 4 jobs: IOPS=9970, BW=1248MiB/s (1308MB/s)
Seq read , 128KB, QD=1 , 1 job : IOPS=4516, BW=565MiB/s (592MB/s)
Seq read , 128KB, QD=32, 1 job : IOPS=12.0k, BW=1497MiB/s (1570MB/s)
Seq read , 512KB, QD=1 , 1 job : IOPS=1571, BW=786MiB/s (824MB/s)
Seq read , 512KB, QD=32, 1 job : IOPS=3073, BW=1538MiB/s (1613MB/s)
Seq read , 1MB, QD=32, 1 job : IOPS=1573, BW=1576MiB/s (1653MB/s)
Seq write, 128KB, QD=1 , 1 job : IOPS=3977, BW=497MiB/s (521MB/s)
Seq write, 128KB, QD=32, 1 job : IOPS=9806, BW=1226MiB/s (1286MB/s)
Seq write, 512KB, QD=1 , 1 job : IOPS=1404, BW=702MiB/s (736MB/s)
Seq write, 512KB, QD=32, 1 job : IOPS=2496, BW=1250MiB/s (1310MB/s)
Seq write, 1MB, QD=32, 1 job : IOPS=1252, BW=1256MiB/s (1317MB/s)
Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1682, BW=836MiB/s (877MB/s)
IOPS=1688, BW=838MiB/s (879MB/s)
Best regards,
Koichiro
> Corn case have not tested, such as pause/resume transfer.
>
> Before
>
> Rnd read, 4KB, QD=1, 1 job : IOPS=6780, BW=26.5MiB/s (27.8MB/s)
> Rnd read, 4KB, QD=32, 1 job : IOPS=28.6k, BW=112MiB/s (117MB/s)
> Rnd read, 4KB, QD=32, 4 jobs: IOPS=33.4k, BW=130MiB/s (137MB/s)
> Rnd read, 128KB, QD=1, 1 job : IOPS=1188, BW=149MiB/s (156MB/s)
> Rnd read, 128KB, QD=32, 1 job : IOPS=1440, BW=180MiB/s (189MB/s)
> Rnd read, 128KB, QD=32, 4 jobs: IOPS=1282, BW=160MiB/s (168MB/s)
> Rnd read, 512KB, QD=1, 1 job : IOPS=254, BW=127MiB/s (134MB/s)
> Rnd read, 512KB, QD=32, 1 job : IOPS=354, BW=177MiB/s (186MB/s)
> Rnd read, 512KB, QD=32, 4 jobs: IOPS=388, BW=194MiB/s (204MB/s)
> Rnd write, 4KB, QD=1, 1 job : IOPS=6282, BW=24.5MiB/s (25.7MB/s)
> Rnd write, 4KB, QD=32, 1 job : IOPS=24.9k, BW=97.5MiB/s (102MB/s)
> Rnd write, 4KB, QD=32, 4 jobs: IOPS=27.4k, BW=107MiB/s (112MB/s)
> Rnd write, 128KB, QD=1, 1 job : IOPS=1098, BW=137MiB/s (144MB/s)
> Rnd write, 128KB, QD=32, 1 job : IOPS=1195, BW=149MiB/s (157MB/s)
> Rnd write, 128KB, QD=32, 4 jobs: IOPS=1120, BW=140MiB/s (147MB/s)
> Seq read, 128KB, QD=1, 1 job : IOPS=936, BW=117MiB/s (123MB/s)
> Seq read, 128KB, QD=32, 1 job : IOPS=1218, BW=152MiB/s (160MB/s)
> Seq read, 512KB, QD=1, 1 job : IOPS=301, BW=151MiB/s (158MB/s)
> Seq read, 512KB, QD=32, 1 job : IOPS=360, BW=180MiB/s (189MB/s)
> Seq read, 1MB, QD=32, 1 job : IOPS=193, BW=194MiB/s (203MB/s)
> Seq write, 128KB, QD=1, 1 job : IOPS=796, BW=99.5MiB/s (104MB/s)
> Seq write, 128KB, QD=32, 1 job : IOPS=1019, BW=127MiB/s (134MB/s)
> Seq write, 512KB, QD=1, 1 job : IOPS=213, BW=107MiB/s (112MB/s)
> Seq write, 512KB, QD=32, 1 job : IOPS=273, BW=137MiB/s (143MB/s)
> Seq write, 1MB, QD=32, 1 job : IOPS=168, BW=168MiB/s (177MB/s)
> Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=255, BW=128MiB/s (134MB/s)
> IOPS=266, BW=135MiB/s (141MB/s)
>
> After
>
> Rnd read, 4KB, QD=1, 1 job : IOPS=6148, BW=24.0MiB/s (25.2MB/s)
> Rnd read, 4KB, QD=32, 1 job : IOPS=29.4k, BW=115MiB/s (121MB/s)
> Rnd read, 4KB, QD=32, 4 jobs: IOPS=38.8k, BW=151MiB/s (159MB/s)
> Rnd read, 128KB, QD=1, 1 job : IOPS=859, BW=107MiB/s (113MB/s)
> Rnd read, 128KB, QD=32, 1 job : IOPS=1504, BW=188MiB/s (197MB/s)
> Rnd read, 128KB, QD=32, 4 jobs: IOPS=1531, BW=191MiB/s (201MB/s)
> Rnd read, 512KB, QD=1, 1 job : IOPS=238, BW=119MiB/s (125MB/s)
> Rnd read, 512KB, QD=32, 1 job : IOPS=390, BW=195MiB/s (205MB/s)
> Rnd read, 512KB, QD=32, 4 jobs: IOPS=404, BW=202MiB/s (212MB/s)
> Rnd write, 4KB, QD=1, 1 job : IOPS=5801, BW=22.7MiB/s (23.8MB/s)
> Rnd write, 4KB, QD=32, 1 job : IOPS=24.7k, BW=96.6MiB/s (101MB/s)
> Rnd write, 4KB, QD=32, 4 jobs: IOPS=32.7k, BW=128MiB/s (134MB/s)
> Rnd write, 128KB, QD=1, 1 job : IOPS=744, BW=93.1MiB/s (97.6MB/s)
> Rnd write, 128KB, QD=32, 1 job : IOPS=1278, BW=160MiB/s (168MB/s)
> Rnd write, 128KB, QD=32, 4 jobs: IOPS=1278, BW=160MiB/s (168MB/s)
> Seq read, 128KB, QD=1, 1 job : IOPS=853, BW=107MiB/s (112MB/s)
> Seq read, 128KB, QD=32, 1 job : IOPS=1511, BW=189MiB/s (198MB/s)
> Seq read, 512KB, QD=1, 1 job : IOPS=240, BW=120MiB/s (126MB/s)
> Seq read, 512KB, QD=32, 1 job : IOPS=386, BW=193MiB/s (203MB/s)
> Seq read, 1MB, QD=32, 1 job : IOPS=200, BW=201MiB/s (211MB/s)
> Seq write, 128KB, QD=1, 1 job : IOPS=749, BW=93.7MiB/s (98.3MB/s)
> Seq write, 128KB, QD=32, 1 job : IOPS=1266, BW=158MiB/s (166MB/s)
> Seq write, 512KB, QD=1, 1 job : IOPS=198, BW=99.0MiB/s (104MB/s)
> Seq write, 512KB, QD=32, 1 job : IOPS=352, BW=176MiB/s (185MB/s)
> Seq write, 1MB, QD=32, 1 job : IOPS=184, BW=184MiB/s (193MB/s)
> Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=287, BW=145MiB/s (152MB/s)
> IOPS=299, BW=149MiB/s (156MB/s)
>
> Signed-off-by: Frank Li <Frank.Li@nxp.com>
> ---
> Frank Li (5):
> dmaengine: dw-edma: Add dw_edma_core_ll_cur_idx() to get completed link entry pos
> dmaengine: dw-edma: Move dw_hdma_set_callback_result() up
> dmaengine: dw-edma: Make DMA link list work as a circular buffer
> dmaengine: dw-edma: Dynamitc append new request during dmaengine running
> dmaengine: dw-edma: Add trace support
>
> drivers/dma/dw-edma/Makefile | 3 +
> drivers/dma/dw-edma/dw-edma-core.c | 215 ++++++++++++++++++++++++----------
> drivers/dma/dw-edma/dw-edma-core.h | 42 ++++++-
> drivers/dma/dw-edma/dw-edma-trace.c | 4 +
> drivers/dma/dw-edma/dw-edma-trace.h | 150 ++++++++++++++++++++++++
> drivers/dma/dw-edma/dw-edma-v0-core.c | 39 +++++-
> drivers/dma/dw-edma/dw-hdma-v0-core.c | 17 +++
> 7 files changed, 409 insertions(+), 61 deletions(-)
> ---
> base-commit: 020f6d8442f35105660a29d0d236d3f8650c8142
> change-id: 20251212-edma_dymatic-a57843ff0dfe
>
> Best regards,
> --
> Frank Li <Frank.Li@nxp.com>
>
^ permalink raw reply [flat|nested] 14+ messages in thread* Re: [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running
2026-06-04 7:08 ` Koichiro Den
@ 2026-06-05 18:34 ` Frank Li
2026-06-09 6:23 ` Koichiro Den
0 siblings, 1 reply; 14+ messages in thread
From: Frank Li @ 2026-06-05 18:34 UTC (permalink / raw)
To: Koichiro Den
Cc: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Krzysztof Wilczyński,
Kishon Vijay Abraham I, Bjorn Helgaas, Christoph Hellwig,
Niklas Cassel, dmaengine, linux-kernel, linux-hardening,
linux-pci, linux-nvme, Damien Le Moal, imx
On Thu, Jun 04, 2026 at 04:08:06PM +0900, Koichiro Den wrote:
> On Fri, Jan 09, 2026 at 03:13:24PM -0500, Frank Li wrote:
> > Patch depend on
> > https://lore.kernel.org/imx/20260109-edma_ll-v2-0-5c0b27b2c664@nxp.com/T/#t
> >
> > Only test eDMA, have not tested HDMA.
>
> Hi Frank,
>
> I expect this series may be revisited in the near future, since the first
> dependency series reached v7 and looks close to landing.
>
> With the latest versions of the two dependencies:
> - [PATCH v7 0/9] dmaengine: Add new API to combine configuration and descriptor preparation
> https://lore.kernel.org/dmaengine/20260521-dma_prep_config-v7-0-1f73f4899883@nxp.com/
> - [PATCH v2 00/11] dmaengine: dw-edma: flatten desc structions and simple code
> https://lore.kernel.org/dmaengine/20260109-edma_ll-v2-0-5c0b27b2c664@nxp.com/
>
> I tested this RFT series with the HDMA engine on a SpacemiT K3.
> The test results are below, using the same format as your results:
>
> Baseline, before applying the three series (v7 + v2 + this RFT)
>
> Rnd read , 4KB, QD=1 , 1 job : IOPS=8567, BW=33.5MiB/s (35.1MB/s)
> Rnd read , 4KB, QD=32, 1 job : IOPS=55.5k, BW=217MiB/s (227MB/s)
> Rnd read , 4KB, QD=32, 4 jobs: IOPS=83.0k, BW=324MiB/s (340MB/s)
> Rnd read , 128KB, QD=1 , 1 job : IOPS=3817, BW=477MiB/s (500MB/s)
> Rnd read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1346MiB/s (1411MB/s)
> Rnd read , 128KB, QD=32, 4 jobs: IOPS=11.2k, BW=1403MiB/s (1471MB/s)
> Rnd read , 512KB, QD=1 , 1 job : IOPS=1515, BW=758MiB/s (794MB/s)
> Rnd read , 512KB, QD=32, 1 job : IOPS=2795, BW=1399MiB/s (1467MB/s)
> Rnd read , 512KB, QD=32, 4 jobs: IOPS=2795, BW=1404MiB/s (1472MB/s)
> Rnd write, 4KB, QD=1 , 1 job : IOPS=9035, BW=35.3MiB/s (37.0MB/s)
> Rnd write, 4KB, QD=32, 1 job : IOPS=38.3k, BW=149MiB/s (157MB/s)
> Rnd write, 4KB, QD=32, 4 jobs: IOPS=41.8k, BW=163MiB/s (171MB/s)
> Rnd write, 128KB, QD=1 , 1 job : IOPS=3969, BW=496MiB/s (520MB/s)
> Rnd write, 128KB, QD=32, 1 job : IOPS=8260, BW=1033MiB/s (1083MB/s)
> Rnd write, 128KB, QD=32, 4 jobs: IOPS=8295, BW=1038MiB/s (1089MB/s)
> Seq read , 128KB, QD=1 , 1 job : IOPS=4609, BW=576MiB/s (604MB/s)
> Seq read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1345MiB/s (1410MB/s)
> Seq read , 512KB, QD=1 , 1 job : IOPS=1524, BW=762MiB/s (799MB/s)
> Seq read , 512KB, QD=32, 1 job : IOPS=2799, BW=1401MiB/s (1469MB/s)
> Seq read , 1MB, QD=32, 1 job : IOPS=1401, BW=1404MiB/s (1472MB/s)
> Seq write, 128KB, QD=1 , 1 job : IOPS=3722, BW=465MiB/s (488MB/s)
> Seq write, 128KB, QD=32, 1 job : IOPS=8246, BW=1031MiB/s (1081MB/s)
> Seq write, 512KB, QD=1 , 1 job : IOPS=1283, BW=642MiB/s (673MB/s)
> Seq write, 512KB, QD=32, 1 job : IOPS=2072, BW=1038MiB/s (1088MB/s)
> Seq write, 1MB, QD=32, 1 job : IOPS=1037, BW=1040MiB/s (1091MB/s)
> Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1540, BW=768MiB/s (805MB/s)
> IOPS=1549, BW=768MiB/s (805MB/s)
>
> After your three series (v7 + v2 + this)
>
> Rnd read , 4KB, QD=1 , 1 job : IOPS=7216, BW=28.2MiB/s (29.6MB/s)
> Rnd read , 4KB, QD=32, 1 job : IOPS=61.1k, BW=239MiB/s (250MB/s)
> Rnd read , 4KB, QD=32, 4 jobs: IOPS=75.3k, BW=294MiB/s (309MB/s)
> Rnd read , 128KB, QD=1 , 1 job : IOPS=4711, BW=589MiB/s (618MB/s)
> Rnd read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1354MiB/s (1420MB/s)
> Rnd read , 128KB, QD=32, 4 jobs: IOPS=11.2k, BW=1403MiB/s (1471MB/s)
> Rnd read , 512KB, QD=1 , 1 job : IOPS=1497, BW=749MiB/s (785MB/s)
> Rnd read , 512KB, QD=32, 1 job : IOPS=2802, BW=1403MiB/s (1471MB/s)
> Rnd read , 512KB, QD=32, 4 jobs: IOPS=2798, BW=1405MiB/s (1474MB/s)
> Rnd write, 4KB, QD=1 , 1 job : IOPS=7411, BW=29.0MiB/s (30.4MB/s)
> Rnd write, 4KB, QD=32, 1 job : IOPS=39.3k, BW=153MiB/s (161MB/s)
> Rnd write, 4KB, QD=32, 4 jobs: IOPS=42.9k, BW=167MiB/s (176MB/s)
> Rnd write, 128KB, QD=1 , 1 job : IOPS=3736, BW=467MiB/s (490MB/s)
> Rnd write, 128KB, QD=32, 1 job : IOPS=8302, BW=1038MiB/s (1089MB/s)
> Rnd write, 128KB, QD=32, 4 jobs: IOPS=8314, BW=1041MiB/s (1091MB/s)
> Seq read , 128KB, QD=1 , 1 job : IOPS=4092, BW=512MiB/s (536MB/s)
> Seq read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1354MiB/s (1420MB/s)
> Seq read , 512KB, QD=1 , 1 job : IOPS=1474, BW=737MiB/s (773MB/s)
> Seq read , 512KB, QD=32, 1 job : IOPS=2794, BW=1399MiB/s (1467MB/s)
> Seq read , 1MB, QD=32, 1 job : IOPS=1401, BW=1404MiB/s (1472MB/s)
> Seq write, 128KB, QD=1 , 1 job : IOPS=4135, BW=517MiB/s (542MB/s)
> Seq write, 128KB, QD=32, 1 job : IOPS=8307, BW=1039MiB/s (1089MB/s)
> Seq write, 512KB, QD=1 , 1 job : IOPS=1259, BW=630MiB/s (660MB/s)
> Seq write, 512KB, QD=32, 1 job : IOPS=2073, BW=1038MiB/s (1089MB/s)
> Seq write, 1MB, QD=32, 1 job : IOPS=1034, BW=1038MiB/s (1088MB/s)
> Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1531, BW=763MiB/s (801MB/s)
> IOPS=1540, BW=765MiB/s (802MB/s)
>
> On this HDMA setup, I did not observe a clear performance difference from
> applying the three series alone. Still, I like the overall direction.
>
>
> P.S.
> Separately, as a follow-up experiment, I also prototyped an extra series on top
> of your three series that allows us to make use of HDMA watermark interrupts.
> With that series, in particular for the high queue-depth cases, the results
> improved noticeably on this platform. I haven't posted that series yet though.
Thanks for testing it. I am monitoring the prerequisite patch sets mentioned above.
Frank
>
> After your three series (v7 + v2 + this) + use of HDMA watermark interrupts
>
> Rnd read , 4KB, QD=1 , 1 job : IOPS=8016, BW=31.3MiB/s (32.8MB/s)
> Rnd read , 4KB, QD=32, 1 job : IOPS=63.4k, BW=248MiB/s (260MB/s)
> Rnd read , 4KB, QD=32, 4 jobs: IOPS=92.7k, BW=362MiB/s (380MB/s)
> Rnd read , 128KB, QD=1 , 1 job : IOPS=3530, BW=441MiB/s (463MB/s)
> Rnd read , 128KB, QD=32, 1 job : IOPS=12.0k, BW=1500MiB/s (1573MB/s)
> Rnd read , 128KB, QD=32, 4 jobs: IOPS=12.4k, BW=1555MiB/s (1631MB/s)
> Rnd read , 512KB, QD=1 , 1 job : IOPS=1541, BW=771MiB/s (808MB/s)
> Rnd read , 512KB, QD=32, 1 job : IOPS=3116, BW=1560MiB/s (1636MB/s)
> Rnd read , 512KB, QD=32, 4 jobs: IOPS=3099, BW=1556MiB/s (1632MB/s)
> Rnd write, 4KB, QD=1 , 1 job : IOPS=8748, BW=34.2MiB/s (35.8MB/s)
> Rnd write, 4KB, QD=32, 1 job : IOPS=57.6k, BW=225MiB/s (236MB/s)
> Rnd write, 4KB, QD=32, 4 jobs: IOPS=80.3k, BW=314MiB/s (329MB/s)
> Rnd write, 128KB, QD=1 , 1 job : IOPS=3878, BW=485MiB/s (508MB/s)
> Rnd write, 128KB, QD=32, 1 job : IOPS=9798, BW=1225MiB/s (1285MB/s)
> Rnd write, 128KB, QD=32, 4 jobs: IOPS=9970, BW=1248MiB/s (1308MB/s)
> Seq read , 128KB, QD=1 , 1 job : IOPS=4516, BW=565MiB/s (592MB/s)
> Seq read , 128KB, QD=32, 1 job : IOPS=12.0k, BW=1497MiB/s (1570MB/s)
> Seq read , 512KB, QD=1 , 1 job : IOPS=1571, BW=786MiB/s (824MB/s)
> Seq read , 512KB, QD=32, 1 job : IOPS=3073, BW=1538MiB/s (1613MB/s)
> Seq read , 1MB, QD=32, 1 job : IOPS=1573, BW=1576MiB/s (1653MB/s)
> Seq write, 128KB, QD=1 , 1 job : IOPS=3977, BW=497MiB/s (521MB/s)
> Seq write, 128KB, QD=32, 1 job : IOPS=9806, BW=1226MiB/s (1286MB/s)
> Seq write, 512KB, QD=1 , 1 job : IOPS=1404, BW=702MiB/s (736MB/s)
> Seq write, 512KB, QD=32, 1 job : IOPS=2496, BW=1250MiB/s (1310MB/s)
> Seq write, 1MB, QD=32, 1 job : IOPS=1252, BW=1256MiB/s (1317MB/s)
> Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1682, BW=836MiB/s (877MB/s)
> IOPS=1688, BW=838MiB/s (879MB/s)
>
> Best regards,
> Koichiro
>
> > Corn case have not tested, such as pause/resume transfer.
> >
> > Before
> >
> > Rnd read, 4KB, QD=1, 1 job : IOPS=6780, BW=26.5MiB/s (27.8MB/s)
> > Rnd read, 4KB, QD=32, 1 job : IOPS=28.6k, BW=112MiB/s (117MB/s)
> > Rnd read, 4KB, QD=32, 4 jobs: IOPS=33.4k, BW=130MiB/s (137MB/s)
> > Rnd read, 128KB, QD=1, 1 job : IOPS=1188, BW=149MiB/s (156MB/s)
> > Rnd read, 128KB, QD=32, 1 job : IOPS=1440, BW=180MiB/s (189MB/s)
> > Rnd read, 128KB, QD=32, 4 jobs: IOPS=1282, BW=160MiB/s (168MB/s)
> > Rnd read, 512KB, QD=1, 1 job : IOPS=254, BW=127MiB/s (134MB/s)
> > Rnd read, 512KB, QD=32, 1 job : IOPS=354, BW=177MiB/s (186MB/s)
> > Rnd read, 512KB, QD=32, 4 jobs: IOPS=388, BW=194MiB/s (204MB/s)
> > Rnd write, 4KB, QD=1, 1 job : IOPS=6282, BW=24.5MiB/s (25.7MB/s)
> > Rnd write, 4KB, QD=32, 1 job : IOPS=24.9k, BW=97.5MiB/s (102MB/s)
> > Rnd write, 4KB, QD=32, 4 jobs: IOPS=27.4k, BW=107MiB/s (112MB/s)
> > Rnd write, 128KB, QD=1, 1 job : IOPS=1098, BW=137MiB/s (144MB/s)
> > Rnd write, 128KB, QD=32, 1 job : IOPS=1195, BW=149MiB/s (157MB/s)
> > Rnd write, 128KB, QD=32, 4 jobs: IOPS=1120, BW=140MiB/s (147MB/s)
> > Seq read, 128KB, QD=1, 1 job : IOPS=936, BW=117MiB/s (123MB/s)
> > Seq read, 128KB, QD=32, 1 job : IOPS=1218, BW=152MiB/s (160MB/s)
> > Seq read, 512KB, QD=1, 1 job : IOPS=301, BW=151MiB/s (158MB/s)
> > Seq read, 512KB, QD=32, 1 job : IOPS=360, BW=180MiB/s (189MB/s)
> > Seq read, 1MB, QD=32, 1 job : IOPS=193, BW=194MiB/s (203MB/s)
> > Seq write, 128KB, QD=1, 1 job : IOPS=796, BW=99.5MiB/s (104MB/s)
> > Seq write, 128KB, QD=32, 1 job : IOPS=1019, BW=127MiB/s (134MB/s)
> > Seq write, 512KB, QD=1, 1 job : IOPS=213, BW=107MiB/s (112MB/s)
> > Seq write, 512KB, QD=32, 1 job : IOPS=273, BW=137MiB/s (143MB/s)
> > Seq write, 1MB, QD=32, 1 job : IOPS=168, BW=168MiB/s (177MB/s)
> > Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=255, BW=128MiB/s (134MB/s)
> > IOPS=266, BW=135MiB/s (141MB/s)
> >
> > After
> >
> > Rnd read, 4KB, QD=1, 1 job : IOPS=6148, BW=24.0MiB/s (25.2MB/s)
> > Rnd read, 4KB, QD=32, 1 job : IOPS=29.4k, BW=115MiB/s (121MB/s)
> > Rnd read, 4KB, QD=32, 4 jobs: IOPS=38.8k, BW=151MiB/s (159MB/s)
> > Rnd read, 128KB, QD=1, 1 job : IOPS=859, BW=107MiB/s (113MB/s)
> > Rnd read, 128KB, QD=32, 1 job : IOPS=1504, BW=188MiB/s (197MB/s)
> > Rnd read, 128KB, QD=32, 4 jobs: IOPS=1531, BW=191MiB/s (201MB/s)
> > Rnd read, 512KB, QD=1, 1 job : IOPS=238, BW=119MiB/s (125MB/s)
> > Rnd read, 512KB, QD=32, 1 job : IOPS=390, BW=195MiB/s (205MB/s)
> > Rnd read, 512KB, QD=32, 4 jobs: IOPS=404, BW=202MiB/s (212MB/s)
> > Rnd write, 4KB, QD=1, 1 job : IOPS=5801, BW=22.7MiB/s (23.8MB/s)
> > Rnd write, 4KB, QD=32, 1 job : IOPS=24.7k, BW=96.6MiB/s (101MB/s)
> > Rnd write, 4KB, QD=32, 4 jobs: IOPS=32.7k, BW=128MiB/s (134MB/s)
> > Rnd write, 128KB, QD=1, 1 job : IOPS=744, BW=93.1MiB/s (97.6MB/s)
> > Rnd write, 128KB, QD=32, 1 job : IOPS=1278, BW=160MiB/s (168MB/s)
> > Rnd write, 128KB, QD=32, 4 jobs: IOPS=1278, BW=160MiB/s (168MB/s)
> > Seq read, 128KB, QD=1, 1 job : IOPS=853, BW=107MiB/s (112MB/s)
> > Seq read, 128KB, QD=32, 1 job : IOPS=1511, BW=189MiB/s (198MB/s)
> > Seq read, 512KB, QD=1, 1 job : IOPS=240, BW=120MiB/s (126MB/s)
> > Seq read, 512KB, QD=32, 1 job : IOPS=386, BW=193MiB/s (203MB/s)
> > Seq read, 1MB, QD=32, 1 job : IOPS=200, BW=201MiB/s (211MB/s)
> > Seq write, 128KB, QD=1, 1 job : IOPS=749, BW=93.7MiB/s (98.3MB/s)
> > Seq write, 128KB, QD=32, 1 job : IOPS=1266, BW=158MiB/s (166MB/s)
> > Seq write, 512KB, QD=1, 1 job : IOPS=198, BW=99.0MiB/s (104MB/s)
> > Seq write, 512KB, QD=32, 1 job : IOPS=352, BW=176MiB/s (185MB/s)
> > Seq write, 1MB, QD=32, 1 job : IOPS=184, BW=184MiB/s (193MB/s)
> > Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=287, BW=145MiB/s (152MB/s)
> > IOPS=299, BW=149MiB/s (156MB/s)
> >
> > Signed-off-by: Frank Li <Frank.Li@nxp.com>
> > ---
> > Frank Li (5):
> > dmaengine: dw-edma: Add dw_edma_core_ll_cur_idx() to get completed link entry pos
> > dmaengine: dw-edma: Move dw_hdma_set_callback_result() up
> > dmaengine: dw-edma: Make DMA link list work as a circular buffer
> > dmaengine: dw-edma: Dynamitc append new request during dmaengine running
> > dmaengine: dw-edma: Add trace support
> >
> > drivers/dma/dw-edma/Makefile | 3 +
> > drivers/dma/dw-edma/dw-edma-core.c | 215 ++++++++++++++++++++++++----------
> > drivers/dma/dw-edma/dw-edma-core.h | 42 ++++++-
> > drivers/dma/dw-edma/dw-edma-trace.c | 4 +
> > drivers/dma/dw-edma/dw-edma-trace.h | 150 ++++++++++++++++++++++++
> > drivers/dma/dw-edma/dw-edma-v0-core.c | 39 +++++-
> > drivers/dma/dw-edma/dw-hdma-v0-core.c | 17 +++
> > 7 files changed, 409 insertions(+), 61 deletions(-)
> > ---
> > base-commit: 020f6d8442f35105660a29d0d236d3f8650c8142
> > change-id: 20251212-edma_dymatic-a57843ff0dfe
> >
> > Best regards,
> > --
> > Frank Li <Frank.Li@nxp.com>
> >
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running
2026-06-05 18:34 ` Frank Li
@ 2026-06-09 6:23 ` Koichiro Den
2026-06-09 15:12 ` Frank Li
0 siblings, 1 reply; 14+ messages in thread
From: Koichiro Den @ 2026-06-09 6:23 UTC (permalink / raw)
To: Frank Li
Cc: Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel, Kees Cook,
Gustavo A. R. Silva, Krzysztof Wilczyński,
Kishon Vijay Abraham I, Bjorn Helgaas, Christoph Hellwig,
Niklas Cassel, dmaengine, linux-kernel, linux-hardening,
linux-pci, linux-nvme, Damien Le Moal, imx
On Fri, Jun 05, 2026 at 02:34:00PM -0400, Frank Li wrote:
> On Thu, Jun 04, 2026 at 04:08:06PM +0900, Koichiro Den wrote:
> > On Fri, Jan 09, 2026 at 03:13:24PM -0500, Frank Li wrote:
> > > Patch depend on
> > > https://lore.kernel.org/imx/20260109-edma_ll-v2-0-5c0b27b2c664@nxp.com/T/#t
> > >
> > > Only test eDMA, have not tested HDMA.
> >
> > Hi Frank,
> >
> > I expect this series may be revisited in the near future, since the first
> > dependency series reached v7 and looks close to landing.
> >
> > With the latest versions of the two dependencies:
> > - [PATCH v7 0/9] dmaengine: Add new API to combine configuration and descriptor preparation
> > https://lore.kernel.org/dmaengine/20260521-dma_prep_config-v7-0-1f73f4899883@nxp.com/
> > - [PATCH v2 00/11] dmaengine: dw-edma: flatten desc structions and simple code
> > https://lore.kernel.org/dmaengine/20260109-edma_ll-v2-0-5c0b27b2c664@nxp.com/
> >
> > I tested this RFT series with the HDMA engine on a SpacemiT K3.
> > The test results are below, using the same format as your results:
> >
> > Baseline, before applying the three series (v7 + v2 + this RFT)
> >
> > Rnd read , 4KB, QD=1 , 1 job : IOPS=8567, BW=33.5MiB/s (35.1MB/s)
> > Rnd read , 4KB, QD=32, 1 job : IOPS=55.5k, BW=217MiB/s (227MB/s)
> > Rnd read , 4KB, QD=32, 4 jobs: IOPS=83.0k, BW=324MiB/s (340MB/s)
> > Rnd read , 128KB, QD=1 , 1 job : IOPS=3817, BW=477MiB/s (500MB/s)
> > Rnd read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1346MiB/s (1411MB/s)
> > Rnd read , 128KB, QD=32, 4 jobs: IOPS=11.2k, BW=1403MiB/s (1471MB/s)
> > Rnd read , 512KB, QD=1 , 1 job : IOPS=1515, BW=758MiB/s (794MB/s)
> > Rnd read , 512KB, QD=32, 1 job : IOPS=2795, BW=1399MiB/s (1467MB/s)
> > Rnd read , 512KB, QD=32, 4 jobs: IOPS=2795, BW=1404MiB/s (1472MB/s)
> > Rnd write, 4KB, QD=1 , 1 job : IOPS=9035, BW=35.3MiB/s (37.0MB/s)
> > Rnd write, 4KB, QD=32, 1 job : IOPS=38.3k, BW=149MiB/s (157MB/s)
> > Rnd write, 4KB, QD=32, 4 jobs: IOPS=41.8k, BW=163MiB/s (171MB/s)
> > Rnd write, 128KB, QD=1 , 1 job : IOPS=3969, BW=496MiB/s (520MB/s)
> > Rnd write, 128KB, QD=32, 1 job : IOPS=8260, BW=1033MiB/s (1083MB/s)
> > Rnd write, 128KB, QD=32, 4 jobs: IOPS=8295, BW=1038MiB/s (1089MB/s)
> > Seq read , 128KB, QD=1 , 1 job : IOPS=4609, BW=576MiB/s (604MB/s)
> > Seq read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1345MiB/s (1410MB/s)
> > Seq read , 512KB, QD=1 , 1 job : IOPS=1524, BW=762MiB/s (799MB/s)
> > Seq read , 512KB, QD=32, 1 job : IOPS=2799, BW=1401MiB/s (1469MB/s)
> > Seq read , 1MB, QD=32, 1 job : IOPS=1401, BW=1404MiB/s (1472MB/s)
> > Seq write, 128KB, QD=1 , 1 job : IOPS=3722, BW=465MiB/s (488MB/s)
> > Seq write, 128KB, QD=32, 1 job : IOPS=8246, BW=1031MiB/s (1081MB/s)
> > Seq write, 512KB, QD=1 , 1 job : IOPS=1283, BW=642MiB/s (673MB/s)
> > Seq write, 512KB, QD=32, 1 job : IOPS=2072, BW=1038MiB/s (1088MB/s)
> > Seq write, 1MB, QD=32, 1 job : IOPS=1037, BW=1040MiB/s (1091MB/s)
> > Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1540, BW=768MiB/s (805MB/s)
> > IOPS=1549, BW=768MiB/s (805MB/s)
> >
> > After your three series (v7 + v2 + this)
> >
> > Rnd read , 4KB, QD=1 , 1 job : IOPS=7216, BW=28.2MiB/s (29.6MB/s)
> > Rnd read , 4KB, QD=32, 1 job : IOPS=61.1k, BW=239MiB/s (250MB/s)
> > Rnd read , 4KB, QD=32, 4 jobs: IOPS=75.3k, BW=294MiB/s (309MB/s)
> > Rnd read , 128KB, QD=1 , 1 job : IOPS=4711, BW=589MiB/s (618MB/s)
> > Rnd read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1354MiB/s (1420MB/s)
> > Rnd read , 128KB, QD=32, 4 jobs: IOPS=11.2k, BW=1403MiB/s (1471MB/s)
> > Rnd read , 512KB, QD=1 , 1 job : IOPS=1497, BW=749MiB/s (785MB/s)
> > Rnd read , 512KB, QD=32, 1 job : IOPS=2802, BW=1403MiB/s (1471MB/s)
> > Rnd read , 512KB, QD=32, 4 jobs: IOPS=2798, BW=1405MiB/s (1474MB/s)
> > Rnd write, 4KB, QD=1 , 1 job : IOPS=7411, BW=29.0MiB/s (30.4MB/s)
> > Rnd write, 4KB, QD=32, 1 job : IOPS=39.3k, BW=153MiB/s (161MB/s)
> > Rnd write, 4KB, QD=32, 4 jobs: IOPS=42.9k, BW=167MiB/s (176MB/s)
> > Rnd write, 128KB, QD=1 , 1 job : IOPS=3736, BW=467MiB/s (490MB/s)
> > Rnd write, 128KB, QD=32, 1 job : IOPS=8302, BW=1038MiB/s (1089MB/s)
> > Rnd write, 128KB, QD=32, 4 jobs: IOPS=8314, BW=1041MiB/s (1091MB/s)
> > Seq read , 128KB, QD=1 , 1 job : IOPS=4092, BW=512MiB/s (536MB/s)
> > Seq read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1354MiB/s (1420MB/s)
> > Seq read , 512KB, QD=1 , 1 job : IOPS=1474, BW=737MiB/s (773MB/s)
> > Seq read , 512KB, QD=32, 1 job : IOPS=2794, BW=1399MiB/s (1467MB/s)
> > Seq read , 1MB, QD=32, 1 job : IOPS=1401, BW=1404MiB/s (1472MB/s)
> > Seq write, 128KB, QD=1 , 1 job : IOPS=4135, BW=517MiB/s (542MB/s)
> > Seq write, 128KB, QD=32, 1 job : IOPS=8307, BW=1039MiB/s (1089MB/s)
> > Seq write, 512KB, QD=1 , 1 job : IOPS=1259, BW=630MiB/s (660MB/s)
> > Seq write, 512KB, QD=32, 1 job : IOPS=2073, BW=1038MiB/s (1089MB/s)
> > Seq write, 1MB, QD=32, 1 job : IOPS=1034, BW=1038MiB/s (1088MB/s)
> > Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1531, BW=763MiB/s (801MB/s)
> > IOPS=1540, BW=765MiB/s (802MB/s)
The results above were wrong. I cleaned up my test environment and retested your
three series, and it seems that the test cannot even run properly. Sorry for the confusion.
(Note that the other results, i.e. "Baseline" and "use of HDMA watermark
interrupts", were re-verified.)
So I looked into why this RFT series does not work well with HDMA. My current
understanding is that HDMA dynamic append needs watermark interrupts from the
beginning.
The PCI Express DMA Controller Databook (6.10a-lca06), Table 7-3 Channel Context
Register Considerations, says that while the channel is RUNNING, HDMA updates
HDMA_LLP_* only when a watermark interrupt event occurs. It also says that
software can use watermark interrupts to obtain the current transfer location
and recycle descriptors up to the LLP value.
So, without watermark interrupts, I do not think HDMA_LLP_* polling from
software gives us a reliable/valid running progress point for cookie completion.
The only conservative completion point left is the STOP interrupt (i.e. the
current base model).
However, with "dynamic append", software keeps recycling/refilling the ring, so
the channel may continue running and the STOP interrupt can be delayed
indefinitely. In that case, DMA cookies are not completed in time, which leads
to dma_sync_wait() timeouts on my HDMA setup.
Therefore, I no longer think the current STOP-interrupt-only model is suitable
for HDMA dynamic append. If there are no objections, I will submit a reworked version of
this RFT series that keeps many of your original changes, but enables and uses
HDMA watermark interrupts for the HDMA dynamic-append path.
Best regards,
Koichiro
> >
> > On this HDMA setup, I did not observe a clear performance difference from
> > applying the three series alone. Still, I like the overall direction.
> >
> >
> > P.S.
> > Separately, as a follow-up experiment, I also prototyped an extra series on top
> > of your three series that allows us to make use of HDMA watermark interrupts.
> > With that series, in particular for the high queue-depth cases, the results
> > improved noticeably on this platform. I haven't posted that series yet though.
>
> Thanks for test it. I am monitor above recondition patch set.
>
> Frank
> >
> > After your three series (v7 + v2 + this) + use of HDMA watermark interrupts
> >
> > Rnd read , 4KB, QD=1 , 1 job : IOPS=8016, BW=31.3MiB/s (32.8MB/s)
> > Rnd read , 4KB, QD=32, 1 job : IOPS=63.4k, BW=248MiB/s (260MB/s)
> > Rnd read , 4KB, QD=32, 4 jobs: IOPS=92.7k, BW=362MiB/s (380MB/s)
> > Rnd read , 128KB, QD=1 , 1 job : IOPS=3530, BW=441MiB/s (463MB/s)
> > Rnd read , 128KB, QD=32, 1 job : IOPS=12.0k, BW=1500MiB/s (1573MB/s)
> > Rnd read , 128KB, QD=32, 4 jobs: IOPS=12.4k, BW=1555MiB/s (1631MB/s)
> > Rnd read , 512KB, QD=1 , 1 job : IOPS=1541, BW=771MiB/s (808MB/s)
> > Rnd read , 512KB, QD=32, 1 job : IOPS=3116, BW=1560MiB/s (1636MB/s)
> > Rnd read , 512KB, QD=32, 4 jobs: IOPS=3099, BW=1556MiB/s (1632MB/s)
> > Rnd write, 4KB, QD=1 , 1 job : IOPS=8748, BW=34.2MiB/s (35.8MB/s)
> > Rnd write, 4KB, QD=32, 1 job : IOPS=57.6k, BW=225MiB/s (236MB/s)
> > Rnd write, 4KB, QD=32, 4 jobs: IOPS=80.3k, BW=314MiB/s (329MB/s)
> > Rnd write, 128KB, QD=1 , 1 job : IOPS=3878, BW=485MiB/s (508MB/s)
> > Rnd write, 128KB, QD=32, 1 job : IOPS=9798, BW=1225MiB/s (1285MB/s)
> > Rnd write, 128KB, QD=32, 4 jobs: IOPS=9970, BW=1248MiB/s (1308MB/s)
> > Seq read , 128KB, QD=1 , 1 job : IOPS=4516, BW=565MiB/s (592MB/s)
> > Seq read , 128KB, QD=32, 1 job : IOPS=12.0k, BW=1497MiB/s (1570MB/s)
> > Seq read , 512KB, QD=1 , 1 job : IOPS=1571, BW=786MiB/s (824MB/s)
> > Seq read , 512KB, QD=32, 1 job : IOPS=3073, BW=1538MiB/s (1613MB/s)
> > Seq read , 1MB, QD=32, 1 job : IOPS=1573, BW=1576MiB/s (1653MB/s)
> > Seq write, 128KB, QD=1 , 1 job : IOPS=3977, BW=497MiB/s (521MB/s)
> > Seq write, 128KB, QD=32, 1 job : IOPS=9806, BW=1226MiB/s (1286MB/s)
> > Seq write, 512KB, QD=1 , 1 job : IOPS=1404, BW=702MiB/s (736MB/s)
> > Seq write, 512KB, QD=32, 1 job : IOPS=2496, BW=1250MiB/s (1310MB/s)
> > Seq write, 1MB, QD=32, 1 job : IOPS=1252, BW=1256MiB/s (1317MB/s)
> > Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1682, BW=836MiB/s (877MB/s)
> > IOPS=1688, BW=838MiB/s (879MB/s)
> >
> > Best regards,
> > Koichiro
> >
> > > Corn case have not tested, such as pause/resume transfer.
> > >
> > > Before
> > >
> > > Rnd read, 4KB, QD=1, 1 job : IOPS=6780, BW=26.5MiB/s (27.8MB/s)
> > > Rnd read, 4KB, QD=32, 1 job : IOPS=28.6k, BW=112MiB/s (117MB/s)
> > > Rnd read, 4KB, QD=32, 4 jobs: IOPS=33.4k, BW=130MiB/s (137MB/s)
> > > Rnd read, 128KB, QD=1, 1 job : IOPS=1188, BW=149MiB/s (156MB/s)
> > > Rnd read, 128KB, QD=32, 1 job : IOPS=1440, BW=180MiB/s (189MB/s)
> > > Rnd read, 128KB, QD=32, 4 jobs: IOPS=1282, BW=160MiB/s (168MB/s)
> > > Rnd read, 512KB, QD=1, 1 job : IOPS=254, BW=127MiB/s (134MB/s)
> > > Rnd read, 512KB, QD=32, 1 job : IOPS=354, BW=177MiB/s (186MB/s)
> > > Rnd read, 512KB, QD=32, 4 jobs: IOPS=388, BW=194MiB/s (204MB/s)
> > > Rnd write, 4KB, QD=1, 1 job : IOPS=6282, BW=24.5MiB/s (25.7MB/s)
> > > Rnd write, 4KB, QD=32, 1 job : IOPS=24.9k, BW=97.5MiB/s (102MB/s)
> > > Rnd write, 4KB, QD=32, 4 jobs: IOPS=27.4k, BW=107MiB/s (112MB/s)
> > > Rnd write, 128KB, QD=1, 1 job : IOPS=1098, BW=137MiB/s (144MB/s)
> > > Rnd write, 128KB, QD=32, 1 job : IOPS=1195, BW=149MiB/s (157MB/s)
> > > Rnd write, 128KB, QD=32, 4 jobs: IOPS=1120, BW=140MiB/s (147MB/s)
> > > Seq read, 128KB, QD=1, 1 job : IOPS=936, BW=117MiB/s (123MB/s)
> > > Seq read, 128KB, QD=32, 1 job : IOPS=1218, BW=152MiB/s (160MB/s)
> > > Seq read, 512KB, QD=1, 1 job : IOPS=301, BW=151MiB/s (158MB/s)
> > > Seq read, 512KB, QD=32, 1 job : IOPS=360, BW=180MiB/s (189MB/s)
> > > Seq read, 1MB, QD=32, 1 job : IOPS=193, BW=194MiB/s (203MB/s)
> > > Seq write, 128KB, QD=1, 1 job : IOPS=796, BW=99.5MiB/s (104MB/s)
> > > Seq write, 128KB, QD=32, 1 job : IOPS=1019, BW=127MiB/s (134MB/s)
> > > Seq write, 512KB, QD=1, 1 job : IOPS=213, BW=107MiB/s (112MB/s)
> > > Seq write, 512KB, QD=32, 1 job : IOPS=273, BW=137MiB/s (143MB/s)
> > > Seq write, 1MB, QD=32, 1 job : IOPS=168, BW=168MiB/s (177MB/s)
> > > Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=255, BW=128MiB/s (134MB/s)
> > > IOPS=266, BW=135MiB/s (141MB/s)
> > >
> > > After
> > >
> > > Rnd read, 4KB, QD=1, 1 job : IOPS=6148, BW=24.0MiB/s (25.2MB/s)
> > > Rnd read, 4KB, QD=32, 1 job : IOPS=29.4k, BW=115MiB/s (121MB/s)
> > > Rnd read, 4KB, QD=32, 4 jobs: IOPS=38.8k, BW=151MiB/s (159MB/s)
> > > Rnd read, 128KB, QD=1, 1 job : IOPS=859, BW=107MiB/s (113MB/s)
> > > Rnd read, 128KB, QD=32, 1 job : IOPS=1504, BW=188MiB/s (197MB/s)
> > > Rnd read, 128KB, QD=32, 4 jobs: IOPS=1531, BW=191MiB/s (201MB/s)
> > > Rnd read, 512KB, QD=1, 1 job : IOPS=238, BW=119MiB/s (125MB/s)
> > > Rnd read, 512KB, QD=32, 1 job : IOPS=390, BW=195MiB/s (205MB/s)
> > > Rnd read, 512KB, QD=32, 4 jobs: IOPS=404, BW=202MiB/s (212MB/s)
> > > Rnd write, 4KB, QD=1, 1 job : IOPS=5801, BW=22.7MiB/s (23.8MB/s)
> > > Rnd write, 4KB, QD=32, 1 job : IOPS=24.7k, BW=96.6MiB/s (101MB/s)
> > > Rnd write, 4KB, QD=32, 4 jobs: IOPS=32.7k, BW=128MiB/s (134MB/s)
> > > Rnd write, 128KB, QD=1, 1 job : IOPS=744, BW=93.1MiB/s (97.6MB/s)
> > > Rnd write, 128KB, QD=32, 1 job : IOPS=1278, BW=160MiB/s (168MB/s)
> > > Rnd write, 128KB, QD=32, 4 jobs: IOPS=1278, BW=160MiB/s (168MB/s)
> > > Seq read, 128KB, QD=1, 1 job : IOPS=853, BW=107MiB/s (112MB/s)
> > > Seq read, 128KB, QD=32, 1 job : IOPS=1511, BW=189MiB/s (198MB/s)
> > > Seq read, 512KB, QD=1, 1 job : IOPS=240, BW=120MiB/s (126MB/s)
> > > Seq read, 512KB, QD=32, 1 job : IOPS=386, BW=193MiB/s (203MB/s)
> > > Seq read, 1MB, QD=32, 1 job : IOPS=200, BW=201MiB/s (211MB/s)
> > > Seq write, 128KB, QD=1, 1 job : IOPS=749, BW=93.7MiB/s (98.3MB/s)
> > > Seq write, 128KB, QD=32, 1 job : IOPS=1266, BW=158MiB/s (166MB/s)
> > > Seq write, 512KB, QD=1, 1 job : IOPS=198, BW=99.0MiB/s (104MB/s)
> > > Seq write, 512KB, QD=32, 1 job : IOPS=352, BW=176MiB/s (185MB/s)
> > > Seq write, 1MB, QD=32, 1 job : IOPS=184, BW=184MiB/s (193MB/s)
> > > Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=287, BW=145MiB/s (152MB/s)
> > > IOPS=299, BW=149MiB/s (156MB/s)
> > >
> > > Signed-off-by: Frank Li <Frank.Li@nxp.com>
> > > ---
> > > Frank Li (5):
> > > dmaengine: dw-edma: Add dw_edma_core_ll_cur_idx() to get completed link entry pos
> > > dmaengine: dw-edma: Move dw_hdma_set_callback_result() up
> > > dmaengine: dw-edma: Make DMA link list work as a circular buffer
> > > dmaengine: dw-edma: Dynamitc append new request during dmaengine running
> > > dmaengine: dw-edma: Add trace support
> > >
> > > drivers/dma/dw-edma/Makefile | 3 +
> > > drivers/dma/dw-edma/dw-edma-core.c | 215 ++++++++++++++++++++++++----------
> > > drivers/dma/dw-edma/dw-edma-core.h | 42 ++++++-
> > > drivers/dma/dw-edma/dw-edma-trace.c | 4 +
> > > drivers/dma/dw-edma/dw-edma-trace.h | 150 ++++++++++++++++++++++++
> > > drivers/dma/dw-edma/dw-edma-v0-core.c | 39 +++++-
> > > drivers/dma/dw-edma/dw-hdma-v0-core.c | 17 +++
> > > 7 files changed, 409 insertions(+), 61 deletions(-)
> > > ---
> > > base-commit: 020f6d8442f35105660a29d0d236d3f8650c8142
> > > change-id: 20251212-edma_dymatic-a57843ff0dfe
> > >
> > > Best regards,
> > > --
> > > Frank Li <Frank.Li@nxp.com>
> > >
^ permalink raw reply [flat|nested] 14+ messages in thread
* Re: [PATCH RFT 0/5] dmaengine: dw-edma: support dynamtic add link entry during dma engine running
2026-06-09 6:23 ` Koichiro Den
@ 2026-06-09 15:12 ` Frank Li
0 siblings, 0 replies; 14+ messages in thread
From: Frank Li @ 2026-06-09 15:12 UTC (permalink / raw)
To: Koichiro Den
Cc: Frank Li, Manivannan Sadhasivam, Vinod Koul, Gustavo Pimentel,
Kees Cook, Gustavo A. R. Silva, Krzysztof Wilczyński,
Kishon Vijay Abraham I, Bjorn Helgaas, Christoph Hellwig,
Niklas Cassel, dmaengine, linux-kernel, linux-hardening,
linux-pci, linux-nvme, Damien Le Moal, imx
On Tue, Jun 09, 2026 at 03:23:03PM +0900, Koichiro Den wrote:
> On Fri, Jun 05, 2026 at 02:34:00PM -0400, Frank Li wrote:
> > On Thu, Jun 04, 2026 at 04:08:06PM +0900, Koichiro Den wrote:
> > > On Fri, Jan 09, 2026 at 03:13:24PM -0500, Frank Li wrote:
> > > > This patch series depends on
> > > > https://lore.kernel.org/imx/20260109-edma_ll-v2-0-5c0b27b2c664@nxp.com/T/#t
> > > >
> > > > Only tested on eDMA; HDMA has not been tested.
> > >
> > > Hi Frank,
> > >
> > > I expect this series may be revisited in the near future, since the first
> > > dependency series reached v7 and looks close to landing.
> > >
> > > With the latest versions of the two dependencies:
> > > - [PATCH v7 0/9] dmaengine: Add new API to combine configuration and descriptor preparation
> > > https://lore.kernel.org/dmaengine/20260521-dma_prep_config-v7-0-1f73f4899883@nxp.com/
> > > - [PATCH v2 00/11] dmaengine: dw-edma: flatten desc structions and simple code
> > > https://lore.kernel.org/dmaengine/20260109-edma_ll-v2-0-5c0b27b2c664@nxp.com/
> > >
> > > I tested this RFT series with the HDMA engine on a SpacemiT K3.
> > > The test results are below, using the same format as your results:
> > >
> > > Baseline, before applying the three series (v7 + v2 + this RFT)
> > >
> > > Rnd read , 4KB, QD=1 , 1 job : IOPS=8567, BW=33.5MiB/s (35.1MB/s)
> > > Rnd read , 4KB, QD=32, 1 job : IOPS=55.5k, BW=217MiB/s (227MB/s)
> > > Rnd read , 4KB, QD=32, 4 jobs: IOPS=83.0k, BW=324MiB/s (340MB/s)
> > > Rnd read , 128KB, QD=1 , 1 job : IOPS=3817, BW=477MiB/s (500MB/s)
> > > Rnd read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1346MiB/s (1411MB/s)
> > > Rnd read , 128KB, QD=32, 4 jobs: IOPS=11.2k, BW=1403MiB/s (1471MB/s)
> > > Rnd read , 512KB, QD=1 , 1 job : IOPS=1515, BW=758MiB/s (794MB/s)
> > > Rnd read , 512KB, QD=32, 1 job : IOPS=2795, BW=1399MiB/s (1467MB/s)
> > > Rnd read , 512KB, QD=32, 4 jobs: IOPS=2795, BW=1404MiB/s (1472MB/s)
> > > Rnd write, 4KB, QD=1 , 1 job : IOPS=9035, BW=35.3MiB/s (37.0MB/s)
> > > Rnd write, 4KB, QD=32, 1 job : IOPS=38.3k, BW=149MiB/s (157MB/s)
> > > Rnd write, 4KB, QD=32, 4 jobs: IOPS=41.8k, BW=163MiB/s (171MB/s)
> > > Rnd write, 128KB, QD=1 , 1 job : IOPS=3969, BW=496MiB/s (520MB/s)
> > > Rnd write, 128KB, QD=32, 1 job : IOPS=8260, BW=1033MiB/s (1083MB/s)
> > > Rnd write, 128KB, QD=32, 4 jobs: IOPS=8295, BW=1038MiB/s (1089MB/s)
> > > Seq read , 128KB, QD=1 , 1 job : IOPS=4609, BW=576MiB/s (604MB/s)
> > > Seq read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1345MiB/s (1410MB/s)
> > > Seq read , 512KB, QD=1 , 1 job : IOPS=1524, BW=762MiB/s (799MB/s)
> > > Seq read , 512KB, QD=32, 1 job : IOPS=2799, BW=1401MiB/s (1469MB/s)
> > > Seq read , 1MB, QD=32, 1 job : IOPS=1401, BW=1404MiB/s (1472MB/s)
> > > Seq write, 128KB, QD=1 , 1 job : IOPS=3722, BW=465MiB/s (488MB/s)
> > > Seq write, 128KB, QD=32, 1 job : IOPS=8246, BW=1031MiB/s (1081MB/s)
> > > Seq write, 512KB, QD=1 , 1 job : IOPS=1283, BW=642MiB/s (673MB/s)
> > > Seq write, 512KB, QD=32, 1 job : IOPS=2072, BW=1038MiB/s (1088MB/s)
> > > Seq write, 1MB, QD=32, 1 job : IOPS=1037, BW=1040MiB/s (1091MB/s)
> > > Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1540, BW=768MiB/s (805MB/s)
> > > IOPS=1549, BW=768MiB/s (805MB/s)
> > >
> > > After your three series (v7 + v2 + this)
> > >
> > > Rnd read , 4KB, QD=1 , 1 job : IOPS=7216, BW=28.2MiB/s (29.6MB/s)
> > > Rnd read , 4KB, QD=32, 1 job : IOPS=61.1k, BW=239MiB/s (250MB/s)
> > > Rnd read , 4KB, QD=32, 4 jobs: IOPS=75.3k, BW=294MiB/s (309MB/s)
> > > Rnd read , 128KB, QD=1 , 1 job : IOPS=4711, BW=589MiB/s (618MB/s)
> > > Rnd read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1354MiB/s (1420MB/s)
> > > Rnd read , 128KB, QD=32, 4 jobs: IOPS=11.2k, BW=1403MiB/s (1471MB/s)
> > > Rnd read , 512KB, QD=1 , 1 job : IOPS=1497, BW=749MiB/s (785MB/s)
> > > Rnd read , 512KB, QD=32, 1 job : IOPS=2802, BW=1403MiB/s (1471MB/s)
> > > Rnd read , 512KB, QD=32, 4 jobs: IOPS=2798, BW=1405MiB/s (1474MB/s)
> > > Rnd write, 4KB, QD=1 , 1 job : IOPS=7411, BW=29.0MiB/s (30.4MB/s)
> > > Rnd write, 4KB, QD=32, 1 job : IOPS=39.3k, BW=153MiB/s (161MB/s)
> > > Rnd write, 4KB, QD=32, 4 jobs: IOPS=42.9k, BW=167MiB/s (176MB/s)
> > > Rnd write, 128KB, QD=1 , 1 job : IOPS=3736, BW=467MiB/s (490MB/s)
> > > Rnd write, 128KB, QD=32, 1 job : IOPS=8302, BW=1038MiB/s (1089MB/s)
> > > Rnd write, 128KB, QD=32, 4 jobs: IOPS=8314, BW=1041MiB/s (1091MB/s)
> > > Seq read , 128KB, QD=1 , 1 job : IOPS=4092, BW=512MiB/s (536MB/s)
> > > Seq read , 128KB, QD=32, 1 job : IOPS=10.8k, BW=1354MiB/s (1420MB/s)
> > > Seq read , 512KB, QD=1 , 1 job : IOPS=1474, BW=737MiB/s (773MB/s)
> > > Seq read , 512KB, QD=32, 1 job : IOPS=2794, BW=1399MiB/s (1467MB/s)
> > > Seq read , 1MB, QD=32, 1 job : IOPS=1401, BW=1404MiB/s (1472MB/s)
> > > Seq write, 128KB, QD=1 , 1 job : IOPS=4135, BW=517MiB/s (542MB/s)
> > > Seq write, 128KB, QD=32, 1 job : IOPS=8307, BW=1039MiB/s (1089MB/s)
> > > Seq write, 512KB, QD=1 , 1 job : IOPS=1259, BW=630MiB/s (660MB/s)
> > > Seq write, 512KB, QD=32, 1 job : IOPS=2073, BW=1038MiB/s (1089MB/s)
> > > Seq write, 1MB, QD=32, 1 job : IOPS=1034, BW=1038MiB/s (1088MB/s)
> > > Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1531, BW=763MiB/s (801MB/s)
> > > IOPS=1540, BW=765MiB/s (802MB/s)
>
> This was false. I cleaned up my test environment and retested your three series
> again. It seems that the test cannot even run properly. Sorry for the confusion.
> (Note that the other results, i.e. "Baseline" and "use of HDMA watermark
> interrupts", were re-verified.)
>
> So I looked into why this RFT series does not work well with HDMA. My current
> understanding is that HDMA dynamic append needs watermark interrupts from the
> beginning.
>
> The PCI Express DMA Controller Databook (6.10a-lca06), Table 7-3 Channel Context
> Register Considerations, says that while the channel is RUNNING, HDMA updates
> HDMA_LLP_* only when a watermark interrupt event occurs. It also says that
> software can use watermark interrupts to obtain the current transfer location
> and recycle descriptors up to the LLP value.
>
> So, without watermark interrupts, I do not think HDMA_LLP_* polling from
> software gives us a reliable/valid running progress point for cookie completion.
> The only conservative completion point left is the STOP interrupt (i.e. the
> current base model).
>
> However, with "dynamic append", software keeps recycling/refilling the ring, so
> the channel may continue running and the STOP interrupt can be delayed
> indefinitely. In that case, DMA cookies are not completed in time, which leads
> to dma_sync_wait() timeouts on my HDMA setup.
>
> Therefore, now I do not think the current STOP-interrupt-only model is suitable
> for HDMA dynamic append. If no objections, I will submit a reworked version of
> this RFT series that keeps many of your original changes, but enables and uses
> HDMA watermark interrupts for the HDMA dynamic-append path.
Okay, thanks! I do not have hardware to test HDMA. eDMA actually has some
risk conditions with dynamic append. I reported the problem to Synopsys, but
have not received feedback.
Frank
>
> Best regards,
> Koichiro
>
> > >
> > > On this HDMA setup, I did not observe a clear performance difference from
> > > applying the three series alone. Still, I like the overall direction.
> > >
> > >
> > > P.S.
> > > Separately, as a follow-up experiment, I also prototyped an extra series on top
> > > of your three series that allows us to make use of HDMA watermark interrupts.
> > > With that series, in particular for the high queue-depth cases, the results
> > > improved noticeably on this platform. I haven't posted that series yet though.
> >
> > Thanks for testing it. I am monitoring the precondition patch sets above.
> >
> > Frank
> > >
> > > After your three series (v7 + v2 + this) + use of HDMA watermark interrupts
> > >
> > > Rnd read , 4KB, QD=1 , 1 job : IOPS=8016, BW=31.3MiB/s (32.8MB/s)
> > > Rnd read , 4KB, QD=32, 1 job : IOPS=63.4k, BW=248MiB/s (260MB/s)
> > > Rnd read , 4KB, QD=32, 4 jobs: IOPS=92.7k, BW=362MiB/s (380MB/s)
> > > Rnd read , 128KB, QD=1 , 1 job : IOPS=3530, BW=441MiB/s (463MB/s)
> > > Rnd read , 128KB, QD=32, 1 job : IOPS=12.0k, BW=1500MiB/s (1573MB/s)
> > > Rnd read , 128KB, QD=32, 4 jobs: IOPS=12.4k, BW=1555MiB/s (1631MB/s)
> > > Rnd read , 512KB, QD=1 , 1 job : IOPS=1541, BW=771MiB/s (808MB/s)
> > > Rnd read , 512KB, QD=32, 1 job : IOPS=3116, BW=1560MiB/s (1636MB/s)
> > > Rnd read , 512KB, QD=32, 4 jobs: IOPS=3099, BW=1556MiB/s (1632MB/s)
> > > Rnd write, 4KB, QD=1 , 1 job : IOPS=8748, BW=34.2MiB/s (35.8MB/s)
> > > Rnd write, 4KB, QD=32, 1 job : IOPS=57.6k, BW=225MiB/s (236MB/s)
> > > Rnd write, 4KB, QD=32, 4 jobs: IOPS=80.3k, BW=314MiB/s (329MB/s)
> > > Rnd write, 128KB, QD=1 , 1 job : IOPS=3878, BW=485MiB/s (508MB/s)
> > > Rnd write, 128KB, QD=32, 1 job : IOPS=9798, BW=1225MiB/s (1285MB/s)
> > > Rnd write, 128KB, QD=32, 4 jobs: IOPS=9970, BW=1248MiB/s (1308MB/s)
> > > Seq read , 128KB, QD=1 , 1 job : IOPS=4516, BW=565MiB/s (592MB/s)
> > > Seq read , 128KB, QD=32, 1 job : IOPS=12.0k, BW=1497MiB/s (1570MB/s)
> > > Seq read , 512KB, QD=1 , 1 job : IOPS=1571, BW=786MiB/s (824MB/s)
> > > Seq read , 512KB, QD=32, 1 job : IOPS=3073, BW=1538MiB/s (1613MB/s)
> > > Seq read , 1MB, QD=32, 1 job : IOPS=1573, BW=1576MiB/s (1653MB/s)
> > > Seq write, 128KB, QD=1 , 1 job : IOPS=3977, BW=497MiB/s (521MB/s)
> > > Seq write, 128KB, QD=32, 1 job : IOPS=9806, BW=1226MiB/s (1286MB/s)
> > > Seq write, 512KB, QD=1 , 1 job : IOPS=1404, BW=702MiB/s (736MB/s)
> > > Seq write, 512KB, QD=32, 1 job : IOPS=2496, BW=1250MiB/s (1310MB/s)
> > > Seq write, 1MB, QD=32, 1 job : IOPS=1252, BW=1256MiB/s (1317MB/s)
> > > Rnd rdwr , 4K..1MB, QD=8 , 4 jobs: IOPS=1682, BW=836MiB/s (877MB/s)
> > > IOPS=1688, BW=838MiB/s (879MB/s)
> > >
> > > Best regards,
> > > Koichiro
> > >
> > > > Corner cases, such as pause/resume transfer, have not been tested.
> > > >
> > > > Before
> > > >
> > > > Rnd read, 4KB, QD=1, 1 job : IOPS=6780, BW=26.5MiB/s (27.8MB/s)
> > > > Rnd read, 4KB, QD=32, 1 job : IOPS=28.6k, BW=112MiB/s (117MB/s)
> > > > Rnd read, 4KB, QD=32, 4 jobs: IOPS=33.4k, BW=130MiB/s (137MB/s)
> > > > Rnd read, 128KB, QD=1, 1 job : IOPS=1188, BW=149MiB/s (156MB/s)
> > > > Rnd read, 128KB, QD=32, 1 job : IOPS=1440, BW=180MiB/s (189MB/s)
> > > > Rnd read, 128KB, QD=32, 4 jobs: IOPS=1282, BW=160MiB/s (168MB/s)
> > > > Rnd read, 512KB, QD=1, 1 job : IOPS=254, BW=127MiB/s (134MB/s)
> > > > Rnd read, 512KB, QD=32, 1 job : IOPS=354, BW=177MiB/s (186MB/s)
> > > > Rnd read, 512KB, QD=32, 4 jobs: IOPS=388, BW=194MiB/s (204MB/s)
> > > > Rnd write, 4KB, QD=1, 1 job : IOPS=6282, BW=24.5MiB/s (25.7MB/s)
> > > > Rnd write, 4KB, QD=32, 1 job : IOPS=24.9k, BW=97.5MiB/s (102MB/s)
> > > > Rnd write, 4KB, QD=32, 4 jobs: IOPS=27.4k, BW=107MiB/s (112MB/s)
> > > > Rnd write, 128KB, QD=1, 1 job : IOPS=1098, BW=137MiB/s (144MB/s)
> > > > Rnd write, 128KB, QD=32, 1 job : IOPS=1195, BW=149MiB/s (157MB/s)
> > > > Rnd write, 128KB, QD=32, 4 jobs: IOPS=1120, BW=140MiB/s (147MB/s)
> > > > Seq read, 128KB, QD=1, 1 job : IOPS=936, BW=117MiB/s (123MB/s)
> > > > Seq read, 128KB, QD=32, 1 job : IOPS=1218, BW=152MiB/s (160MB/s)
> > > > Seq read, 512KB, QD=1, 1 job : IOPS=301, BW=151MiB/s (158MB/s)
> > > > Seq read, 512KB, QD=32, 1 job : IOPS=360, BW=180MiB/s (189MB/s)
> > > > Seq read, 1MB, QD=32, 1 job : IOPS=193, BW=194MiB/s (203MB/s)
> > > > Seq write, 128KB, QD=1, 1 job : IOPS=796, BW=99.5MiB/s (104MB/s)
> > > > Seq write, 128KB, QD=32, 1 job : IOPS=1019, BW=127MiB/s (134MB/s)
> > > > Seq write, 512KB, QD=1, 1 job : IOPS=213, BW=107MiB/s (112MB/s)
> > > > Seq write, 512KB, QD=32, 1 job : IOPS=273, BW=137MiB/s (143MB/s)
> > > > Seq write, 1MB, QD=32, 1 job : IOPS=168, BW=168MiB/s (177MB/s)
> > > > Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=255, BW=128MiB/s (134MB/s)
> > > > IOPS=266, BW=135MiB/s (141MB/s)
> > > >
> > > > After
> > > >
> > > > Rnd read, 4KB, QD=1, 1 job : IOPS=6148, BW=24.0MiB/s (25.2MB/s)
> > > > Rnd read, 4KB, QD=32, 1 job : IOPS=29.4k, BW=115MiB/s (121MB/s)
> > > > Rnd read, 4KB, QD=32, 4 jobs: IOPS=38.8k, BW=151MiB/s (159MB/s)
> > > > Rnd read, 128KB, QD=1, 1 job : IOPS=859, BW=107MiB/s (113MB/s)
> > > > Rnd read, 128KB, QD=32, 1 job : IOPS=1504, BW=188MiB/s (197MB/s)
> > > > Rnd read, 128KB, QD=32, 4 jobs: IOPS=1531, BW=191MiB/s (201MB/s)
> > > > Rnd read, 512KB, QD=1, 1 job : IOPS=238, BW=119MiB/s (125MB/s)
> > > > Rnd read, 512KB, QD=32, 1 job : IOPS=390, BW=195MiB/s (205MB/s)
> > > > Rnd read, 512KB, QD=32, 4 jobs: IOPS=404, BW=202MiB/s (212MB/s)
> > > > Rnd write, 4KB, QD=1, 1 job : IOPS=5801, BW=22.7MiB/s (23.8MB/s)
> > > > Rnd write, 4KB, QD=32, 1 job : IOPS=24.7k, BW=96.6MiB/s (101MB/s)
> > > > Rnd write, 4KB, QD=32, 4 jobs: IOPS=32.7k, BW=128MiB/s (134MB/s)
> > > > Rnd write, 128KB, QD=1, 1 job : IOPS=744, BW=93.1MiB/s (97.6MB/s)
> > > > Rnd write, 128KB, QD=32, 1 job : IOPS=1278, BW=160MiB/s (168MB/s)
> > > > Rnd write, 128KB, QD=32, 4 jobs: IOPS=1278, BW=160MiB/s (168MB/s)
> > > > Seq read, 128KB, QD=1, 1 job : IOPS=853, BW=107MiB/s (112MB/s)
> > > > Seq read, 128KB, QD=32, 1 job : IOPS=1511, BW=189MiB/s (198MB/s)
> > > > Seq read, 512KB, QD=1, 1 job : IOPS=240, BW=120MiB/s (126MB/s)
> > > > Seq read, 512KB, QD=32, 1 job : IOPS=386, BW=193MiB/s (203MB/s)
> > > > Seq read, 1MB, QD=32, 1 job : IOPS=200, BW=201MiB/s (211MB/s)
> > > > Seq write, 128KB, QD=1, 1 job : IOPS=749, BW=93.7MiB/s (98.3MB/s)
> > > > Seq write, 128KB, QD=32, 1 job : IOPS=1266, BW=158MiB/s (166MB/s)
> > > > Seq write, 512KB, QD=1, 1 job : IOPS=198, BW=99.0MiB/s (104MB/s)
> > > > Seq write, 512KB, QD=32, 1 job : IOPS=352, BW=176MiB/s (185MB/s)
> > > > Seq write, 1MB, QD=32, 1 job : IOPS=184, BW=184MiB/s (193MB/s)
> > > > Rnd rdwr, 4K..1MB, QD=8, 4 jobs: IOPS=287, BW=145MiB/s (152MB/s)
> > > > IOPS=299, BW=149MiB/s (156MB/s)
> > > >
> > > > Signed-off-by: Frank Li <Frank.Li@nxp.com>
> > > > ---
> > > > Frank Li (5):
> > > > dmaengine: dw-edma: Add dw_edma_core_ll_cur_idx() to get completed link entry pos
> > > > dmaengine: dw-edma: Move dw_hdma_set_callback_result() up
> > > > dmaengine: dw-edma: Make DMA link list work as a circular buffer
> > > > dmaengine: dw-edma: Dynamitc append new request during dmaengine running
> > > > dmaengine: dw-edma: Add trace support
> > > >
> > > > drivers/dma/dw-edma/Makefile | 3 +
> > > > drivers/dma/dw-edma/dw-edma-core.c | 215 ++++++++++++++++++++++++----------
> > > > drivers/dma/dw-edma/dw-edma-core.h | 42 ++++++-
> > > > drivers/dma/dw-edma/dw-edma-trace.c | 4 +
> > > > drivers/dma/dw-edma/dw-edma-trace.h | 150 ++++++++++++++++++++++++
> > > > drivers/dma/dw-edma/dw-edma-v0-core.c | 39 +++++-
> > > > drivers/dma/dw-edma/dw-hdma-v0-core.c | 17 +++
> > > > 7 files changed, 409 insertions(+), 61 deletions(-)
> > > > ---
> > > > base-commit: 020f6d8442f35105660a29d0d236d3f8650c8142
> > > > change-id: 20251212-edma_dymatic-a57843ff0dfe
> > > >
> > > > Best regards,
> > > > --
> > > > Frank Li <Frank.Li@nxp.com>
> > > >
>
^ permalink raw reply [flat|nested] 14+ messages in thread