* [PATCH 0/5] arm-smmu: performance optimization
@ 2017-06-26 13:38 Zhen Lei
  2017-06-26 13:38 ` [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction Zhen Lei
                   ` (5 more replies)
  0 siblings, 6 replies; 19+ messages in thread
From: Zhen Lei @ 2017-06-26 13:38 UTC (permalink / raw)
  To: linux-arm-kernel
I describe the optimizations in more detail in patches 1 and 2; patches 3-5
are the arm-smmu/arm-smmu-v3 implementation of patch 2.
Patch 1 is v2. In v1, I directly replaced writel with writel_relaxed in
queue_inc_prod, but Robin pointed out that this may lead the SMMU to consume
stale memory contents. After thinking about it for more than 3 whole days, I
came up with this approach.
This patchset is based on Robin Murphy's [PATCH v2 0/8] io-pgtable lock removal.
Zhen Lei (5):
  iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock
    confliction
  iommu: add a new member unmap_tlb_sync into struct iommu_ops
  iommu/arm-smmu-v3: add support for unmap an iova range with only one
    tlb sync
  iommu/arm-smmu: add support for unmap a memory range with only one tlb
    sync
  iommu/io-pgtable: delete member tlb_sync_pending of struct io_pgtable
 drivers/iommu/arm-smmu-v3.c        | 52 ++++++++++++++++++++++++++++++++++----
 drivers/iommu/arm-smmu.c           | 10 ++++++++
 drivers/iommu/io-pgtable-arm-v7s.c | 32 +++++++++++++++--------
 drivers/iommu/io-pgtable-arm.c     | 30 ++++++++++++++--------
 drivers/iommu/io-pgtable.h         |  9 ++-----
 drivers/iommu/iommu.c              |  3 +++
 include/linux/iommu.h              |  1 +
 7 files changed, 104 insertions(+), 33 deletions(-)
-- 
2.5.0
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-06-26 13:38 [PATCH 0/5] arm-smmu: performance optimization Zhen Lei
@ 2017-06-26 13:38 ` Zhen Lei
  2017-06-28  9:32   ` Will Deacon
  2017-08-22 15:41   ` Joerg Roedel
  2017-06-26 13:38 ` [PATCH 2/5] iommu: add a new member unmap_tlb_sync into struct iommu_ops Zhen Lei
                   ` (4 subsequent siblings)
  5 siblings, 2 replies; 19+ messages in thread
From: Zhen Lei @ 2017-06-26 13:38 UTC (permalink / raw)
  To: linux-arm-kernel
All TLBI commands must be followed by a SYNC command to make sure that they
have completely finished. So we can just add the TLBI commands to the queue
and put off their execution until a SYNC or another command is issued. To
prevent the following SYNC command from waiting a long time because too many
commands have been delayed, restrict the maximum number of delayed commands.
According to my tests, this gives the same performance as replacing writel
with writel_relaxed in queue_inc_prod.
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 drivers/iommu/arm-smmu-v3.c | 42 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 5 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 291da5f..4481123 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -337,6 +337,7 @@
 /* Command queue */
 #define CMDQ_ENT_DWORDS			2
 #define CMDQ_MAX_SZ_SHIFT		8
+#define CMDQ_MAX_DELAYED		32
 
 #define CMDQ_ERR_SHIFT			24
 #define CMDQ_ERR_MASK			0x7f
@@ -472,6 +473,7 @@ struct arm_smmu_cmdq_ent {
 			};
 		} cfgi;
 
+		#define CMDQ_OP_TLBI_NH_ALL	0x10
 		#define CMDQ_OP_TLBI_NH_ASID	0x11
 		#define CMDQ_OP_TLBI_NH_VA	0x12
 		#define CMDQ_OP_TLBI_EL2_ALL	0x20
@@ -499,6 +501,7 @@ struct arm_smmu_cmdq_ent {
 
 struct arm_smmu_queue {
 	int				irq; /* Wired interrupt */
+	u32				nr_delay;
 
 	__le64				*base;
 	dma_addr_t			base_dma;
@@ -722,11 +725,16 @@ static int queue_sync_prod(struct arm_smmu_queue *q)
 	return ret;
 }
 
-static void queue_inc_prod(struct arm_smmu_queue *q)
+static void queue_inc_swprod(struct arm_smmu_queue *q)
 {
-	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
+	u32 prod = q->prod + 1;
 
 	q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
+}
+
+static void queue_inc_prod(struct arm_smmu_queue *q)
+{
+	queue_inc_swprod(q);
 	writel(q->prod, q->prod_reg);
 }
 
@@ -761,13 +769,24 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
 		*dst++ = cpu_to_le64(*src++);
 }
 
-static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
+static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent, int optimize)
 {
 	if (queue_full(q))
 		return -ENOSPC;
 
 	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
-	queue_inc_prod(q);
+
+	/*
+	 * We don't want too many commands to be delayed, this may lead the
+	 * followed sync command to wait for a long time.
+	 */
+	if (optimize && (++q->nr_delay < CMDQ_MAX_DELAYED)) {
+		queue_inc_swprod(q);
+	} else {
+		queue_inc_prod(q);
+		q->nr_delay = 0;
+	}
+
 	return 0;
 }
 
@@ -909,6 +928,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
 static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 				    struct arm_smmu_cmdq_ent *ent)
 {
+	int optimize = 0;
 	u64 cmd[CMDQ_ENT_DWORDS];
 	unsigned long flags;
 	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
@@ -920,8 +940,17 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
 		return;
 	}
 
+	/*
+	 * All TLBI commands should be followed by a sync command later.
+	 * The CFGI commands is the same, but they are rarely executed.
+	 * So just optimize TLBI commands now, to reduce the "if" judgement.
+	 */
+	if ((ent->opcode >= CMDQ_OP_TLBI_NH_ALL) &&
+	    (ent->opcode <= CMDQ_OP_TLBI_NSNH_ALL))
+		optimize = 1;
+
 	spin_lock_irqsave(&smmu->cmdq.lock, flags);
-	while (queue_insert_raw(q, cmd) == -ENOSPC) {
+	while (queue_insert_raw(q, cmd, optimize) == -ENOSPC) {
 		if (queue_poll_cons(q, false, wfe))
 			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
 	}
@@ -1953,6 +1982,8 @@ static int arm_smmu_init_one_queue(struct arm_smmu_device *smmu,
 		     << Q_BASE_LOG2SIZE_SHIFT;
 
 	q->prod = q->cons = 0;
+	q->nr_delay = 0;
+
 	return 0;
 }
 
@@ -2512,6 +2543,7 @@ static int arm_smmu_device_hw_probe(struct arm_smmu_device *smmu)
 		dev_err(smmu->dev, "unit-length command queue not supported\n");
 		return -ENXIO;
 	}
+	BUILD_BUG_ON(CMDQ_MAX_DELAYED >= (1 << CMDQ_MAX_SZ_SHIFT));
 
 	smmu->evtq.q.max_n_shift = min((u32)EVTQ_MAX_SZ_SHIFT,
 				       reg >> IDR1_EVTQ_SHIFT & IDR1_EVTQ_MASK);
-- 
2.5.0
^ permalink raw reply related	[flat|nested] 19+ messages in thread
* [PATCH 2/5] iommu: add a new member unmap_tlb_sync into struct iommu_ops
  2017-06-26 13:38 [PATCH 0/5] arm-smmu: performance optimization Zhen Lei
  2017-06-26 13:38 ` [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction Zhen Lei
@ 2017-06-26 13:38 ` Zhen Lei
  2017-06-26 13:38 ` [PATCH 3/5] iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync Zhen Lei
                   ` (3 subsequent siblings)
  5 siblings, 0 replies; 19+ messages in thread
From: Zhen Lei @ 2017-06-26 13:38 UTC (permalink / raw)
  To: linux-arm-kernel
An iova range may contain many pages/blocks, especially in the unmap_sg case.
Currently, each page/block unmap is followed by a tlb invalidation operation
and a wait (called tlb_sync) until that operation is over. But actually we
only need one tlb_sync at the final stage. Look at the loop in function
iommu_unmap:
	while (unmapped < size) {
		...
		unmapped_page = domain->ops->unmap(domain, iova, pgsize);
		...
	}
It's not a good idea to do the tlb_sync in domain->ops->unmap.
Deferring it has several benefits, because the following costs are reduced:
1. The iommu hardware is a resource shared between cpus, so the tlb_sync
   operation needs lock protection.
2. The iommu hardware is not inside the CPU, so starting a tlb_sync and
   checking that it has finished may take a lot of time.
Some people might ask: Is it safe to do so? The answer is yes. The standard
processing flow is:
	alloc iova
	map
	process data
	unmap
	tlb invalidation and sync
	free iova
What must be guaranteed is that the "free iova" action comes after the "unmap"
and "tlbi operation" actions, which is what we are doing right now. This
ensures that all TLB entries for an iova range have been invalidated before
the iova is reallocated.
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 drivers/iommu/iommu.c | 3 +++
 include/linux/iommu.h | 1 +
 2 files changed, 4 insertions(+)
diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c
index cf7ca7e..01e91a8 100644
--- a/drivers/iommu/iommu.c
+++ b/drivers/iommu/iommu.c
@@ -1610,6 +1610,9 @@ size_t iommu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
 		unmapped += unmapped_page;
 	}
 
+	if (domain->ops->unmap_tlb_sync)
+		domain->ops->unmap_tlb_sync(domain);
+
 	trace_unmap(orig_iova, size, unmapped);
 	return unmapped;
 }
diff --git a/include/linux/iommu.h b/include/linux/iommu.h
index 2cb54ad..5964121 100644
--- a/include/linux/iommu.h
+++ b/include/linux/iommu.h
@@ -197,6 +197,7 @@ struct iommu_ops {
 		   phys_addr_t paddr, size_t size, int prot);
 	size_t (*unmap)(struct iommu_domain *domain, unsigned long iova,
 		     size_t size);
+	void (*unmap_tlb_sync)(struct iommu_domain *domain);
 	size_t (*map_sg)(struct iommu_domain *domain, unsigned long iova,
 			 struct scatterlist *sg, unsigned int nents, int prot);
 	phys_addr_t (*iova_to_phys)(struct iommu_domain *domain, dma_addr_t iova);
-- 
2.5.0
^ permalink raw reply related	[flat|nested] 19+ messages in thread
* [PATCH 3/5] iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync
  2017-06-26 13:38 [PATCH 0/5] arm-smmu: performance optimization Zhen Lei
  2017-06-26 13:38 ` [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction Zhen Lei
  2017-06-26 13:38 ` [PATCH 2/5] iommu: add a new member unmap_tlb_sync into struct iommu_ops Zhen Lei
@ 2017-06-26 13:38 ` Zhen Lei
  2017-06-26 13:38 ` [PATCH 4/5] iommu/arm-smmu: add support for unmap a memory " Zhen Lei
                   ` (2 subsequent siblings)
  5 siblings, 0 replies; 19+ messages in thread
From: Zhen Lei @ 2017-06-26 13:38 UTC (permalink / raw)
  To: linux-arm-kernel
1. Remove the tlb_sync operation from "unmap".
2. Make sure each "unmap" is always followed by a tlb sync operation.
The resulting sequence is as below:
	unmap memory page-1
	tlb invalidate page-1
	...
	unmap memory page-n
	tlb invalidate page-n
	tlb sync
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 drivers/iommu/arm-smmu-v3.c    | 10 ++++++++++
 drivers/iommu/io-pgtable-arm.c | 30 ++++++++++++++++++++----------
 drivers/iommu/io-pgtable.h     |  1 +
 3 files changed, 31 insertions(+), 10 deletions(-)
diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
index 4481123..328b9d7 100644
--- a/drivers/iommu/arm-smmu-v3.c
+++ b/drivers/iommu/arm-smmu-v3.c
@@ -1724,6 +1724,15 @@ arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova, size_t size)
 	return ops->unmap(ops, iova, size);
 }
 
+static void arm_smmu_unmap_tlb_sync(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct io_pgtable_ops *ops= smmu_domain->pgtbl_ops;
+
+	if (ops && ops->unmap_tlb_sync)
+		ops->unmap_tlb_sync(ops);
+}
+
 static phys_addr_t
 arm_smmu_iova_to_phys(struct iommu_domain *domain, dma_addr_t iova)
 {
@@ -1943,6 +1952,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.attach_dev		= arm_smmu_attach_dev,
 	.map			= arm_smmu_map,
 	.unmap			= arm_smmu_unmap,
+	.unmap_tlb_sync		= arm_smmu_unmap_tlb_sync,
 	.map_sg			= default_iommu_map_sg,
 	.iova_to_phys		= arm_smmu_iova_to_phys,
 	.add_device		= arm_smmu_add_device,
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 52700fa..8137e62 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -304,6 +304,8 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
 		WARN_ON(!selftest_running);
 		return -EEXIST;
 	} else if (iopte_type(pte, lvl) == ARM_LPAE_PTE_TYPE_TABLE) {
+		size_t unmapped;
+
 		/*
 		 * We need to unmap and free the old table before
 		 * overwriting it with a block entry.
@@ -312,7 +314,9 @@ static int arm_lpae_init_pte(struct arm_lpae_io_pgtable *data,
 		size_t sz = ARM_LPAE_BLOCK_SIZE(lvl, data);
 
 		tblp = ptep - ARM_LPAE_LVL_IDX(iova, lvl, data);
-		if (WARN_ON(__arm_lpae_unmap(data, iova, sz, lvl, tblp) != sz))
+		unmapped = __arm_lpae_unmap(data, iova, sz, lvl, tblp);
+		io_pgtable_tlb_sync(&data->iop);
+		if (WARN_ON(unmapped != sz))
 			return -EINVAL;
 	}
 
@@ -576,7 +580,6 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 			/* Also flush any partial walks */
 			io_pgtable_tlb_add_flush(iop, iova, size,
 						ARM_LPAE_GRANULE(data), false);
-			io_pgtable_tlb_sync(iop);
 			ptep = iopte_deref(pte, data);
 			__arm_lpae_free_pgtable(data, lvl + 1, ptep);
 		} else {
@@ -601,16 +604,18 @@ static int __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 static int arm_lpae_unmap(struct io_pgtable_ops *ops, unsigned long iova,
 			  size_t size)
 {
-	size_t unmapped;
 	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
 	arm_lpae_iopte *ptep = data->pgd;
 	int lvl = ARM_LPAE_START_LVL(data);
 
-	unmapped = __arm_lpae_unmap(data, iova, size, lvl, ptep);
-	if (unmapped)
-		io_pgtable_tlb_sync(&data->iop);
+	return __arm_lpae_unmap(data, iova, size, lvl, ptep);
+}
+
+static void arm_lpae_unmap_tlb_sync(struct io_pgtable_ops *ops)
+{
+	struct arm_lpae_io_pgtable *data = io_pgtable_ops_to_data(ops);
 
-	return unmapped;
+	io_pgtable_tlb_sync(&data->iop);
 }
 
 static phys_addr_t arm_lpae_iova_to_phys(struct io_pgtable_ops *ops,
@@ -723,6 +728,7 @@ arm_lpae_alloc_pgtable(struct io_pgtable_cfg *cfg)
 	data->iop.ops = (struct io_pgtable_ops) {
 		.map		= arm_lpae_map,
 		.unmap		= arm_lpae_unmap,
+		.unmap_tlb_sync	= arm_lpae_unmap_tlb_sync,
 		.iova_to_phys	= arm_lpae_iova_to_phys,
 	};
 
@@ -1019,7 +1025,7 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
 
 	int i, j;
 	unsigned long iova;
-	size_t size;
+	size_t size, unmapped;
 	struct io_pgtable_ops *ops;
 
 	selftest_running = true;
@@ -1071,7 +1077,9 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
 
 		/* Partial unmap */
 		size = 1UL << __ffs(cfg->pgsize_bitmap);
-		if (ops->unmap(ops, SZ_1G + size, size) != size)
+		unmapped = ops->unmap(ops, SZ_1G + size, size);
+		ops->unmap_tlb_sync(ops);
+		if (unmapped != size)
 			return __FAIL(ops, i);
 
 		/* Remap of partial unmap */
@@ -1087,7 +1095,9 @@ static int __init arm_lpae_run_tests(struct io_pgtable_cfg *cfg)
 		while (j != BITS_PER_LONG) {
 			size = 1UL << j;
 
-			if (ops->unmap(ops, iova, size) != size)
+			unmapped = ops->unmap(ops, iova, size);
+			ops->unmap_tlb_sync(ops);
+			if (unmapped != size)
 				return __FAIL(ops, i);
 
 			if (ops->iova_to_phys(ops, iova + 42))
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index 524263a..7b3fc04 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -120,6 +120,7 @@ struct io_pgtable_ops {
 		   phys_addr_t paddr, size_t size, int prot);
 	int (*unmap)(struct io_pgtable_ops *ops, unsigned long iova,
 		     size_t size);
+	void (*unmap_tlb_sync)(struct io_pgtable_ops *ops);
 	phys_addr_t (*iova_to_phys)(struct io_pgtable_ops *ops,
 				    unsigned long iova);
 };
-- 
2.5.0
^ permalink raw reply related	[flat|nested] 19+ messages in thread
* [PATCH 4/5] iommu/arm-smmu: add support for unmap a memory range with only one tlb sync
  2017-06-26 13:38 [PATCH 0/5] arm-smmu: performance optimization Zhen Lei
                   ` (2 preceding siblings ...)
  2017-06-26 13:38 ` [PATCH 3/5] iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync Zhen Lei
@ 2017-06-26 13:38 ` Zhen Lei
  2017-06-26 13:38 ` [PATCH 5/5] iommu/io-pgtable: delete member tlb_sync_pending of struct io_pgtable Zhen Lei
  2017-08-17 14:36 ` [PATCH 0/5] arm-smmu: performance optimization Will Deacon
  5 siblings, 0 replies; 19+ messages in thread
From: Zhen Lei @ 2017-06-26 13:38 UTC (permalink / raw)
  To: linux-arm-kernel
1. Remove the tlb_sync operation from "unmap".
2. Make sure each "unmap" is always followed by a tlb sync operation.
The resulting sequence is as below:
	unmap memory page-1
	tlb invalidate page-1
	...
	unmap memory page-n
	tlb invalidate page-n
	tlb sync
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 drivers/iommu/arm-smmu.c           | 10 ++++++++++
 drivers/iommu/io-pgtable-arm-v7s.c | 32 +++++++++++++++++++++-----------
 2 files changed, 31 insertions(+), 11 deletions(-)
diff --git a/drivers/iommu/arm-smmu.c b/drivers/iommu/arm-smmu.c
index b8d069a..74ca6eb 100644
--- a/drivers/iommu/arm-smmu.c
+++ b/drivers/iommu/arm-smmu.c
@@ -1402,6 +1402,15 @@ static size_t arm_smmu_unmap(struct iommu_domain *domain, unsigned long iova,
 	return ops->unmap(ops, iova, size);
 }
 
+static void arm_smmu_unmap_tlb_sync(struct iommu_domain *domain)
+{
+	struct arm_smmu_domain *smmu_domain = to_smmu_domain(domain);
+	struct io_pgtable_ops *ops= smmu_domain->pgtbl_ops;
+
+	if (ops && ops->unmap_tlb_sync)
+		ops->unmap_tlb_sync(ops);
+}
+
 static phys_addr_t arm_smmu_iova_to_phys_hard(struct iommu_domain *domain,
 					      dma_addr_t iova)
 {
@@ -1698,6 +1707,7 @@ static struct iommu_ops arm_smmu_ops = {
 	.attach_dev		= arm_smmu_attach_dev,
 	.map			= arm_smmu_map,
 	.unmap			= arm_smmu_unmap,
+	.unmap_tlb_sync		= arm_smmu_unmap_tlb_sync,
 	.map_sg			= default_iommu_map_sg,
 	.iova_to_phys		= arm_smmu_iova_to_phys,
 	.add_device		= arm_smmu_add_device,
diff --git a/drivers/iommu/io-pgtable-arm-v7s.c b/drivers/iommu/io-pgtable-arm-v7s.c
index a55fd38..325c1c6 100644
--- a/drivers/iommu/io-pgtable-arm-v7s.c
+++ b/drivers/iommu/io-pgtable-arm-v7s.c
@@ -370,6 +370,8 @@ static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data,
 
 	for (i = 0; i < num_entries; i++)
 		if (ARM_V7S_PTE_IS_TABLE(ptep[i], lvl)) {
+			size_t unmapped;
+
 			/*
 			 * We need to unmap and free the old table before
 			 * overwriting it with a block entry.
@@ -378,8 +380,10 @@ static int arm_v7s_init_pte(struct arm_v7s_io_pgtable *data,
 			size_t sz = ARM_V7S_BLOCK_SIZE(lvl);
 
 			tblp = ptep - ARM_V7S_LVL_IDX(iova, lvl);
-			if (WARN_ON(__arm_v7s_unmap(data, iova + i * sz,
-						    sz, lvl, tblp) != sz))
+			unmapped = __arm_v7s_unmap(data, iova + i * sz,
+						    sz, lvl, tblp);
+			io_pgtable_tlb_sync(&data->iop);
+			if (WARN_ON(unmapped != sz))
 				return -EINVAL;
 		} else if (ptep[i]) {
 			/* We require an unmap first */
@@ -626,7 +630,6 @@ static int __arm_v7s_unmap(struct arm_v7s_io_pgtable *data,
 				/* Also flush any partial walks */
 				io_pgtable_tlb_add_flush(iop, iova, blk_size,
 					ARM_V7S_BLOCK_SIZE(lvl + 1), false);
-				io_pgtable_tlb_sync(iop);
 				ptep = iopte_deref(pte[i], lvl);
 				__arm_v7s_free_table(ptep, lvl + 1, data);
 			} else {
@@ -653,13 +656,15 @@ static int arm_v7s_unmap(struct io_pgtable_ops *ops, unsigned long iova,
 			 size_t size)
 {
 	struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
-	size_t unmapped;
 
-	unmapped = __arm_v7s_unmap(data, iova, size, 1, data->pgd);
-	if (unmapped)
-		io_pgtable_tlb_sync(&data->iop);
+	return __arm_v7s_unmap(data, iova, size, 1, data->pgd);
+}
+
+static void arm_v7s_unmap_tlb_sync(struct io_pgtable_ops *ops)
+{
+	struct arm_v7s_io_pgtable *data = io_pgtable_ops_to_data(ops);
 
-	return unmapped;
+	io_pgtable_tlb_sync(&data->iop);
 }
 
 static phys_addr_t arm_v7s_iova_to_phys(struct io_pgtable_ops *ops,
@@ -724,6 +729,7 @@ static struct io_pgtable *arm_v7s_alloc_pgtable(struct io_pgtable_cfg *cfg,
 	data->iop.ops = (struct io_pgtable_ops) {
 		.map		= arm_v7s_map,
 		.unmap		= arm_v7s_unmap,
+		.unmap_tlb_sync	= arm_v7s_unmap_tlb_sync,
 		.iova_to_phys	= arm_v7s_iova_to_phys,
 	};
 
@@ -822,7 +828,7 @@ static int __init arm_v7s_do_selftests(void)
 		.quirks = IO_PGTABLE_QUIRK_ARM_NS | IO_PGTABLE_QUIRK_NO_DMA,
 		.pgsize_bitmap = SZ_4K | SZ_64K | SZ_1M | SZ_16M,
 	};
-	unsigned int iova, size, iova_start;
+	unsigned int iova, size, unmapped, iova_start;
 	unsigned int i, loopnr = 0;
 
 	selftest_running = true;
@@ -877,7 +883,9 @@ static int __init arm_v7s_do_selftests(void)
 	size = 1UL << __ffs(cfg.pgsize_bitmap);
 	while (i < loopnr) {
 		iova_start = i * SZ_16M;
-		if (ops->unmap(ops, iova_start + size, size) != size)
+		unmapped = ops->unmap(ops, iova_start + size, size);
+		ops->unmap_tlb_sync(ops);
+		if (unmapped != size)
 			return __FAIL(ops);
 
 		/* Remap of partial unmap */
@@ -896,7 +904,9 @@ static int __init arm_v7s_do_selftests(void)
 	while (i != BITS_PER_LONG) {
 		size = 1UL << i;
 
-		if (ops->unmap(ops, iova, size) != size)
+		unmapped = ops->unmap(ops, iova, size);
+		ops->unmap_tlb_sync(ops);
+		if (unmapped != size)
 			return __FAIL(ops);
 
 		if (ops->iova_to_phys(ops, iova + 42))
-- 
2.5.0
^ permalink raw reply related	[flat|nested] 19+ messages in thread
* [PATCH 5/5] iommu/io-pgtable: delete member tlb_sync_pending of struct io_pgtable
  2017-06-26 13:38 [PATCH 0/5] arm-smmu: performance optimization Zhen Lei
                   ` (3 preceding siblings ...)
  2017-06-26 13:38 ` [PATCH 4/5] iommu/arm-smmu: add support for unmap a memory " Zhen Lei
@ 2017-06-26 13:38 ` Zhen Lei
  2017-08-17 14:36 ` [PATCH 0/5] arm-smmu: performance optimization Will Deacon
  5 siblings, 0 replies; 19+ messages in thread
From: Zhen Lei @ 2017-06-26 13:38 UTC (permalink / raw)
  To: linux-arm-kernel
This member is no longer needed, because the previous patches ensure that each
unmap is always followed by a tlb sync operation.
By the way, ->tlb_flush_all executes tlb_sync by itself.
Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
---
 drivers/iommu/io-pgtable.h | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)
diff --git a/drivers/iommu/io-pgtable.h b/drivers/iommu/io-pgtable.h
index 7b3fc04..43ddf1f 100644
--- a/drivers/iommu/io-pgtable.h
+++ b/drivers/iommu/io-pgtable.h
@@ -166,7 +166,6 @@ void free_io_pgtable_ops(struct io_pgtable_ops *ops);
 struct io_pgtable {
 	enum io_pgtable_fmt	fmt;
 	void			*cookie;
-	bool			tlb_sync_pending;
 	struct io_pgtable_cfg	cfg;
 	struct io_pgtable_ops	ops;
 };
@@ -176,22 +175,17 @@ struct io_pgtable {
 static inline void io_pgtable_tlb_flush_all(struct io_pgtable *iop)
 {
 	iop->cfg.tlb->tlb_flush_all(iop->cookie);
-	iop->tlb_sync_pending = true;
 }
 
 static inline void io_pgtable_tlb_add_flush(struct io_pgtable *iop,
 		unsigned long iova, size_t size, size_t granule, bool leaf)
 {
 	iop->cfg.tlb->tlb_add_flush(iova, size, granule, leaf, iop->cookie);
-	iop->tlb_sync_pending = true;
 }
 
 static inline void io_pgtable_tlb_sync(struct io_pgtable *iop)
 {
-	if (iop->tlb_sync_pending) {
-		iop->cfg.tlb->tlb_sync(iop->cookie);
-		iop->tlb_sync_pending = false;
-	}
+	iop->cfg.tlb->tlb_sync(iop->cookie);
 }
 
 /**
-- 
2.5.0
^ permalink raw reply related	[flat|nested] 19+ messages in thread
* [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-06-26 13:38 ` [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction Zhen Lei
@ 2017-06-28  9:32   ` Will Deacon
  2017-06-29  2:08     ` Leizhen (ThunderTown)
  2017-08-22 15:41   ` Joerg Roedel
  1 sibling, 1 reply; 19+ messages in thread
From: Will Deacon @ 2017-06-28  9:32 UTC (permalink / raw)
  To: linux-arm-kernel
Hi Zhen Lei,
Nate (CC'd), Robin and I have been working on something very similar to
this series, but this patch is different to what we had planned. More below.
On Mon, Jun 26, 2017 at 09:38:46PM +0800, Zhen Lei wrote:
> Because all TLBI commands should be followed by a SYNC command, to make
> sure that it has been completely finished. So we can just add the TLBI
> commands into the queue, and put off the execution until meet SYNC or
> other commands. To prevent the followed SYNC command waiting for a long
> time because of too many commands have been delayed, restrict the max
> delayed number.
> 
> According to my test, I got the same performance data as I replaced writel
> with writel_relaxed in queue_inc_prod.
> 
> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> ---
>  drivers/iommu/arm-smmu-v3.c | 42 +++++++++++++++++++++++++++++++++++++-----
>  1 file changed, 37 insertions(+), 5 deletions(-)
> 
> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> index 291da5f..4481123 100644
> --- a/drivers/iommu/arm-smmu-v3.c
> +++ b/drivers/iommu/arm-smmu-v3.c
> @@ -337,6 +337,7 @@
>  /* Command queue */
>  #define CMDQ_ENT_DWORDS			2
>  #define CMDQ_MAX_SZ_SHIFT		8
> +#define CMDQ_MAX_DELAYED		32
>  
>  #define CMDQ_ERR_SHIFT			24
>  #define CMDQ_ERR_MASK			0x7f
> @@ -472,6 +473,7 @@ struct arm_smmu_cmdq_ent {
>  			};
>  		} cfgi;
>  
> +		#define CMDQ_OP_TLBI_NH_ALL	0x10
>  		#define CMDQ_OP_TLBI_NH_ASID	0x11
>  		#define CMDQ_OP_TLBI_NH_VA	0x12
>  		#define CMDQ_OP_TLBI_EL2_ALL	0x20
> @@ -499,6 +501,7 @@ struct arm_smmu_cmdq_ent {
>  
>  struct arm_smmu_queue {
>  	int				irq; /* Wired interrupt */
> +	u32				nr_delay;
>  
>  	__le64				*base;
>  	dma_addr_t			base_dma;
> @@ -722,11 +725,16 @@ static int queue_sync_prod(struct arm_smmu_queue *q)
>  	return ret;
>  }
>  
> -static void queue_inc_prod(struct arm_smmu_queue *q)
> +static void queue_inc_swprod(struct arm_smmu_queue *q)
>  {
> -	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
> +	u32 prod = q->prod + 1;
>  
>  	q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
> +}
> +
> +static void queue_inc_prod(struct arm_smmu_queue *q)
> +{
> +	queue_inc_swprod(q);
>  	writel(q->prod, q->prod_reg);
>  }
>  
> @@ -761,13 +769,24 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
>  		*dst++ = cpu_to_le64(*src++);
>  }
>  
> -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
> +static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent, int optimize)
>  {
>  	if (queue_full(q))
>  		return -ENOSPC;
>  
>  	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
> -	queue_inc_prod(q);
> +
> +	/*
> +	 * We don't want too many commands to be delayed, this may lead the
> +	 * followed sync command to wait for a long time.
> +	 */
> +	if (optimize && (++q->nr_delay < CMDQ_MAX_DELAYED)) {
> +		queue_inc_swprod(q);
> +	} else {
> +		queue_inc_prod(q);
> +		q->nr_delay = 0;
> +	}
> +
So here, you're effectively putting invalidation commands into the command
queue without updating PROD. Do you actually see a performance advantage
from doing so? Another side of the argument would be that we should be
moving PROD as soon as we can, so that the SMMU can process invalidation
commands in the background and reduce the cost of the final SYNC operation
when the high-level unmap operation is complete.
Will
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-06-28  9:32   ` Will Deacon
@ 2017-06-29  2:08     ` Leizhen (ThunderTown)
  2017-07-17 13:06       ` John Garry
  0 siblings, 1 reply; 19+ messages in thread
From: Leizhen (ThunderTown) @ 2017-06-29  2:08 UTC (permalink / raw)
  To: linux-arm-kernel
On 2017/6/28 17:32, Will Deacon wrote:
> Hi Zhen Lei,
> 
> Nate (CC'd), Robin and I have been working on something very similar to
> this series, but this patch is different to what we had planned. More below.
> 
> On Mon, Jun 26, 2017 at 09:38:46PM +0800, Zhen Lei wrote:
>> Because all TLBI commands should be followed by a SYNC command, to make
>> sure that it has been completely finished. So we can just add the TLBI
>> commands into the queue, and put off the execution until meet SYNC or
>> other commands. To prevent the followed SYNC command waiting for a long
>> time because of too many commands have been delayed, restrict the max
>> delayed number.
>>
>> According to my test, I got the same performance data as I replaced writel
>> with writel_relaxed in queue_inc_prod.
>>
>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>> ---
>>  drivers/iommu/arm-smmu-v3.c | 42 +++++++++++++++++++++++++++++++++++++-----
>>  1 file changed, 37 insertions(+), 5 deletions(-)
>>
>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>> index 291da5f..4481123 100644
>> --- a/drivers/iommu/arm-smmu-v3.c
>> +++ b/drivers/iommu/arm-smmu-v3.c
>> @@ -337,6 +337,7 @@
>>  /* Command queue */
>>  #define CMDQ_ENT_DWORDS			2
>>  #define CMDQ_MAX_SZ_SHIFT		8
>> +#define CMDQ_MAX_DELAYED		32
>>  
>>  #define CMDQ_ERR_SHIFT			24
>>  #define CMDQ_ERR_MASK			0x7f
>> @@ -472,6 +473,7 @@ struct arm_smmu_cmdq_ent {
>>  			};
>>  		} cfgi;
>>  
>> +		#define CMDQ_OP_TLBI_NH_ALL	0x10
>>  		#define CMDQ_OP_TLBI_NH_ASID	0x11
>>  		#define CMDQ_OP_TLBI_NH_VA	0x12
>>  		#define CMDQ_OP_TLBI_EL2_ALL	0x20
>> @@ -499,6 +501,7 @@ struct arm_smmu_cmdq_ent {
>>  
>>  struct arm_smmu_queue {
>>  	int				irq; /* Wired interrupt */
>> +	u32				nr_delay;
>>  
>>  	__le64				*base;
>>  	dma_addr_t			base_dma;
>> @@ -722,11 +725,16 @@ static int queue_sync_prod(struct arm_smmu_queue *q)
>>  	return ret;
>>  }
>>  
>> -static void queue_inc_prod(struct arm_smmu_queue *q)
>> +static void queue_inc_swprod(struct arm_smmu_queue *q)
>>  {
>> -	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
>> +	u32 prod = q->prod + 1;
>>  
>>  	q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
>> +}
>> +
>> +static void queue_inc_prod(struct arm_smmu_queue *q)
>> +{
>> +	queue_inc_swprod(q);
>>  	writel(q->prod, q->prod_reg);
>>  }
>>  
>> @@ -761,13 +769,24 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
>>  		*dst++ = cpu_to_le64(*src++);
>>  }
>>  
>> -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
>> +static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent, int optimize)
>>  {
>>  	if (queue_full(q))
>>  		return -ENOSPC;
>>  
>>  	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
>> -	queue_inc_prod(q);
>> +
>> +	/*
>> +	 * We don't want too many commands to be delayed, this may lead the
>> +	 * followed sync command to wait for a long time.
>> +	 */
>> +	if (optimize && (++q->nr_delay < CMDQ_MAX_DELAYED)) {
>> +		queue_inc_swprod(q);
>> +	} else {
>> +		queue_inc_prod(q);
>> +		q->nr_delay = 0;
>> +	}
>> +
> 
> So here, you're effectively putting invalidation commands into the command
> queue without updating PROD. Do you actually see a performance advantage
> from doing so? Another side of the argument would be that we should be
Yes, my sas ssd performance test showed an improvement of about 100-150K/s (the same as when I directly replaced
writel with writel_relaxed). And the average execution time of iommu_unmap (which is called by iommu_dma_unmap_sg)
dropped from 10us to 5us.
> moving PROD as soon as we can, so that the SMMU can process invalidation
> commands in the background and reduce the cost of the final SYNC operation
> when the high-level unmap operation is complete.
It may be that __iowmb() is more expensive than waiting for the tlbi to complete. Besides the time taken by
__iowmb() itself, it is also executed under the spinlock, so lock contention rises rapidly under stress.
__iowmb() costs 300-500ns on average (sorry, I forget the exact value).
In addition, after applying this patchset, Robin's v2, and my earlier dma64 iova optimization patchset,
our net performance test got the same results as global bypass. But sas ssd still shows a drop of more than 20%.
Maybe we should still focus on map/unmap, because the average execution time of iova alloc/free is only
about 400ns.
By the way, patches 2-5 are more effective than this one: they give an improvement of more than 350K/s. And with
them, we get about 100-150K/s of improvement from Robin's v2; otherwise, I saw no effect from Robin's v2. Sorry, I
have not tested this patch without patches 2-5. Furthermore, for a traditional mechanical hard disk I got the same
performance as global bypass with only patches 2-5 (without this patch and Robin's).
> 
> Will
> 
> .
> 
-- 
Thanks!
BestRegards
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-06-29  2:08     ` Leizhen (ThunderTown)
@ 2017-07-17 13:06       ` John Garry
  2017-07-17 14:23         ` Jonathan Cameron
  0 siblings, 1 reply; 19+ messages in thread
From: John Garry @ 2017-07-17 13:06 UTC (permalink / raw)
  To: linux-arm-kernel
+
On 29/06/2017 03:08, Leizhen (ThunderTown) wrote:
>
>
> On 2017/6/28 17:32, Will Deacon wrote:
>> Hi Zhen Lei,
>>
>> Nate (CC'd), Robin and I have been working on something very similar to
>> this series, but this patch is different to what we had planned. More below.
>>
>> On Mon, Jun 26, 2017 at 09:38:46PM +0800, Zhen Lei wrote:
>>> Because all TLBI commands should be followed by a SYNC command, to make
>>> sure that it has been completely finished. So we can just add the TLBI
>>> commands into the queue, and put off the execution until meet SYNC or
>>> other commands. To prevent the followed SYNC command waiting for a long
>>> time because of too many commands have been delayed, restrict the max
>>> delayed number.
>>>
>>> According to my test, I got the same performance data as I replaced writel
>>> with writel_relaxed in queue_inc_prod.
>>>
>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>>> ---
>>>  drivers/iommu/arm-smmu-v3.c | 42 +++++++++++++++++++++++++++++++++++++-----
>>>  1 file changed, 37 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>>> index 291da5f..4481123 100644
>>> --- a/drivers/iommu/arm-smmu-v3.c
>>> +++ b/drivers/iommu/arm-smmu-v3.c
>>> @@ -337,6 +337,7 @@
>>>  /* Command queue */
>>>  #define CMDQ_ENT_DWORDS			2
>>>  #define CMDQ_MAX_SZ_SHIFT		8
>>> +#define CMDQ_MAX_DELAYED		32
>>>
>>>  #define CMDQ_ERR_SHIFT			24
>>>  #define CMDQ_ERR_MASK			0x7f
>>> @@ -472,6 +473,7 @@ struct arm_smmu_cmdq_ent {
>>>  			};
>>>  		} cfgi;
>>>
>>> +		#define CMDQ_OP_TLBI_NH_ALL	0x10
>>>  		#define CMDQ_OP_TLBI_NH_ASID	0x11
>>>  		#define CMDQ_OP_TLBI_NH_VA	0x12
>>>  		#define CMDQ_OP_TLBI_EL2_ALL	0x20
>>> @@ -499,6 +501,7 @@ struct arm_smmu_cmdq_ent {
>>>
>>>  struct arm_smmu_queue {
>>>  	int				irq; /* Wired interrupt */
>>> +	u32				nr_delay;
>>>
>>>  	__le64				*base;
>>>  	dma_addr_t			base_dma;
>>> @@ -722,11 +725,16 @@ static int queue_sync_prod(struct arm_smmu_queue *q)
>>>  	return ret;
>>>  }
>>>
>>> -static void queue_inc_prod(struct arm_smmu_queue *q)
>>> +static void queue_inc_swprod(struct arm_smmu_queue *q)
>>>  {
>>> -	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
>>> +	u32 prod = q->prod + 1;
>>>
>>>  	q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
>>> +}
>>> +
>>> +static void queue_inc_prod(struct arm_smmu_queue *q)
>>> +{
>>> +	queue_inc_swprod(q);
>>>  	writel(q->prod, q->prod_reg);
>>>  }
>>>
>>> @@ -761,13 +769,24 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
>>>  		*dst++ = cpu_to_le64(*src++);
>>>  }
>>>
>>> -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
>>> +static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent, int optimize)
>>>  {
>>>  	if (queue_full(q))
>>>  		return -ENOSPC;
>>>
>>>  	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
>>> -	queue_inc_prod(q);
>>> +
>>> +	/*
>>> +	 * We don't want too many commands to be delayed, this may lead the
>>> +	 * followed sync command to wait for a long time.
>>> +	 */
>>> +	if (optimize && (++q->nr_delay < CMDQ_MAX_DELAYED)) {
>>> +		queue_inc_swprod(q);
>>> +	} else {
>>> +		queue_inc_prod(q);
>>> +		q->nr_delay = 0;
>>> +	}
>>> +
>>
>> So here, you're effectively putting invalidation commands into the command
>> queue without updating PROD. Do you actually see a performance advantage
>> from doing so? Another side of the argument would be that we should be
> Yes, my sas ssd performance test showed that it can improve about 100-150K/s(the same to I directly replace
> writel with writel_relaxed). And the average execution time of iommu_unmap(which called by iommu_dma_unmap_sg)
> dropped from 10us to 5us.
>
>> moving PROD as soon as we can, so that the SMMU can process invalidation
>> commands in the background and reduce the cost of the final SYNC operation
>> when the high-level unmap operation is complete.
> There maybe that __iowmb() is more expensive than wait for tlbi complete. Except the time of __iowmb()
> itself, it also protected by spinlock, lock confliction will rise rapidly in the stress scene. __iowmb()
> average cost 300-500ns(Sorry, I forget the exact value).
>
> In addition, after applied this patcheset and Robin's v2, and my earlier dma64 iova optimization patchset.
> Our net performance test got the same data to global bypass. But sas ssd still have more than 20% dropped.
> Maybe we should still focus at map/unamp, because the average execution time of iova alloc/free is only
> about 400ns.
>
> By the way, patch2-5 is more effective than this one, it can improve more than 350K/s. And with it, we can
> got about 100-150K/s improvement of Robin's v2. Otherwise, I saw non effective of Robin's v2. Sorry, I have
> not tested how about this patch without patch2-5. Further more, I got the same performance data to global
> bypass for the traditional mechanical hard disk with only patch2-5(without this patch and Robin's).
>
>>
>> Will
>>
>> .
>>
>
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-07-17 13:06       ` John Garry
@ 2017-07-17 14:23         ` Jonathan Cameron
  2017-07-17 17:28           ` Nate Watterson
  0 siblings, 1 reply; 19+ messages in thread
From: Jonathan Cameron @ 2017-07-17 14:23 UTC (permalink / raw)
  To: linux-arm-kernel
On Mon, 17 Jul 2017 14:06:42 +0100
John Garry <john.garry@huawei.com> wrote:
> +
> 
> On 29/06/2017 03:08, Leizhen (ThunderTown) wrote:
> >
> >
> > On 2017/6/28 17:32, Will Deacon wrote:  
> >> Hi Zhen Lei,
> >>
> >> Nate (CC'd), Robin and I have been working on something very similar to
> >> this series, but this patch is different to what we had planned. More below.
> >>
> >> On Mon, Jun 26, 2017 at 09:38:46PM +0800, Zhen Lei wrote:  
> >>> Because all TLBI commands should be followed by a SYNC command, to make
> >>> sure that it has been completely finished. So we can just add the TLBI
> >>> commands into the queue, and put off the execution until meet SYNC or
> >>> other commands. To prevent the followed SYNC command waiting for a long
> >>> time because of too many commands have been delayed, restrict the max
> >>> delayed number.
> >>>
> >>> According to my test, I got the same performance data as I replaced writel
> >>> with writel_relaxed in queue_inc_prod.
> >>>
> >>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> >>> ---
> >>>  drivers/iommu/arm-smmu-v3.c | 42 +++++++++++++++++++++++++++++++++++++-----
> >>>  1 file changed, 37 insertions(+), 5 deletions(-)
> >>>
> >>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> >>> index 291da5f..4481123 100644
> >>> --- a/drivers/iommu/arm-smmu-v3.c
> >>> +++ b/drivers/iommu/arm-smmu-v3.c
> >>> @@ -337,6 +337,7 @@
> >>>  /* Command queue */
> >>>  #define CMDQ_ENT_DWORDS			2
> >>>  #define CMDQ_MAX_SZ_SHIFT		8
> >>> +#define CMDQ_MAX_DELAYED		32
> >>>
> >>>  #define CMDQ_ERR_SHIFT			24
> >>>  #define CMDQ_ERR_MASK			0x7f
> >>> @@ -472,6 +473,7 @@ struct arm_smmu_cmdq_ent {
> >>>  			};
> >>>  		} cfgi;
> >>>
> >>> +		#define CMDQ_OP_TLBI_NH_ALL	0x10
> >>>  		#define CMDQ_OP_TLBI_NH_ASID	0x11
> >>>  		#define CMDQ_OP_TLBI_NH_VA	0x12
> >>>  		#define CMDQ_OP_TLBI_EL2_ALL	0x20
> >>> @@ -499,6 +501,7 @@ struct arm_smmu_cmdq_ent {
> >>>
> >>>  struct arm_smmu_queue {
> >>>  	int				irq; /* Wired interrupt */
> >>> +	u32				nr_delay;
> >>>
> >>>  	__le64				*base;
> >>>  	dma_addr_t			base_dma;
> >>> @@ -722,11 +725,16 @@ static int queue_sync_prod(struct arm_smmu_queue *q)
> >>>  	return ret;
> >>>  }
> >>>
> >>> -static void queue_inc_prod(struct arm_smmu_queue *q)
> >>> +static void queue_inc_swprod(struct arm_smmu_queue *q)
> >>>  {
> >>> -	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
> >>> +	u32 prod = q->prod + 1;
> >>>
> >>>  	q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
> >>> +}
> >>> +
> >>> +static void queue_inc_prod(struct arm_smmu_queue *q)
> >>> +{
> >>> +	queue_inc_swprod(q);
> >>>  	writel(q->prod, q->prod_reg);
> >>>  }
> >>>
> >>> @@ -761,13 +769,24 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
> >>>  		*dst++ = cpu_to_le64(*src++);
> >>>  }
> >>>
> >>> -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
> >>> +static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent, int optimize)
> >>>  {
> >>>  	if (queue_full(q))
> >>>  		return -ENOSPC;
> >>>
> >>>  	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
> >>> -	queue_inc_prod(q);
> >>> +
> >>> +	/*
> >>> +	 * We don't want too many commands to be delayed, this may lead the
> >>> +	 * followed sync command to wait for a long time.
> >>> +	 */
> >>> +	if (optimize && (++q->nr_delay < CMDQ_MAX_DELAYED)) {
> >>> +		queue_inc_swprod(q);
> >>> +	} else {
> >>> +		queue_inc_prod(q);
> >>> +		q->nr_delay = 0;
> >>> +	}
> >>> +  
> >>
> >> So here, you're effectively putting invalidation commands into the command
> >> queue without updating PROD. Do you actually see a performance advantage
> >> from doing so? Another side of the argument would be that we should be  
> > Yes, my sas ssd performance test showed that it can improve about 100-150K/s(the same to I directly replace
> > writel with writel_relaxed). And the average execution time of iommu_unmap(which called by iommu_dma_unmap_sg)
> > dropped from 10us to 5us.
> >  
> >> moving PROD as soon as we can, so that the SMMU can process invalidation
> >> commands in the background and reduce the cost of the final SYNC operation
> >> when the high-level unmap operation is complete.  
> > There maybe that __iowmb() is more expensive than wait for tlbi complete. Except the time of __iowmb()
> > itself, it also protected by spinlock, lock confliction will rise rapidly in the stress scene. __iowmb()
> > average cost 300-500ns(Sorry, I forget the exact value).
> >
> > In addition, after applied this patcheset and Robin's v2, and my earlier dma64 iova optimization patchset.
> > Our net performance test got the same data to global bypass. But sas ssd still have more than 20% dropped.
> > Maybe we should still focus at map/unamp, because the average execution time of iova alloc/free is only
> > about 400ns.
> >
> > By the way, patch2-5 is more effective than this one, it can improve more than 350K/s. And with it, we can
> > got about 100-150K/s improvement of Robin's v2. Otherwise, I saw non effective of Robin's v2. Sorry, I have
> > not tested how about this patch without patch2-5. Further more, I got the same performance data to global
> > bypass for the traditional mechanical hard disk with only patch2-5(without this patch and Robin's).
> >  
Hi All,
I'm a bit of a late entry to this discussion.  Just been running some more
detailed tests on our d05 boards and wanted to bring some more numbers to
the discussion.
All tests against 4.12 with the following additions:
* Robin's series removing the io-pgtable spinlock (and a few recent fixes)
* Cherry picked updates to the sas driver, merged prior to 4.13-rc1
* An additional HNS (network card) bug fix that will be upstreamed shortly.
I've broken the results down into this patch and this patch + the remainder
of the set. As leizhen mentioned we got a nice little performance
bump from Robin's series so that was applied first (as it's in mainline now)
SAS tests were fio with noop scheduler, 4k block size and various io depths
1 process per disk.  Note this is probably a different setup to leizhen's
original numbers.
Percentages are relative to the performance seen with the smmu disabled.
SAS
4.12 - none of this series.
SMMU disabled
read io-depth 32 -   384K IOPS (100%)
read io-depth 2048 - 950K IOPS (100%)
rw io-depth 32 -     166K IOPS (100%)
rw io-depth 2048 -   340K IOPS (100%)
SMMU enabled
read io-depth 32 -   201K IOPS (52%)
read io-depth 2048 - 306K IOPS (32%)
rw io-depth 32 -     99K  IOPS (60%)
rw io-depth 2048 -   150K IOPS (44%)
Robin's recent series with fixes as seen on list (now merged)
SMMU enabled.
read io-depth 32 -   208K IOPS (54%)
read io-depth 2048 - 335K IOPS (35%)
rw io-depth 32 -     105K IOPS (63%)
rw io-depth 2048 -   165K IOPS (49%)
4.12 + Robin's series + just this patch SMMU enabled
(iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock conflict)
read io-depth 32 -   225K IOPS (59%)
read io-depth 2048 - 365K IOPS (38%) 
rw io-depth 32 -     110K IOPS (66%)
rw io-depth 2048 -   179K IOPS (53%)
4.12 + Robin's series + Second part of this series
(iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock conflict)
(iommu: add a new member unmap_tlb_sync into struct iommu_ops)
(iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync)
(iommu/arm-smmu: add support for unmap of a memory range with only one tlb sync)
read io-depth 32 -    225K IOPS (59%)
read io-depth 2048 -  833K IOPS (88%)
rw io-depth 32 -      112K IOPS (67%)
rw io-depth 2048 -    220K IOPS (65%)
Robin's series gave us small gains across the board (3-5% recovered)
relative to the no smmu performance (which we are taking as the ideal case)
This first patch gets us back another 2-5% of the no smmu performance
The next few patches get us very little advantage on the small io-depths
but make a large difference to the larger io-depths - in particular the
read IOPS which is over twice as fast as without the series.
For HNS it seems that we are less dependent on the SMMU performance and
can reach the non SMMU speed.
Tests with
iperf -t 30 -i 10 -c IPADDRESS -P 3 last 10 seconds taken to avoid any
initial variability.
The server end of the link was always running with smmu v3 disabled
so as to act as a fast sink of the data. Some variation seen across
repeat runs.
Mainline v4.12 + network card fix
NO SMMU
9.42 GBits/sec
SMMU
4.36 GBits/sec (46%)
Robin's io-pgtable spinlock series
6.68 to 7.34 GBits/sec (71% - 78%, variation across runs)
Just this patch SMMU enabled
(iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock conflict)
7.96-8.8 GBits/sec (85% - 94%  some variation across runs)
Full series
(iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock conflict)
(iommu: add a new member unmap_tlb_sync into struct iommu_ops)
(iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync)
(iommu/arm-smmu: add support for unmap of a memory range with only one tlb sync)
9.42 GBits/Sec (100%)
So HNS test shows a greater boost from Robin's series and this first patch.
This is most likely because the HNS test is not putting as high a load on
the SMMU and associated code as the SAS test.
In both cases however, this shows that both parts of this patch
series are beneficial.
So on to the questions ;)
Will, you mentioned that along with Robin and Nate you were working on
a somewhat related strategy to improve the performance.  Any ETA on that?
As you might imagine, with the above numbers we are very keen to try and
move forward with this as quickly as possible.
If you want additional testing we would be happy to help.
Thanks,
Jonathan
> >>
> >> Will
> >>
> >> .
> >>  
> >  
> 
> 
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-07-17 14:23         ` Jonathan Cameron
@ 2017-07-17 17:28           ` Nate Watterson
  2017-07-18  9:20             ` Jonathan Cameron
  0 siblings, 1 reply; 19+ messages in thread
From: Nate Watterson @ 2017-07-17 17:28 UTC (permalink / raw)
  To: linux-arm-kernel
Hi Jonathan,
On 7/17/2017 10:23 AM, Jonathan Cameron wrote:
> On Mon, 17 Jul 2017 14:06:42 +0100
> John Garry <john.garry@huawei.com> wrote:
> 
>> +
>>
>> On 29/06/2017 03:08, Leizhen (ThunderTown) wrote:
>>>
>>>
>>> On 2017/6/28 17:32, Will Deacon wrote:
>>>> Hi Zhen Lei,
>>>>
>>>> Nate (CC'd), Robin and I have been working on something very similar to
>>>> this series, but this patch is different to what we had planned. More below.
>>>>
>>>> On Mon, Jun 26, 2017 at 09:38:46PM +0800, Zhen Lei wrote:
>>>>> Because all TLBI commands should be followed by a SYNC command, to make
>>>>> sure that it has been completely finished. So we can just add the TLBI
>>>>> commands into the queue, and put off the execution until meet SYNC or
>>>>> other commands. To prevent the followed SYNC command waiting for a long
>>>>> time because of too many commands have been delayed, restrict the max
>>>>> delayed number.
>>>>>
>>>>> According to my test, I got the same performance data as I replaced writel
>>>>> with writel_relaxed in queue_inc_prod.
>>>>>
>>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
>>>>> ---
>>>>>   drivers/iommu/arm-smmu-v3.c | 42 +++++++++++++++++++++++++++++++++++++-----
>>>>>   1 file changed, 37 insertions(+), 5 deletions(-)
>>>>>
>>>>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
>>>>> index 291da5f..4481123 100644
>>>>> --- a/drivers/iommu/arm-smmu-v3.c
>>>>> +++ b/drivers/iommu/arm-smmu-v3.c
>>>>> @@ -337,6 +337,7 @@
>>>>>   /* Command queue */
>>>>>   #define CMDQ_ENT_DWORDS			2
>>>>>   #define CMDQ_MAX_SZ_SHIFT		8
>>>>> +#define CMDQ_MAX_DELAYED		32
>>>>>
>>>>>   #define CMDQ_ERR_SHIFT			24
>>>>>   #define CMDQ_ERR_MASK			0x7f
>>>>> @@ -472,6 +473,7 @@ struct arm_smmu_cmdq_ent {
>>>>>   			};
>>>>>   		} cfgi;
>>>>>
>>>>> +		#define CMDQ_OP_TLBI_NH_ALL	0x10
>>>>>   		#define CMDQ_OP_TLBI_NH_ASID	0x11
>>>>>   		#define CMDQ_OP_TLBI_NH_VA	0x12
>>>>>   		#define CMDQ_OP_TLBI_EL2_ALL	0x20
>>>>> @@ -499,6 +501,7 @@ struct arm_smmu_cmdq_ent {
>>>>>
>>>>>   struct arm_smmu_queue {
>>>>>   	int				irq; /* Wired interrupt */
>>>>> +	u32				nr_delay;
>>>>>
>>>>>   	__le64				*base;
>>>>>   	dma_addr_t			base_dma;
>>>>> @@ -722,11 +725,16 @@ static int queue_sync_prod(struct arm_smmu_queue *q)
>>>>>   	return ret;
>>>>>   }
>>>>>
>>>>> -static void queue_inc_prod(struct arm_smmu_queue *q)
>>>>> +static void queue_inc_swprod(struct arm_smmu_queue *q)
>>>>>   {
>>>>> -	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
>>>>> +	u32 prod = q->prod + 1;
>>>>>
>>>>>   	q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
>>>>> +}
>>>>> +
>>>>> +static void queue_inc_prod(struct arm_smmu_queue *q)
>>>>> +{
>>>>> +	queue_inc_swprod(q);
>>>>>   	writel(q->prod, q->prod_reg);
>>>>>   }
>>>>>
>>>>> @@ -761,13 +769,24 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
>>>>>   		*dst++ = cpu_to_le64(*src++);
>>>>>   }
>>>>>
>>>>> -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
>>>>> +static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent, int optimize)
>>>>>   {
>>>>>   	if (queue_full(q))
>>>>>   		return -ENOSPC;
>>>>>
>>>>>   	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
>>>>> -	queue_inc_prod(q);
>>>>> +
>>>>> +	/*
>>>>> +	 * We don't want too many commands to be delayed, this may lead the
>>>>> +	 * followed sync command to wait for a long time.
>>>>> +	 */
>>>>> +	if (optimize && (++q->nr_delay < CMDQ_MAX_DELAYED)) {
>>>>> +		queue_inc_swprod(q);
>>>>> +	} else {
>>>>> +		queue_inc_prod(q);
>>>>> +		q->nr_delay = 0;
>>>>> +	}
>>>>> +
>>>>
>>>> So here, you're effectively putting invalidation commands into the command
>>>> queue without updating PROD. Do you actually see a performance advantage
>>>> from doing so? Another side of the argument would be that we should be
>>> Yes, my sas ssd performance test showed that it can improve about 100-150K/s(the same to I directly replace
>>> writel with writel_relaxed). And the average execution time of iommu_unmap(which called by iommu_dma_unmap_sg)
>>> dropped from 10us to 5us.
>>>   
>>>> moving PROD as soon as we can, so that the SMMU can process invalidation
>>>> commands in the background and reduce the cost of the final SYNC operation
>>>> when the high-level unmap operation is complete.
>>> There maybe that __iowmb() is more expensive than wait for tlbi complete. Except the time of __iowmb()
>>> itself, it also protected by spinlock, lock confliction will rise rapidly in the stress scene. __iowmb()
>>> average cost 300-500ns(Sorry, I forget the exact value).
>>>
>>> In addition, after applied this patcheset and Robin's v2, and my earlier dma64 iova optimization patchset.
>>> Our net performance test got the same data to global bypass. But sas ssd still have more than 20% dropped.
>>> Maybe we should still focus at map/unamp, because the average execution time of iova alloc/free is only
>>> about 400ns.
>>>
>>> By the way, patch2-5 is more effective than this one, it can improve more than 350K/s. And with it, we can
>>> got about 100-150K/s improvement of Robin's v2. Otherwise, I saw non effective of Robin's v2. Sorry, I have
>>> not tested how about this patch without patch2-5. Further more, I got the same performance data to global
>>> bypass for the traditional mechanical hard disk with only patch2-5(without this patch and Robin's).
>>>   
> Hi All,
> 
> I'm a bit of late entry to this discussion.  Just been running some more
> detailed tests on our d05 boards and wanted to bring some more numbers to
> the discussion.
> 
> All tests against 4.12 with the following additions:
> * Robin's series removing the io-pgtable spinlock (and a few recent fixes)
> * Cherry picked updates to the sas driver, merged prior to 4.13-rc1
> * An additional HNS (network card) bug fix that will be upstreamed shortly.
> 
> I've broken the results down into this patch and this patch + the remainder
> of the set. As leizhen mentioned we got a nice little performance
> bump from Robin's series so that was applied first (as it's in mainline now)
> 
> SAS tests were fio with noop scheduler, 4k block size and various io depths
> 1 process per disk.  Note this is probably a different setup to leizhen's
> original numbers.
> 
> Precentages are off the performance seen with the smmu disabled.
> SAS
> 4.12 - none of this series.
> SMMU disabled
> read io-depth 32 -   384K IOPS (100%)
> read io-depth 2048 - 950K IOPS (100%)
> rw io-depth 32 -     166K IOPS (100%)
> rw io-depth 2048 -   340K IOPS (100%)
> 
> SMMU enabled
> read io-depth 32 -   201K IOPS (52%)
> read io-depth 2048 - 306K IOPS (32%)
> rw io-depth 32 -     99K  IOPS (60%)
> rw io-depth 2048 -   150K IOPS (44%)
> 
> Robin's recent series with fixes as seen on list (now merged)
> SMMU enabled.
> read io-depth 32 -   208K IOPS (54%)
> read io-depth 2048 - 335K IOPS (35%)
> rw io-depth 32 -     105K IOPS (63%)
> rw io-depth 2048 -   165K IOPS (49%)
> 
> 4.12 + Robin's series + just this patch SMMU enabled
> 
> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> 
> read io-depth 32 -   225K IOPS (59%)
> read io-depth 2048 - 365K IOPS (38%)
> rw io-depth 32 -     110K IOPS (66%)
> rw io-depth 2048 -   179K IOPS (53%)
> 
> 4.12 + Robin's series + Second part of this series
> 
> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> (iommu: add a new member unmap_tlb_sync into struct iommu_ops)
> (iommu/arm-smmu-v3: add supprot for unmap an iova range with only on tlb sync)
> (iommu/arm-smmu: add support for unmap of a memory range with only one tlb sync)
> 
> read io-depth 32 -    225K IOPS (59%)
> read io-depth 2048 -  833K IOPS (88%)
> rw io-depth 32 -      112K IOPS (67%)
> rw io-depth 2048 -    220K IOPS (65%)
> 
> Robin's series gave us small gains across the board (3-5% recovered)
> relative to the no smmu performance (which we are taking as the ideal case)
> 
> This first patch gets us back another 2-5% of the no smmu performance
> 
> The next few patches get us very little advantage on the small io-depths
> but make a large difference to the larger io-depths - in particular the
> read IOPS which is over twice as fast as without the series.
> 
> For HNS it seems that we are less dependent on the SMMU performance and
> can reach the non SMMU speed.
> 
> Tests with
> iperf -t 30 -i 10 -c IPADDRESS -P 3 last 10 seconds taken to avoid any
> initial variability.
> 
> The server end of the link was always running with smmu v3 disabled
> so as to act as a fast sink of the data. Some variation seen across
> repeat runs.
> 
> Mainline v4.12 + network card fix
> NO SMMU
> 9.42 GBits/sec
> 
> SMMU
> 4.36 GBits/sec (46%)
> 
> Robin's io-pgtable spinlock series
> 
> 6.68 to 7.34 (71% - 78% variation across runs)
> 
> Just this patch SMMU enabled
> 
> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> 
> 7.96-8.8 GBits/sec (85% - 94%  some variation across runs)
> 
> Full series
> 
> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> (iommu: add a new member unmap_tlb_sync into struct iommu_ops)
> (iommu/arm-smmu-v3: add supprot for unmap an iova range with only on tlb sync)
> (iommu/arm-smmu: add support for unmap of a memory range with only one tlb sync)
> 
> 9.42 GBits/Sec (100%)
> 
> So HNS test shows a greater boost from Robin's series and this first patch.
> This is most likely because the HNS test is not putting as high a load on
> the SMMU and associated code as the SAS test.
> 
> In both cases however, this shows that both parts of this patch
> series are beneficial.
> 
> So on to the questions ;)
> 
> Will, you mentioned that along with Robin and Nate you were working on
> a somewhat related strategy to improve the performance.  Any ETA on that?
The strategy I was working on is basically equivalent to the second
part of the series. I will test your patches out sometime this week, and
I'll also try to have our performance team run it through their whole
suite.
> 
> As you might imagine, with the above numbers we are very keen to try and
> move forward with this as quickly as possible.
> 
> If you want additional testing we would be happy to help.
> 
> Thanks,
> 
> Jonathan
> 
> 
> 
>>>>
>>>> Will
>>>>
>>>> .
>>>>   
>>>   
>>
>>
> 
> 
-- 
Qualcomm Datacenter Technologies as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-07-17 17:28           ` Nate Watterson
@ 2017-07-18  9:20             ` Jonathan Cameron
  2017-07-20 19:07               ` Nate Watterson
  0 siblings, 1 reply; 19+ messages in thread
From: Jonathan Cameron @ 2017-07-18  9:20 UTC (permalink / raw)
  To: linux-arm-kernel
On Mon, 17 Jul 2017 13:28:47 -0400
Nate Watterson <nwatters@codeaurora.org> wrote:
> Hi Jonathan,
> 
> On 7/17/2017 10:23 AM, Jonathan Cameron wrote:
> > On Mon, 17 Jul 2017 14:06:42 +0100
> > John Garry <john.garry@huawei.com> wrote:
> >   
> >> +
> >>
> >> On 29/06/2017 03:08, Leizhen (ThunderTown) wrote:  
> >>>
> >>>
> >>> On 2017/6/28 17:32, Will Deacon wrote:  
> >>>> Hi Zhen Lei,
> >>>>
> >>>> Nate (CC'd), Robin and I have been working on something very similar to
> >>>> this series, but this patch is different to what we had planned. More below.
> >>>>
> >>>> On Mon, Jun 26, 2017 at 09:38:46PM +0800, Zhen Lei wrote:  
> >>>>> Because all TLBI commands should be followed by a SYNC command, to make
> >>>>> sure that it has been completely finished. So we can just add the TLBI
> >>>>> commands into the queue, and put off the execution until meet SYNC or
> >>>>> other commands. To prevent the followed SYNC command waiting for a long
> >>>>> time because of too many commands have been delayed, restrict the max
> >>>>> delayed number.
> >>>>>
> >>>>> According to my test, I got the same performance data as I replaced writel
> >>>>> with writel_relaxed in queue_inc_prod.
> >>>>>
> >>>>> Signed-off-by: Zhen Lei <thunder.leizhen@huawei.com>
> >>>>> ---
> >>>>>   drivers/iommu/arm-smmu-v3.c | 42 +++++++++++++++++++++++++++++++++++++-----
> >>>>>   1 file changed, 37 insertions(+), 5 deletions(-)
> >>>>>
> >>>>> diff --git a/drivers/iommu/arm-smmu-v3.c b/drivers/iommu/arm-smmu-v3.c
> >>>>> index 291da5f..4481123 100644
> >>>>> --- a/drivers/iommu/arm-smmu-v3.c
> >>>>> +++ b/drivers/iommu/arm-smmu-v3.c
> >>>>> @@ -337,6 +337,7 @@
> >>>>>   /* Command queue */
> >>>>>   #define CMDQ_ENT_DWORDS			2
> >>>>>   #define CMDQ_MAX_SZ_SHIFT		8
> >>>>> +#define CMDQ_MAX_DELAYED		32
> >>>>>
> >>>>>   #define CMDQ_ERR_SHIFT			24
> >>>>>   #define CMDQ_ERR_MASK			0x7f
> >>>>> @@ -472,6 +473,7 @@ struct arm_smmu_cmdq_ent {
> >>>>>   			};
> >>>>>   		} cfgi;
> >>>>>
> >>>>> +		#define CMDQ_OP_TLBI_NH_ALL	0x10
> >>>>>   		#define CMDQ_OP_TLBI_NH_ASID	0x11
> >>>>>   		#define CMDQ_OP_TLBI_NH_VA	0x12
> >>>>>   		#define CMDQ_OP_TLBI_EL2_ALL	0x20
> >>>>> @@ -499,6 +501,7 @@ struct arm_smmu_cmdq_ent {
> >>>>>
> >>>>>   struct arm_smmu_queue {
> >>>>>   	int				irq; /* Wired interrupt */
> >>>>> +	u32				nr_delay;
> >>>>>
> >>>>>   	__le64				*base;
> >>>>>   	dma_addr_t			base_dma;
> >>>>> @@ -722,11 +725,16 @@ static int queue_sync_prod(struct arm_smmu_queue *q)
> >>>>>   	return ret;
> >>>>>   }
> >>>>>
> >>>>> -static void queue_inc_prod(struct arm_smmu_queue *q)
> >>>>> +static void queue_inc_swprod(struct arm_smmu_queue *q)
> >>>>>   {
> >>>>> -	u32 prod = (Q_WRP(q, q->prod) | Q_IDX(q, q->prod)) + 1;
> >>>>> +	u32 prod = q->prod + 1;
> >>>>>
> >>>>>   	q->prod = Q_OVF(q, q->prod) | Q_WRP(q, prod) | Q_IDX(q, prod);
> >>>>> +}
> >>>>> +
> >>>>> +static void queue_inc_prod(struct arm_smmu_queue *q)
> >>>>> +{
> >>>>> +	queue_inc_swprod(q);
> >>>>>   	writel(q->prod, q->prod_reg);
> >>>>>   }
> >>>>>
> >>>>> @@ -761,13 +769,24 @@ static void queue_write(__le64 *dst, u64 *src, size_t n_dwords)
> >>>>>   		*dst++ = cpu_to_le64(*src++);
> >>>>>   }
> >>>>>
> >>>>> -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
> >>>>> +static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent, int optimize)
> >>>>>   {
> >>>>>   	if (queue_full(q))
> >>>>>   		return -ENOSPC;
> >>>>>
> >>>>>   	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
> >>>>> -	queue_inc_prod(q);
> >>>>> +
> >>>>> +	/*
> >>>>> +	 * We don't want too many commands to be delayed, this may lead the
> >>>>> +	 * followed sync command to wait for a long time.
> >>>>> +	 */
> >>>>> +	if (optimize && (++q->nr_delay < CMDQ_MAX_DELAYED)) {
> >>>>> +		queue_inc_swprod(q);
> >>>>> +	} else {
> >>>>> +		queue_inc_prod(q);
> >>>>> +		q->nr_delay = 0;
> >>>>> +	}
> >>>>> +  
> >>>>
> >>>> So here, you're effectively putting invalidation commands into the command
> >>>> queue without updating PROD. Do you actually see a performance advantage
> >>>> from doing so? Another side of the argument would be that we should be  
> >>> Yes, my sas ssd performance test showed that it can improve about 100-150K/s(the same to I directly replace
> >>> writel with writel_relaxed). And the average execution time of iommu_unmap(which called by iommu_dma_unmap_sg)
> >>> dropped from 10us to 5us.
> >>>     
> >>>> moving PROD as soon as we can, so that the SMMU can process invalidation
> >>>> commands in the background and reduce the cost of the final SYNC operation
> >>>> when the high-level unmap operation is complete.  
> >>> There maybe that __iowmb() is more expensive than wait for tlbi complete. Except the time of __iowmb()
> >>> itself, it also protected by spinlock, lock confliction will rise rapidly in the stress scene. __iowmb()
> >>> average cost 300-500ns(Sorry, I forget the exact value).
> >>>
> >>> In addition, after applied this patcheset and Robin's v2, and my earlier dma64 iova optimization patchset.
> >>> Our net performance test got the same data to global bypass. But sas ssd still have more than 20% dropped.
> >>> Maybe we should still focus at map/unamp, because the average execution time of iova alloc/free is only
> >>> about 400ns.
> >>>
> >>> By the way, patch2-5 is more effective than this one, it can improve more than 350K/s. And with it, we can
> >>> got about 100-150K/s improvement of Robin's v2. Otherwise, I saw non effective of Robin's v2. Sorry, I have
> >>> not tested how about this patch without patch2-5. Further more, I got the same performance data to global
> >>> bypass for the traditional mechanical hard disk with only patch2-5(without this patch and Robin's).
> >>>     
> > Hi All,
> > 
> > I'm a bit of late entry to this discussion.  Just been running some more
> > detailed tests on our d05 boards and wanted to bring some more numbers to
> > the discussion.
> > 
> > All tests against 4.12 with the following additions:
> > * Robin's series removing the io-pgtable spinlock (and a few recent fixes)
> > * Cherry picked updates to the sas driver, merged prior to 4.13-rc1
> > * An additional HNS (network card) bug fix that will be upstreamed shortly.
> > 
> > I've broken the results down into this patch and this patch + the remainder
> > of the set. As leizhen mentioned we got a nice little performance
> > bump from Robin's series so that was applied first (as it's in mainline now)
> > 
> > SAS tests were fio with noop scheduler, 4k block size and various io depths
> > 1 process per disk.  Note this is probably a different setup to leizhen's
> > original numbers.
> > 
> > Precentages are off the performance seen with the smmu disabled.
> > SAS
> > 4.12 - none of this series.
> > SMMU disabled
> > read io-depth 32 -   384K IOPS (100%)
> > read io-depth 2048 - 950K IOPS (100%)
> > rw io-depth 32 -     166K IOPS (100%)
> > rw io-depth 2048 -   340K IOPS (100%)
> > 
> > SMMU enabled
> > read io-depth 32 -   201K IOPS (52%)
> > read io-depth 2048 - 306K IOPS (32%)
> > rw io-depth 32 -     99K  IOPS (60%)
> > rw io-depth 2048 -   150K IOPS (44%)
> > 
> > Robin's recent series with fixes as seen on list (now merged)
> > SMMU enabled.
> > read io-depth 32 -   208K IOPS (54%)
> > read io-depth 2048 - 335K IOPS (35%)
> > rw io-depth 32 -     105K IOPS (63%)
> > rw io-depth 2048 -   165K IOPS (49%)
> > 
> > 4.12 + Robin's series + just this patch SMMU enabled
> > 
> > (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> > 
> > read io-depth 32 -   225K IOPS (59%)
> > read io-depth 2048 - 365K IOPS (38%)
> > rw io-depth 32 -     110K IOPS (66%)
> > rw io-depth 2048 -   179K IOPS (53%)
> > 
> > 4.12 + Robin's series + Second part of this series
> > 
> > (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> > (iommu: add a new member unmap_tlb_sync into struct iommu_ops)
> > (iommu/arm-smmu-v3: add supprot for unmap an iova range with only on tlb sync)
> > (iommu/arm-smmu: add support for unmap of a memory range with only one tlb sync)
> > 
> > read io-depth 32 -    225K IOPS (59%)
> > read io-depth 2048 -  833K IOPS (88%)
> > rw io-depth 32 -      112K IOPS (67%)
> > rw io-depth 2048 -    220K IOPS (65%)
> > 
> > Robin's series gave us small gains across the board (3-5% recovered)
> > relative to the no smmu performance (which we are taking as the ideal case)
> > 
> > This first patch gets us back another 2-5% of the no smmu performance
> > 
> > The next few patches get us very little advantage on the small io-depths
> > but make a large difference to the larger io-depths - in particular the
> > read IOPS which is over twice as fast as without the series.
> > 
> > For HNS it seems that we are less dependent on the SMMU performance and
> > can reach the non SMMU speed.
> > 
> > Tests with
> > iperf -t 30 -i 10 -c IPADDRESS -P 3 last 10 seconds taken to avoid any
> > initial variability.
> > 
> > The server end of the link was always running with smmu v3 disabled
> > so as to act as a fast sink of the data. Some variation seen across
> > repeat runs.
> > 
> > Mainline v4.12 + network card fix
> > NO SMMU
> > 9.42 GBits/sec
> > 
> > SMMU
> > 4.36 GBits/sec (46%)
> > 
> > Robin's io-pgtable spinlock series
> > 
> > 6.68 to 7.34 (71% - 78% variation across runs)
> > 
> > Just this patch SMMU enabled
> > 
> > (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> > 
> > 7.96-8.8 GBits/sec (85% - 94%  some variation across runs)
> > 
> > Full series
> > 
> > (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> > (iommu: add a new member unmap_tlb_sync into struct iommu_ops)
> > (iommu/arm-smmu-v3: add supprot for unmap an iova range with only on tlb sync)
> > (iommu/arm-smmu: add support for unmap of a memory range with only one tlb sync)
> > 
> > 9.42 GBits/Sec (100%)
> > 
> > So HNS test shows a greater boost from Robin's series and this first patch.
> > This is most likely because the HNS test is not putting as high a load on
> > the SMMU and associated code as the SAS test.
> > 
> > In both cases however, this shows that both parts of this patch
> > series are beneficial.
> > 
> > So on to the questions ;)
> > 
> > Will, you mentioned that along with Robin and Nate you were working on
> > a somewhat related strategy to improve the performance.  Any ETA on that?  
> 
> The strategy I was working on is basically equivalent to the second
> part of the series. I will test your patches out sometime this week, and
> I'll also try to have our performance team run it through their whole
> suite.
Thanks, that's excellent.  Look forward to hearing how it goes.
Particularly useful would be to know if there are particular performance tests
that show up anything interesting that we might want to replicate.
Jonathan and Leizhen
> 
> > 
> > As you might imagine, with the above numbers we are very keen to try and
> > move forward with this as quickly as possible.
> > 
> > If you want additional testing we would be happy to help.
> > 
> > Thanks,
> > 
> > Jonathan
> > 
> > 
> >   
> >>>>
> >>>> Will
> >>>>
> >>>> .
> >>>>     
> >>>     
> >>
> >>  
> > 
> >   
> 
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-07-18  9:20             ` Jonathan Cameron
@ 2017-07-20 19:07               ` Nate Watterson
  2017-07-21 10:57                 ` Jonathan Cameron
  0 siblings, 1 reply; 19+ messages in thread
From: Nate Watterson @ 2017-07-20 19:07 UTC (permalink / raw)
  To: linux-arm-kernel
Hi Jonathan,
[...]
>>>>>      
>>> Hi All,
>>>
>>> I'm a bit of late entry to this discussion.  Just been running some more
>>> detailed tests on our d05 boards and wanted to bring some more numbers to
>>> the discussion.
>>>
>>> All tests against 4.12 with the following additions:
>>> * Robin's series removing the io-pgtable spinlock (and a few recent fixes)
>>> * Cherry picked updates to the sas driver, merged prior to 4.13-rc1
>>> * An additional HNS (network card) bug fix that will be upstreamed shortly.
>>>
>>> I've broken the results down into this patch and this patch + the remainder
>>> of the set. As leizhen mentioned we got a nice little performance
>>> bump from Robin's series so that was applied first (as it's in mainline now)
>>>
>>> SAS tests were fio with noop scheduler, 4k block size and various io depths
>>> 1 process per disk.  Note this is probably a different setup to leizhen's
>>> original numbers.
>>>
>>> Precentages are off the performance seen with the smmu disabled.
>>> SAS
>>> 4.12 - none of this series.
>>> SMMU disabled
>>> read io-depth 32 -   384K IOPS (100%)
>>> read io-depth 2048 - 950K IOPS (100%)
>>> rw io-depth 32 -     166K IOPS (100%)
>>> rw io-depth 2048 -   340K IOPS (100%)
>>>
>>> SMMU enabled
>>> read io-depth 32 -   201K IOPS (52%)
>>> read io-depth 2048 - 306K IOPS (32%)
>>> rw io-depth 32 -     99K  IOPS (60%)
>>> rw io-depth 2048 -   150K IOPS (44%)
>>>
>>> Robin's recent series with fixes as seen on list (now merged)
>>> SMMU enabled.
>>> read io-depth 32 -   208K IOPS (54%)
>>> read io-depth 2048 - 335K IOPS (35%)
>>> rw io-depth 32 -     105K IOPS (63%)
>>> rw io-depth 2048 -   165K IOPS (49%)
>>>
>>> 4.12 + Robin's series + just this patch SMMU enabled
>>>
>>> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
>>>
>>> read io-depth 32 -   225K IOPS (59%)
>>> read io-depth 2048 - 365K IOPS (38%)
>>> rw io-depth 32 -     110K IOPS (66%)
>>> rw io-depth 2048 -   179K IOPS (53%)
>>>
>>> 4.12 + Robin's series + Second part of this series
>>>
>>> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
>>> (iommu: add a new member unmap_tlb_sync into struct iommu_ops)
>>> (iommu/arm-smmu-v3: add supprot for unmap an iova range with only on tlb sync)
>>> (iommu/arm-smmu: add support for unmap of a memory range with only one tlb sync)
>>>
>>> read io-depth 32 -    225K IOPS (59%)
>>> read io-depth 2048 -  833K IOPS (88%)
>>> rw io-depth 32 -      112K IOPS (67%)
>>> rw io-depth 2048 -    220K IOPS (65%)
>>>
>>> Robin's series gave us small gains across the board (3-5% recovered)
>>> relative to the no smmu performance (which we are taking as the ideal case)
>>>
>>> This first patch gets us back another 2-5% of the no smmu performance
>>>
>>> The next few patches get us very little advantage on the small io-depths
>>> but make a large difference to the larger io-depths - in particular the
>>> read IOPS which is over twice as fast as without the series.
>>>
>>> For HNS it seems that we are less dependent on the SMMU performance and
>>> can reach the non SMMU speed.
>>>
>>> Tests with
>>> iperf -t 30 -i 10 -c IPADDRESS -P 3 last 10 seconds taken to avoid any
>>> initial variability.
>>>
>>> The server end of the link was always running with smmu v3 disabled
>>> so as to act as a fast sink of the data. Some variation seen across
>>> repeat runs.
>>>
>>> Mainline v4.12 + network card fix
>>> NO SMMU
>>> 9.42 GBits/sec
>>>
>>> SMMU
>>> 4.36 GBits/sec (46%)
>>>
>>> Robin's io-pgtable spinlock series
>>>
>>> 6.68 to 7.34 (71% - 78% variation across runs)
>>>
>>> Just this patch SMMU enabled
>>>
>>> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
>>>
>>> 7.96-8.8 GBits/sec (85% - 94%  some variation across runs)
>>>
>>> Full series
>>>
>>> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
>>> (iommu: add a new member unmap_tlb_sync into struct iommu_ops)
>>> (iommu/arm-smmu-v3: add supprot for unmap an iova range with only on tlb sync)
>>> (iommu/arm-smmu: add support for unmap of a memory range with only one tlb sync)
>>>
>>> 9.42 GBits/Sec (100%)
>>>
>>> So HNS test shows a greater boost from Robin's series and this first patch.
>>> This is most likely because the HNS test is not putting as high a load on
>>> the SMMU and associated code as the SAS test.
>>>
>>> In both cases however, this shows that both parts of this patch
>>> series are beneficial.
>>>
>>> So on to the questions ;)
>>>
>>> Will, you mentioned that along with Robin and Nate you were working on
>>> a somewhat related strategy to improve the performance.  Any ETA on that?
>>
>> The strategy I was working on is basically equivalent to the second
>> part of the series. I will test your patches out sometime this week, and
>> I'll also try to have our performance team run it through their whole
>> suite.
> 
> Thanks, that's excellent.  Look forward to hearing how it goes.
I tested the patches with 4 NVME drives connected to a single SMMU and
the results seem to be in line with those you've reported.
FIO - 512k blocksize / io-depth 32 / 1 thread per drive
  Baseline 4.13-rc1 w/SMMU enabled: 25% of SMMU bypass performance
  Baseline + Patch 1              : 28%
  Baseline + Patches 2-5          : 86%
  Baseline + Complete series      : 100% [!!]
I saw performance improvements across all of the other FIO profiles I
tested, although not always as substantial as was seen in the 512k/32/1
case. The performance of some of the profiles, especially those with
many threads per drive, remains woeful (often below 20%), but hopefully
Robin's iova series will help improve that.
> 
> Particularly useful would be to know if there are particular performance tests
> that show up anything interesting that we might want to replicate.
> 
> Jonathan and Leizhen
>>
>>>
>>> As you might imagine, with the above numbers we are very keen to try and
>>> move forward with this as quickly as possible.
>>>
>>> If you want additional testing we would be happy to help.
>>>
>>> Thanks,
>>>
>>> Jonathan
[...]
-Nate
-- 
Qualcomm Datacenter Technologies as an affiliate of Qualcomm Technologies, Inc.
Qualcomm Technologies, Inc. is a member of the Code Aurora Forum, a Linux Foundation Collaborative Project.
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-07-20 19:07               ` Nate Watterson
@ 2017-07-21 10:57                 ` Jonathan Cameron
  0 siblings, 0 replies; 19+ messages in thread
From: Jonathan Cameron @ 2017-07-21 10:57 UTC (permalink / raw)
  To: linux-arm-kernel
On Thu, 20 Jul 2017 15:07:05 -0400
Nate Watterson <nwatters@codeaurora.org> wrote:
> Hi Jonathan,
> 
> [...]
> >>>>>        
> >>> Hi All,
> >>>
> >>> I'm a bit of late entry to this discussion.  Just been running some more
> >>> detailed tests on our d05 boards and wanted to bring some more numbers to
> >>> the discussion.
> >>>
> >>> All tests against 4.12 with the following additions:
> >>> * Robin's series removing the io-pgtable spinlock (and a few recent fixes)
> >>> * Cherry picked updates to the sas driver, merged prior to 4.13-rc1
> >>> * An additional HNS (network card) bug fix that will be upstreamed shortly.
> >>>
> >>> I've broken the results down into this patch and this patch + the remainder
> >>> of the set. As leizhen mentioned we got a nice little performance
> >>> bump from Robin's series so that was applied first (as it's in mainline now)
> >>>
> >>> SAS tests were fio with noop scheduler, 4k block size and various io depths
> >>> 1 process per disk.  Note this is probably a different setup to leizhen's
> >>> original numbers.
> >>>
> >>> Precentages are off the performance seen with the smmu disabled.
> >>> SAS
> >>> 4.12 - none of this series.
> >>> SMMU disabled
> >>> read io-depth 32 -   384K IOPS (100%)
> >>> read io-depth 2048 - 950K IOPS (100%)
> >>> rw io-depth 32 -     166K IOPS (100%)
> >>> rw io-depth 2048 -   340K IOPS (100%)
> >>>
> >>> SMMU enabled
> >>> read io-depth 32 -   201K IOPS (52%)
> >>> read io-depth 2048 - 306K IOPS (32%)
> >>> rw io-depth 32 -     99K  IOPS (60%)
> >>> rw io-depth 2048 -   150K IOPS (44%)
> >>>
> >>> Robin's recent series with fixes as seen on list (now merged)
> >>> SMMU enabled.
> >>> read io-depth 32 -   208K IOPS (54%)
> >>> read io-depth 2048 - 335K IOPS (35%)
> >>> rw io-depth 32 -     105K IOPS (63%)
> >>> rw io-depth 2048 -   165K IOPS (49%)
> >>>
> >>> 4.12 + Robin's series + just this patch SMMU enabled
> >>>
> >>> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> >>>
> >>> read io-depth 32 -   225K IOPS (59%)
> >>> read io-depth 2048 - 365K IOPS (38%)
> >>> rw io-depth 32 -     110K IOPS (66%)
> >>> rw io-depth 2048 -   179K IOPS (53%)
> >>>
> >>> 4.12 + Robin's series + Second part of this series
> >>>
> >>> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> >>> (iommu: add a new member unmap_tlb_sync into struct iommu_ops)
> >>> (iommu/arm-smmu-v3: add supprot for unmap an iova range with only on tlb sync)
> >>> (iommu/arm-smmu: add support for unmap of a memory range with only one tlb sync)
> >>>
> >>> read io-depth 32 -    225K IOPS (59%)
> >>> read io-depth 2048 -  833K IOPS (88%)
> >>> rw io-depth 32 -      112K IOPS (67%)
> >>> rw io-depth 2048 -    220K IOPS (65%)
> >>>
> >>> Robin's series gave us small gains across the board (3-5% recovered)
> >>> relative to the no smmu performance (which we are taking as the ideal case)
> >>>
> >>> This first patch gets us back another 2-5% of the no smmu performance
> >>>
> >>> The next few patches get us very little advantage on the small io-depths
> >>> but make a large difference to the larger io-depths - in particular the
> >>> read IOPS which is over twice as fast as without the series.
> >>>
> >>> For HNS it seems that we are less dependent on the SMMU performance and
> >>> can reach the non SMMU speed.
> >>>
> >>> Tests with
> >>> iperf -t 30 -i 10 -c IPADDRESS -P 3 last 10 seconds taken to avoid any
> >>> initial variability.
> >>>
> >>> The server end of the link was always running with smmu v3 disabled
> >>> so as to act as a fast sink of the data. Some variation seen across
> >>> repeat runs.
> >>>
> >>> Mainline v4.12 + network card fix
> >>> NO SMMU
> >>> 9.42 GBits/sec
> >>>
> >>> SMMU
> >>> 4.36 GBits/sec (46%)
> >>>
> >>> Robin's io-pgtable spinlock series
> >>>
> >>> 6.68 to 7.34 (71% - 78% variation across runs)
> >>>
> >>> Just this patch SMMU enabled
> >>>
> >>> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> >>>
> >>> 7.96-8.8 GBits/sec (85% - 94%  some variation across runs)
> >>>
> >>> Full series
> >>>
> >>> (iommu/arm-smmu-v3: put of the execution of TLBI* to reduce lock conflict)
> >>> (iommu: add a new member unmap_tlb_sync into struct iommu_ops)
> >>> (iommu/arm-smmu-v3: add supprot for unmap an iova range with only on tlb sync)
> >>> (iommu/arm-smmu: add support for unmap of a memory range with only one tlb sync)
> >>>
> >>> 9.42 GBits/Sec (100%)
> >>>
> >>> So HNS test shows a greater boost from Robin's series and this first patch.
> >>> This is most likely because the HNS test is not putting as high a load on
> >>> the SMMU and associated code as the SAS test.
> >>>
> >>> In both cases however, this shows that both parts of this patch
> >>> series are beneficial.
> >>>
> >>> So on to the questions ;)
> >>>
> >>> Will, you mentioned that along with Robin and Nate you were working on
> >>> a somewhat related strategy to improve the performance.  Any ETA on that?  
> >>
> >> The strategy I was working on is basically equivalent to the second
> >> part of the series. I will test your patches out sometime this week, and
> >> I'll also try to have our performance team run it through their whole
> >> suite.  
> > 
> > Thanks, that's excellent.  Look forward to hearing how it goes.  
> 
> I tested the patches with 4 NVME drives connected to a single SMMU and
> the results seem to be inline with those you've reported.
> 
> FIO - 512k blocksize / io-depth 32 / 1 thread per drive
>   Baseline 4.13-rc1 w/SMMU enabled: 25% of SMMU bypass performance
>   Baseline + Patch 1              : 28%
>   Baseline + Patches 2-5          : 86%
>   Baseline + Complete series      : 100% [!!]
> 
> I saw performance improvements across all of the other FIO profiles I
> tested, although not always as substantial as was seen in the 512k/32/1
> case. The performance of some of the profiles, especially those with
> many threads per drive, remains woeful (often below 20%), but hopefully
> Robin's iova series will help improve that.
Excellent.  Thanks for the info and running the tests.
Even with both series we are still seeing some reduction relative to the
no-smmu performance, but to a much lesser extent.
Jonathan
> 
> > 
> > Particularly useful would be to know if there are particular performance tests
> > that show up anything interesting that we might want to replicate.
> > 
> > Jonathan and Leizhen  
> >>  
> >>>
> >>> As you might imagine, with the above numbers we are very keen to try and
> >>> move forward with this as quickly as possible.
> >>>
> >>> If you want additional testing we would be happy to help.
> >>>
> >>> Thanks,
> >>>
> >>> Jonathan  
> [...]
> 
> -Nate
> 
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 0/5] arm-smmu: performance optimization
  2017-06-26 13:38 [PATCH 0/5] arm-smmu: performance optimization Zhen Lei
                   ` (4 preceding siblings ...)
  2017-06-26 13:38 ` [PATCH 5/5] iommu/io-pgtable: delete member tlb_sync_pending of struct io_pgtable Zhen Lei
@ 2017-08-17 14:36 ` Will Deacon
  2017-08-18  3:19   ` Leizhen (ThunderTown)
  5 siblings, 1 reply; 19+ messages in thread
From: Will Deacon @ 2017-08-17 14:36 UTC (permalink / raw)
  To: linux-arm-kernel
Thunder, Nate, Robin,
On Mon, Jun 26, 2017 at 09:38:45PM +0800, Zhen Lei wrote:
> I described the optimization more detail in patch 1 and 2, and patch 3-5 are
> the implementation on arm-smmu/arm-smmu-v3 of patch 2.
> 
> Patch 1 is v2. In v1, I directly replaced writel with writel_relaxed in
> queue_inc_prod. But Robin figured that it may lead SMMU consume stale
> memory contents. I thought more than 3 whole days and got this one.
> 
> This patchset is based on Robin Murphy's [PATCH v2 0/8] io-pgtable lock removal.
For the time being, I think we should focus on the new TLB flushing
interface posted by Joerg:
http://lkml.kernel.org/r/1502974596-23835-1-git-send-email-joro at 8bytes.org
which looks like it can give us most of the benefits of this series. Once
we've got that, we can see what's left in the way of performance and focus
on the cmdq batching separately (because I'm still not convinced about it).
Thanks,
Will
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 0/5] arm-smmu: performance optimization
  2017-08-17 14:36 ` [PATCH 0/5] arm-smmu: performance optimization Will Deacon
@ 2017-08-18  3:19   ` Leizhen (ThunderTown)
  2017-08-18  8:39     ` Will Deacon
  0 siblings, 1 reply; 19+ messages in thread
From: Leizhen (ThunderTown) @ 2017-08-18  3:19 UTC (permalink / raw)
  To: linux-arm-kernel
On 2017/8/17 22:36, Will Deacon wrote:
> Thunder, Nate, Robin,
> 
> On Mon, Jun 26, 2017 at 09:38:45PM +0800, Zhen Lei wrote:
>> I described the optimization more detail in patch 1 and 2, and patch 3-5 are
>> the implementation on arm-smmu/arm-smmu-v3 of patch 2.
>>
>> Patch 1 is v2. In v1, I directly replaced writel with writel_relaxed in
>> queue_inc_prod. But Robin figured that it may lead SMMU consume stale
>> memory contents. I thought more than 3 whole days and got this one.
>>
>> This patchset is based on Robin Murphy's [PATCH v2 0/8] io-pgtable lock removal.
> 
> For the time being, I think we should focus on the new TLB flushing
> interface posted by Joerg:
> 
> http://lkml.kernel.org/r/1502974596-23835-1-git-send-email-joro at 8bytes.org
> 
> which looks like it can give us most of the benefits of this series. Once
> we've got that, we can see what's left in the way of performance and focus
> on the cmdq batching separately (because I'm still not convinced about it).
OK, this is good news.
But I have a review comment (sorry, I have not subscribed to it yet, so I cannot reply to it directly):
I don't think we should add a tlb sync for the map operation:
1. At init time, all tlbs will be invalidated.
2. When we try to map a new range, there are no related ptes buffered in the tlb, because of 1 above and 3 below.
3. When we unmap the above range, we make sure all related ptes buffered in the tlb are invalidated before the unmap finishes.
> 
> Thanks,
> 
> Will
> 
> .
> 
-- 
Thanks!
BestRegards
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 0/5] arm-smmu: performance optimization
  2017-08-18  3:19   ` Leizhen (ThunderTown)
@ 2017-08-18  8:39     ` Will Deacon
  0 siblings, 0 replies; 19+ messages in thread
From: Will Deacon @ 2017-08-18  8:39 UTC (permalink / raw)
  To: linux-arm-kernel
On Fri, Aug 18, 2017 at 11:19:00AM +0800, Leizhen (ThunderTown) wrote:
> 
> 
> On 2017/8/17 22:36, Will Deacon wrote:
> > Thunder, Nate, Robin,
> > 
> > On Mon, Jun 26, 2017 at 09:38:45PM +0800, Zhen Lei wrote:
> >> I described the optimization more detail in patch 1 and 2, and patch 3-5 are
> >> the implementation on arm-smmu/arm-smmu-v3 of patch 2.
> >>
> >> Patch 1 is v2. In v1, I directly replaced writel with writel_relaxed in
> >> queue_inc_prod. But Robin figured that it may lead SMMU consume stale
> >> memory contents. I thought more than 3 whole days and got this one.
> >>
> >> This patchset is based on Robin Murphy's [PATCH v2 0/8] io-pgtable lock removal.
> > 
> > For the time being, I think we should focus on the new TLB flushing
> > interface posted by Joerg:
> > 
> > http://lkml.kernel.org/r/1502974596-23835-1-git-send-email-joro at 8bytes.org
> > 
> > which looks like it can give us most of the benefits of this series. Once
> > we've got that, we can see what's left in the way of performance and focus
> > on the cmdq batching separately (because I'm still not convinced about it).
> OK, this is a good news.
> 
> But I have a review comment(sorry, I have not subscribed it yet, so can not directly reply it):
> I don't think we should add tlb sync for map operation
> 1. at init time, all tlbs will be invalidated
> 2. when we try to map a new range, there are no related ptes bufferd in tlb, because of above 1 and below 3
> 3. when we unmap the above range, make sure all related ptes bufferd in tlb to be invalidated before unmap finished
Yup, you're completely correct and I raised that with Joerg, who is looking
into a way to avoid it.
Will
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-06-26 13:38 ` [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction Zhen Lei
  2017-06-28  9:32   ` Will Deacon
@ 2017-08-22 15:41   ` Joerg Roedel
  2017-08-23  1:21     ` Leizhen (ThunderTown)
  1 sibling, 1 reply; 19+ messages in thread
From: Joerg Roedel @ 2017-08-22 15:41 UTC (permalink / raw)
  To: linux-arm-kernel
On Mon, Jun 26, 2017 at 09:38:46PM +0800, Zhen Lei wrote:
> -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
> +static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent, int optimize)
>  {
>  	if (queue_full(q))
>  		return -ENOSPC;
>  
>  	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
> -	queue_inc_prod(q);
> +
> +	/*
> +	 * We don't want too many commands to be delayed, this may lead the
> +	 * followed sync command to wait for a long time.
> +	 */
> +	if (optimize && (++q->nr_delay < CMDQ_MAX_DELAYED)) {
> +		queue_inc_swprod(q);
> +	} else {
> +		queue_inc_prod(q);
> +		q->nr_delay = 0;
> +	}
> +
>  	return 0;
>  }
>  
> @@ -909,6 +928,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
>  static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>  				    struct arm_smmu_cmdq_ent *ent)
>  {
> +	int optimize = 0;
>  	u64 cmd[CMDQ_ENT_DWORDS];
>  	unsigned long flags;
>  	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
> @@ -920,8 +940,17 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>  		return;
>  	}
>  
> +	/*
> +	 * All TLBI commands should be followed by a sync command later.
> +	 * The CFGI commands is the same, but they are rarely executed.
> +	 * So just optimize TLBI commands now, to reduce the "if" judgement.
> +	 */
> +	if ((ent->opcode >= CMDQ_OP_TLBI_NH_ALL) &&
> +	    (ent->opcode <= CMDQ_OP_TLBI_NSNH_ALL))
> +		optimize = 1;
> +
>  	spin_lock_irqsave(&smmu->cmdq.lock, flags);
> -	while (queue_insert_raw(q, cmd) == -ENOSPC) {
> +	while (queue_insert_raw(q, cmd, optimize) == -ENOSPC) {
>  		if (queue_poll_cons(q, false, wfe))
>  			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
>  	}
This doesn't look correct. How do you make sure that a given IOVA range
is flushed before the addresses are reused?
Regards,
	Joerg
^ permalink raw reply	[flat|nested] 19+ messages in thread
* [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction
  2017-08-22 15:41   ` Joerg Roedel
@ 2017-08-23  1:21     ` Leizhen (ThunderTown)
  0 siblings, 0 replies; 19+ messages in thread
From: Leizhen (ThunderTown) @ 2017-08-23  1:21 UTC (permalink / raw)
  To: linux-arm-kernel
On 2017/8/22 23:41, Joerg Roedel wrote:
> On Mon, Jun 26, 2017 at 09:38:46PM +0800, Zhen Lei wrote:
>> -static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent)
>> +static int queue_insert_raw(struct arm_smmu_queue *q, u64 *ent, int optimize)
>>  {
>>  	if (queue_full(q))
>>  		return -ENOSPC;
>>  
>>  	queue_write(Q_ENT(q, q->prod), ent, q->ent_dwords);
>> -	queue_inc_prod(q);
>> +
>> +	/*
>> +	 * We don't want too many commands to be delayed, this may lead the
>> +	 * followed sync command to wait for a long time.
>> +	 */
>> +	if (optimize && (++q->nr_delay < CMDQ_MAX_DELAYED)) {
>> +		queue_inc_swprod(q);
>> +	} else {
>> +		queue_inc_prod(q);
>> +		q->nr_delay = 0;
>> +	}
>> +
>>  	return 0;
>>  }
>>  
>> @@ -909,6 +928,7 @@ static void arm_smmu_cmdq_skip_err(struct arm_smmu_device *smmu)
>>  static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>>  				    struct arm_smmu_cmdq_ent *ent)
>>  {
>> +	int optimize = 0;
>>  	u64 cmd[CMDQ_ENT_DWORDS];
>>  	unsigned long flags;
>>  	bool wfe = !!(smmu->features & ARM_SMMU_FEAT_SEV);
>> @@ -920,8 +940,17 @@ static void arm_smmu_cmdq_issue_cmd(struct arm_smmu_device *smmu,
>>  		return;
>>  	}
>>  
>> +	/*
>> +	 * All TLBI commands should be followed by a sync command later.
>> +	 * The CFGI commands is the same, but they are rarely executed.
>> +	 * So just optimize TLBI commands now, to reduce the "if" judgement.
>> +	 */
>> +	if ((ent->opcode >= CMDQ_OP_TLBI_NH_ALL) &&
>> +	    (ent->opcode <= CMDQ_OP_TLBI_NSNH_ALL))
>> +		optimize = 1;
>> +
>>  	spin_lock_irqsave(&smmu->cmdq.lock, flags);
>> -	while (queue_insert_raw(q, cmd) == -ENOSPC) {
>> +	while (queue_insert_raw(q, cmd, optimize) == -ENOSPC) {
>>  		if (queue_poll_cons(q, false, wfe))
>>  			dev_err_ratelimited(smmu->dev, "CMDQ timeout\n");
>>  	}
> 
> This doesn't look correct. How do you make sure that a given IOVA range
> is flushed before the addresses are reused?
Hi, Joerg:
	It's actually guaranteed by the upper layer functions, for example:
	static int arm_lpae_unmap(
        ...
    	unmapped = __arm_lpae_unmap(data, iova, size, lvl, ptep);	//__arm_lpae_unmap will indirectly call arm_smmu_cmdq_issue_cmd to invalidate tlbs
	if (unmapped)
		io_pgtable_tlb_sync(&data->iop);			//a tlb_sync wait all tlbi operations finished
	
	I also described it in the next patch (2/5), as shown below:
Some people might ask: Is it safe to do so? The answer is yes. The standard
processing flow is:
	alloc iova
	map
	process data
	unmap
	tlb invalidation and sync
	free iova
What must be guaranteed is that the "free iova" action happens after the "unmap"
and "tlbi operation" actions, which is what we are doing right now. This ensures that
all TLB entries for an iova range have been invalidated before the iova is reallocated.
Best regards,
	LeiZhen
> 
> 
> Regards,
> 
> 	Joerg
> 
> 
> .
> 
-- 
Thanks!
BestRegards
^ permalink raw reply	[flat|nested] 19+ messages in thread
end of thread, other threads:[~2017-08-23  1:21 UTC | newest]
Thread overview: 19+ messages (download: mbox.gz follow: Atom feed
-- links below jump to the message on this page --
2017-06-26 13:38 [PATCH 0/5] arm-smmu: performance optimization Zhen Lei
2017-06-26 13:38 ` [PATCH 1/5] iommu/arm-smmu-v3: put off the execution of TLBI* to reduce lock confliction Zhen Lei
2017-06-28  9:32   ` Will Deacon
2017-06-29  2:08     ` Leizhen (ThunderTown)
2017-07-17 13:06       ` John Garry
2017-07-17 14:23         ` Jonathan Cameron
2017-07-17 17:28           ` Nate Watterson
2017-07-18  9:20             ` Jonathan Cameron
2017-07-20 19:07               ` Nate Watterson
2017-07-21 10:57                 ` Jonathan Cameron
2017-08-22 15:41   ` Joerg Roedel
2017-08-23  1:21     ` Leizhen (ThunderTown)
2017-06-26 13:38 ` [PATCH 2/5] iommu: add a new member unmap_tlb_sync into struct iommu_ops Zhen Lei
2017-06-26 13:38 ` [PATCH 3/5] iommu/arm-smmu-v3: add support for unmap an iova range with only one tlb sync Zhen Lei
2017-06-26 13:38 ` [PATCH 4/5] iommu/arm-smmu: add support for unmap a memory " Zhen Lei
2017-06-26 13:38 ` [PATCH 5/5] iommu/io-pgtable: delete member tlb_sync_pending of struct io_pgtable Zhen Lei
2017-08-17 14:36 ` [PATCH 0/5] arm-smmu: performance optimization Will Deacon
2017-08-18  3:19   ` Leizhen (ThunderTown)
2017-08-18  8:39     ` Will Deacon
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).